In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.express as px

import random
from collections import Counter
from nltk import tokenize
from unicodedata import normalize


## Importing the contents of the collections


In [2]:
from os import listdir
from os.path import basename, splitext

GEC_folder = "data/EnciclopediaCat"

# os.listdir returns entries in arbitrary order; sorting makes the numeric
# collection ids assigned below identical on every run and every machine.
csv_collections = [
    f"{GEC_folder}/{file_name}" for file_name in sorted(listdir(GEC_folder))
]
print("Loaded", len(csv_collections), "collections")

# Collection name = file name without its directory and extension. Using
# os.path keeps this correct even if GEC_folder gains or loses a path level.
id_to_collection = {
    idx: splitext(basename(path))[0] for (idx, path) in enumerate(csv_collections)
}
# Inverse mapping, derived from the one above so both always agree.
collection_to_id = {name: idx for (idx, name) in id_to_collection.items()}


Loaded 33 collections


In [3]:
# Output naming convention: parsed[ collection[ article( title, text ) ] ]
# previous_lens keeps the raw row count of each CSV (before any filtering) so
# the next cell can compare it with the number of rows that survive.
parsed = []
previous_lens = []
for csv_path in csv_collections:
    raw_table = pd.read_csv(csv_path, sep=";", dtype="unicode")
    previous_lens.append(len(raw_table))

    # Keep only rows that have both a title and a body, then drop the rows
    # titled "Crèdits" (presumably credits pages rather than articles).
    usable = raw_table[["title", "Body"]].dropna()
    usable = usable.loc[usable["title"] != "Crèdits"]

    parsed.append(list(zip(usable["title"], usable["Body"])))


lens = list(map(len, parsed))
mean_articles = np.mean(lens)
std_articles = np.std(lens)
print("Number of articles on each  collection:", lens)
print()
print("Mean number of articles per collection: %.2f" % mean_articles)
print("Std desv of articles per collection: %.2f" % std_articles)


Number of articles on each  collection: [2549, 45, 277, 150, 26, 439, 133, 138, 8380, 1110, 46, 16, 12183, 678, 1987, 13524, 347, 109875, 498, 1889, 38, 138, 1851, 871, 3712, 18, 98, 9673, 145, 178, 122, 52, 0]

Mean number of articles per collection: 5187.45
Std desv of articles per collection: 18840.71


In [4]:
# Raw vs. filtered article counts per collection, as a grouped bar chart.
counts = pd.DataFrame(zip(previous_lens, lens))
df = counts.rename(index=id_to_collection, columns={0: "Raw", 1: "Filtered"})
# "nans" is every row removed in the parsing cell (raw rows minus kept rows).
df = df.assign(nans=df["Raw"] - df["Filtered"])

axis_labels = {"index": "Collection", "value": "Articles"}
fig = px.bar(df, barmode="group", log_y=True, labels=axis_labels)
fig.update_xaxes(tickangle=45)
fig.update_layout(
    legend_title="",
    legend={"yanchor": "top", "y": 0.99, "xanchor": "right", "x": 0.99},
)

# Horizontal red line at the mean number of filtered articles per collection.
avg = df["Filtered"].mean()
mean_line = {
    "type": "line",
    "x0": -0.5,
    "x1": len(lens),
    "y0": avg,
    "y1": avg,
    "line": {"color": "Red"},
    "xref": "x",
    "yref": "y",
}
fig.add_shape(**mean_line)
# fig.show()

plotly.offline.plot(fig, filename="graphs/GEC_EDA_collection_articles.html")


'graphs/GEC_EDA_collection_articles.html'

## Data analysis


In [5]:
"""Number of words per article"""
# Word count of every article body. Splitting on a single space means that
# consecutive spaces produce empty "words"; kept as-is so the statistics stay
# comparable with the sentence-level counts computed later.
texts_words = pd.Series(
    [len(text.split(" ")) for collection in parsed for (_, text) in collection]
)
print(texts_words.describe())

fig = px.histogram(texts_words, labels={"value": "Words per article"}, log_y=True)

# Mark the mean article length. The mean is a value on the x axis (words per
# article), so the marker has to be a vertical line at x=avg. yref="paper"
# makes it span the full plot height regardless of the logarithmic count axis.
avg = np.mean(texts_words)
fig.add_shape(
    type="line",
    x0=avg,
    x1=avg,
    y0=0,
    y1=1,
    line=dict(color="Red"),
    xref="x",
    yref="paper",
)
fig.update_layout(showlegend=False, font_size=15)
# fig.show()

plotly.offline.plot(fig, filename="graphs/GEC_EDA_words_per_article.html")


count    171186.000000
mean        341.500765
std        1388.989449
min           1.000000
25%          34.000000
50%          78.000000
75%         178.000000
max       61221.000000
dtype: float64


'graphs/GEC_EDA_words_per_article.html'

In [6]:
"""Texts must be normalized because some contain some badly formatted chars"""
"""Also, some replacements must be made in order to adapt it for the tokenizer"""


def preprocess_text(text):
    """Clean an article body before sentence tokenization.

    Applied in this order: Unicode NFKC normalization, removal of tab
    characters, a single pass that turns each pair of consecutive newlines
    into one newline, and replacement of the typographic apostrophe by the
    ASCII one when it directly follows one of the letters L, l, S, s, D, d,
    N, n or e. Apostrophes after any other character are left untouched.

    Args:
        text: the raw article text.

    Returns:
        The cleaned text as a new string.
    """
    substitutions = [("\t", ""), ("\n\n", "\n")]
    # One (typographic -> ASCII) pair per handled letter, in a fixed order.
    for letter in "LlSsDdNne":
        substitutions.append((letter + "’", letter + "'"))

    cleaned = normalize("NFKC", text)
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned


In [7]:
# 1 min
# One entry per article: (collection id, title, list of sentences). The
# collection id is the article's position in `parsed`.
tokenized = []
for collection_id, articles in enumerate(parsed):
    for title, text in articles:
        sentences = tokenize.sent_tokenize(preprocess_text(text))
        tokenized.append((collection_id, title, sentences))

print(len(tokenized))


171186


In [8]:
# Flatten the articles into single sentences and split them in two groups:
# a sentence that still contains a newline is rejected, any other is accepted.
accepted = []
rejected = []

for collection, title, sentences in tokenized:
    for sentence in sentences:
        bucket = rejected if "\n" in sentence else accepted
        bucket.append((collection, title, sentence))

n_accepted = len(accepted)
n_rejected = len(rejected)
print(n_accepted + n_rejected)
print(n_accepted)
print(n_rejected)


2286826
2125106
161720


In [9]:
# Number of accepted / rejected sentences per collection id.
accepted_f = Counter(collection for (collection, _, _) in accepted)
rejected_f = Counter(collection for (collection, _, _) in rejected)

# Build the table by collection id instead of zipping the Counters' values
# positionally: a collection with no accepted sentences, no rejected sentences
# or no articles is missing from one of the sequences, and a positional zip
# would then pair numbers that belong to different collections.
# A Counter returns 0 for a key it has never seen.
collection_ids = sorted(set(accepted_f) | set(rejected_f))
df = pd.DataFrame(
    {
        "Accepted": [accepted_f[c] for c in collection_ids],
        "Rejected": [rejected_f[c] for c in collection_ids],
        "Articles": [lens[c] for c in collection_ids],
    },
    index=collection_ids,
).rename(index=id_to_collection)

fig = px.bar(
    df,
    barmode="group",
    log_y=True,
    labels={"index": "Collection", "value": "Sentences"},
)
fig.update_xaxes(tickangle=45)
fig.update_layout(
    legend_title="", legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99)
)

# Horizontal red line at the mean number of accepted sentences per collection.
avg = df["Accepted"].mean()
fig.add_shape(
    type="line",
    x0=-0.5,
    x1=len(lens),
    y0=avg,
    y1=avg,
    line=dict(color="Red"),
    xref="x",
    yref="y",
)
# fig.show()

plotly.offline.plot(fig, filename="graphs/GEC_EDA_sentences_per_collection.html")


'graphs/GEC_EDA_sentences_per_collection.html'

## Length separation


In [10]:
# Extend every accepted sentence with its word count (split on single spaces):
# (collection, title, sentence, n_words).
accepted_lens = []
for collection, title, sentence in accepted:
    n_words = len(sentence.split(" "))
    accepted_lens.append((collection, title, sentence, n_words))

sentence_words = pd.Series([n_words for (_, _, _, n_words) in accepted_lens])
# Fixed-point formatting keeps describe() from switching to scientific notation.
print(sentence_words.describe().apply("{:f}".format))


count    2125106.000000
mean          25.175754
std           17.586038
min            1.000000
25%           13.000000
50%           22.000000
75%           33.000000
max          903.000000
dtype: object


In [11]:
fig = px.histogram(sentence_words, labels={"value": "Words per sentence"}, log_y=True)

# Mark the mean sentence length. The mean is a value on the x axis (words per
# sentence), so the marker has to be a vertical line at x=avg. yref="paper"
# makes it span the full plot height regardless of the logarithmic count axis.
avg = np.mean(sentence_words)
fig.add_shape(
    type="line",
    x0=avg,
    x1=avg,
    y0=0,
    y1=1,
    line=dict(color="Red"),
    xref="x",
    yref="paper",
)
fig.update_layout(showlegend=False, font_size=15)
# fig.show()

plotly.offline.plot(fig, filename="graphs/GEC_EDA_words_per_sentence.html")


'graphs/GEC_EDA_words_per_sentence.html'

In [12]:
# Sanity check: number of accepted sentences available before length filtering.
len(accepted)


2125106

In [13]:
NUM_SAMPLES = 10000

# Inclusive word-count bounds of the "mid" bucket. 13 and 33 are the 25% and
# 75% quartiles reported by the sentence-length describe() output.
LOWER_THRESHOLD = 13
UPPER_THRESHOLD = 33

accepted_range = [
    entry for entry in accepted_lens if LOWER_THRESHOLD <= entry[3] <= UPPER_THRESHOLD
]
len_lower_range = sum(1 for entry in accepted_lens if entry[3] < LOWER_THRESHOLD)
# Whatever is neither short nor mid is long.
len_upper_range = len(accepted) - len(accepted_range) - len_lower_range

bucket_sizes = (
    ("Short\t", len_lower_range),
    ("Mid\t", len(accepted_range)),
    ("Long\t", len_upper_range),
)
for label, size in bucket_sizes:
    print(label, size)


Short	 475889
Mid	 1134569
Long	 514648


In [14]:
# How much larger the mid bucket is than the short and long buckets together,
# as an absolute count and as a fraction of all accepted sentences.
n_total = len(accepted_lens)
n_mid = len(accepted_range)
mid_surplus = n_mid - len_lower_range - len_upper_range

print(mid_surplus)
print(n_mid / n_total)
print(mid_surplus / n_total)


144032
0.5338881919301908
0.06777638386038155


In [15]:
# Old version
# df_red.to_pickle("samples/articles.pkl")


In [16]:
# Seeded draw of 2 * NUM_SAMPLES mid-length sentences without replacement;
# only the second half of the draw is kept as the final sample.
random.seed(42)
oversample = random.sample(accepted_range, NUM_SAMPLES * 2)
print(len(oversample))
final_sentences = oversample[NUM_SAMPLES:]
print(len(final_sentences))
# Saving is disabled; `pickle` is not imported in the visible cells.
# with open("samples/3_3_sentences_20k.pkl", "wb") as f:
#     pickle.dump(final_sentences, f)


20000
10000
