# GEC Processing using SentenceTransformer


In [10]:
import numpy as np
import pandas as pd
import sys

from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer


In [2]:
# Load the pickled articles table and take a first look at its size and rows.
# NOTE: unpickling can execute arbitrary code, so only read files from a trusted source.
df = pd.read_pickle("samples/articles.pkl")
print(f"{df.shape}")
df.head(n=5)


(6514, 2)


Unnamed: 0,title,text
0,Club Handbol Martorell,"Fundat l’any 1951, disposa d’equip sènior masc..."
1,Societat Bibliogràfica Valenciana,"N’és president Adolf Pizcueta, i en foren fund..."
2,Nikolaj Nikolajevič Bogol’ubov,Treballà al departament de física matemàtica d...
3,Glinskij,"Els germans Mikhail (? — 1534), Ivan i Vasilij..."
4,Hermann Lietz,"El 1898 fundà, amb C.Reddie, la primera escola..."


In [24]:
# Number of articles kept for the small experiment.
SAMPLE_SIZE = 200

# Draw a reproducible random subset (fixed seed) and renumber the rows from 0
# so that positional and label-based indexing agree afterwards.
# NOTE: this rebinds `df`, so the full table is gone until the load cell is re-run.
df = (
    df.sample(n=SAMPLE_SIZE, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# Catalan RoBERTa checkpoint used as the sentence encoder.
# NOTE(review): this looks like a plain language-model checkpoint rather than a
# dedicated sentence-embedding model — confirm how the library pools token vectors.
model_name = "PlanTL-GOB-ES/roberta-base-ca"
model = SentenceTransformer(model_name_or_path=model_name)


In [49]:
# Encode every article text into one embedding vector per row.
# Timing note from the full run: ~6500 texts took ~10 min. The "121 MB" figure
# originally noted here presumably refers to the exported TSV file, since the
# in-memory array printed below is much smaller.

# Pass a plain list of strings: the encoder then indexes by position and does
# not depend on the DataFrame having a default 0..n-1 index.
sentence_vecs = model.encode(df["text"].tolist())

# `nbytes` is the size of the array data itself; `sys.getsizeof` only reports
# the small object header when the array does not own its buffer.
print(f"{sentence_vecs.nbytes/float(1<<20):,.2f} MB")


19.08 MB


In [60]:
# Sanity check: expect one row per encoded text.
# NOTE(review): the saved output shows 6514 rows, i.e. it was produced without the
# 200-row sampling cell — confirm which run the notebook is meant to reflect.
sentence_vecs.shape


(6514, 768)

Exporting the raw embedding vectors so they can be visualized with the TensorBoard Embedding Projector (https://projector.tensorflow.org/).


In [1]:
# Disabled export of the embedding matrix as tab-separated text.
# Uncomment the line whose file name matches the dataset size in use (200 sample vs. 6k full).
# np.savetxt("samples/1_2_article_vects_200.tsv", sentence_vecs, delimiter="\t")
# np.savetxt("samples/2_2_article_vects_6k.tsv", sentence_vecs, delimiter="\t")

In [27]:
# Build a short preview for each article: its first 30 space-separated tokens.
df["preview"] = df["text"].str.split(" ").str[:30].str.join(" ")

# Write title + preview as tab-separated metadata (header row, no index column).
df.to_csv(
    "samples/1_1_article_titles_200.tsv",
    columns=["title", "preview"],
    sep="\t",
    index=False,
)
# df[["title", "preview"]].to_csv("samples/2_1_article_titles_6k.tsv", index=False, sep="\t")


Processing and exporting the data to be visualized with the custom-made Streamlit app


In [55]:
# Reload the embedding matrix from the tab-separated export of the full (6k) run.
# NOTE: this replaces any `sentence_vecs` computed earlier in the session and
# requires the TSV file to exist from a previous export.
sentence_vecs = np.loadtxt(fname="samples/2_2_article_vects_6k.tsv", delimiter="\t")
sentence_vecs.shape


(6514, 768)

In [56]:
# For every article vector, find its most similar other articles by cosine similarity.
# Each entry of `pairs` is (row index, [neighbour row indices], [cosine scores]),
# with neighbours ordered from most to least similar.
TOP_K = 5

n_vecs = len(sentence_vecs)
# Never ask for more neighbours than there are other rows.
k = max(min(TOP_K, n_vecs - 1), 0)

pairs = []

for i in tqdm(range(n_vecs)):
    # Score row i against the whole matrix. Masking the self-match with -inf
    # keeps indices aligned with the original rows, so no per-iteration copy of
    # the matrix and no index re-adjustment are needed.
    cos_scores = cosine_similarity([sentence_vecs[i]], sentence_vecs).flatten()
    cos_scores[i] = -np.inf

    if k > 0:
        # argpartition selects the k largest scores in linear time; only those
        # k are then sorted in descending order.
        top_k = np.argpartition(cos_scores, -k)[-k:]
        top_k_sorted = top_k[np.argsort(cos_scores[top_k])[::-1]]
    else:
        top_k_sorted = np.array([], dtype=int)

    pairs.append((i, list(top_k_sorted), list(cos_scores[top_k_sorted])))


  0%|          | 0/6514 [00:00<?, ?it/s]

In [17]:
import pickle


In [57]:
# Disabled export of `pairs` as a pickle file.
# Uncomment the block whose file name matches the dataset size in use (200 sample vs. 6k full).
# with open("samples/1_3_article_pairs_200.pkl", "wb") as file:
#   pickle.dump(pairs, file)

# with open("samples/2_3_article_pairs_6k.pkl", "wb") as file:
#   pickle.dump(pairs, file)
