In [1]:
import json
from pathlib import Path

from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
import hdbscan
from sklearn.cluster import KMeans
import spacy
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from flair.embeddings import WordEmbeddings, TransformerDocumentEmbeddings, StackedEmbeddings
from flair.data import Sentence
import fasttext
import fasttext.util
warnings.filterwarnings("ignore")

In [8]:
# Polish spaCy pipeline; it is handed to preprocess_text, which reads lemma_ / is_stop from its tokens.
nlp = spacy.load("pl_core_news_sm")
# Fetch the Polish fastText model file (the cell output shows the returned file name).
# NOTE(review): if_exists='ignore' presumably skips the download when the file is already present — confirm.
fasttext.util.download_model('pl', if_exists='ignore')

'cc.pl.300.bin'

In [2]:
def save_data(data, path):
    """Serialise ``data`` as JSON and write it to ``path``.

    The payload is rendered to a string *before* the file is opened, so an
    object that ``json`` cannot encode raises ``TypeError`` without truncating
    a file that already exists at ``path``. The file is written as UTF-8.

    Args:
        data: Any JSON-serialisable object.
        path: Destination file path.

    Raises:
        TypeError: If ``data`` contains something ``json`` cannot encode.
    """
    payload = json.dumps(data)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)

In [30]:
def load_data(path):
    """Read the JSON document stored at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 instead of the platform's locale
    encoding, so files containing raw non-ASCII characters (e.g. Polish
    diacritics) decode the same way on every machine.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

In [None]:
def save_embeddings(all_words, path):
    """Write every word record that has an embedding to ``path`` as JSON.

    Records whose ``"embedding"`` is ``None`` are left out. For the others the
    embedding is converted with ``.tolist()`` so it can be JSON-encoded; only
    the ``"name"``, ``"label"`` and ``"embedding"`` keys are stored.
    """
    serialisable = [
        {
            "name": record["name"],
            "label": record["label"],
            "embedding": record["embedding"].tolist(),
        }
        for record in all_words
        if record["embedding"] is not None
    ]
    save_data(serialisable, path)

In [None]:
def load_embeddings(path):
    """Load word records from ``path`` and turn each embedding into a NumPy array.

    Returns the list produced by ``load_data``; each record's ``"embedding"``
    entry is replaced in place by ``np.array`` of the stored value.
    """
    records = load_data(path)
    for record in records:
        record.update(embedding=np.array(record["embedding"]))
    return records

# Data prep

In [2]:
# Sorted so that the order of `posts` does not depend on filesystem enumeration order.
json_files = sorted(Path("./data/ground_truth").rglob("*.jsonl"))

posts = []
for file in json_files:
    # JSON Lines: one JSON document per line. The file is read explicitly as UTF-8
    # (the posts contain Polish text), and blank lines — e.g. a trailing newline at
    # the end of the file — are skipped instead of crashing json.loads.
    with open(file, 'r', encoding="utf-8") as f:
        posts.extend(json.loads(line) for line in f if line.strip())

In [3]:
# Annotation label names (Polish): hate speech, neutral, strengthening, weakening, reversing.
sentence_sentiment = ['Mowa nienawiści', 'Neutralny', 'Wzmacnianie', 'Osłabianie', 'Odwracanie']

In [4]:
# Label names kept by get_labels by default. This is a dedicated, immutable constant
# because the notebook later rebinds the global `sentence_sentiment` to a per-post
# list of sentiments; reading that global here would silently change the filter
# depending on which cells have already been executed.
WORD_LABELS = ('Mowa nienawiści', 'Neutralny', 'Wzmacnianie', 'Osłabianie', 'Odwracanie')


def get_labels(text, labels, allowed_labels=WORD_LABELS):
    """Return the labelled spans of ``text`` as ``(span_text, label_name)`` pairs.

    Args:
        text: The string the label offsets index into.
        labels: Iterable of ``[start, end, name]`` entries; ``start``/``end``
            are used as slice bounds on ``text``.
        allowed_labels: Label names to keep; entries with any other name are
            skipped. Defaults to ``WORD_LABELS``.

    Returns:
        list[tuple[str, str]]: One pair per kept label, in input order.
    """
    return [
        (text[label[0]: label[1]], label[2])
        for label in labels
        if label[2] in allowed_labels
    ]

In [6]:
def get_tsne_results(all_words, random_state=42):
    """Project the word embeddings with t-SNE and return them as a DataFrame.

    Args:
        all_words: Iterable of dicts with ``"name"``, ``"label"`` and
            ``"embedding"`` keys. Entries whose embedding is ``None`` are
            ignored.
        random_state: Seed passed to ``TSNE`` so the layout is reproducible
            between runs. Pass ``None`` for an unseeded run.

    Returns:
        pandas.DataFrame: the t-SNE coordinates (integer column names as
        produced by ``pd.DataFrame``) plus ``"label"`` (the word) and
        ``"sentiment"`` (the word's label).

    Raises:
        ValueError: If no entry in ``all_words`` has an embedding.
    """
    # Filter once so coordinates, names and labels are guaranteed to stay aligned.
    embedded_words = [word for word in all_words if word["embedding"] is not None]
    if not embedded_words:
        raise ValueError("get_tsne_results needs at least one word with an embedding")
    emb_tensor = np.stack([word["embedding"] for word in embedded_words])

    tsne = TSNE(random_state=random_state)
    clustered = tsne.fit_transform(emb_tensor)

    df = pd.DataFrame(clustered)
    df["label"] = [word["name"] for word in embedded_words]
    df["sentiment"] = [word["label"] for word in embedded_words]

    return df

In [5]:
def preprocess_text(nlp, text):
    """Clean a raw post and return its lemmas as one space-separated string.

    Steps, in order: strip HTML markup, drop @mentions and #hashtags, drop
    URLs, drop common punctuation, drop the ``_account`` placeholder,
    lower-case, normalise the typographic apostrophe, remove backslashes, then
    run the spaCy pipeline and keep the lemma of every non-stop-word token.

    Args:
        nlp: A loaded spaCy pipeline; it is called on the cleaned string and
            its tokens must expose ``lemma_`` and ``is_stop``.
        text: Raw post text, possibly containing HTML.

    Returns:
        str: The kept lemmas joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed, so the extracted text could differ between machines.
    text = BeautifulSoup(text, "html.parser").get_text()
    # \w is Unicode-aware and includes "_", so handles such as "@jan_kowalski" and
    # hashtags with Polish diacritics ("#miłość") are removed whole instead of
    # leaving a tail behind, which the previous ASCII-only character class did.
    text = ' '.join(re.sub(r"[@#]\w+", " ", text).split())
    text = ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", text).split())
    text = ' '.join(re.sub(r"[\.\,\!\?\:\;\-\=\)]", " ", text).split())
    text = ' '.join(re.sub(r"_account", " ", text).split())
    text = text.lower()
    text = text.replace("’", "'")
    text = text.replace('\\', '')
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if not token.is_stop)

# Word embeddings - Word2vec

In [8]:
# Pre-trained vectors in word2vec text format (binary=False), read from a gzip-compressed file.
file_path = "nkjp+wiki-lemmas-all-300-cbow-hs.txt.gz"
model_w2v = KeyedVectors.load_word2vec_format(file_path, binary=False)

In [9]:
all_words_w2v = []

for post in posts:
    text = post["text"]
    text_clean = preprocess_text(nlp, text)
    # The label offsets are assumed to index the raw post["text"] they were annotated
    # on, so the spans are cut from the raw text. Each span is then preprocessed the
    # same way as the post so its tokens are comparable with the cleaned words.
    labeled_tokens = [
        (set(preprocess_text(nlp, span).split()), sentiment)
        for span, sentiment in get_labels(text, post["label"])
    ]

    # split() without an argument yields no tokens for an empty cleaned text,
    # instead of a single empty-string "word".
    for word in text_clean.split():
        new_word = {"name": word, "label": None, "embedding": None}
        if word in model_w2v:
            new_word["embedding"] = model_w2v[word]

        # Whole-token membership (not substring search); when several spans contain
        # the word, the last one wins, as before.
        for span_tokens, sentiment in labeled_tokens:
            if word in span_tokens:
                new_word["label"] = sentiment
        all_words_w2v.append(new_word)

In [17]:
# t-SNE coordinates for every word2vec word that received an embedding.
df_w2v = get_tsne_results(all_words_w2v)

In [18]:
# Interactive scatter of the two t-SNE columns (0 and 1), coloured by each word's label;
# hovering shows every column of the frame.
fig = px.scatter(df_w2v, x=0, y=1, hover_data=df_w2v.columns, color="sentiment")
fig.show()

In [16]:
def k_similar_words(model: "KeyedVectors", all_words, k):
    """Collect the ``k`` most similar vocabulary entries for every embedded word.

    ``model.most_similar`` is queried once per distinct word and the result is
    reused for repeated occurrences, since the same word appears many times in
    ``all_words``.

    Args:
        model: Object exposing ``most_similar(word, topn=k)`` that returns
            ``(word, score)`` pairs.
        all_words: Iterable of dicts with ``"name"``, ``"label"`` and
            ``"embedding"`` keys; entries with ``embedding is None`` are skipped.
        k: Number of neighbours to request.

    Returns:
        list[dict]: One dict per embedded word occurrence with keys ``"word"``,
        ``"similar_words"`` (list of ``(word, score)`` tuples) and ``"label"``.
    """
    neighbours_by_word = {}
    all_similar_words = []
    for word in all_words:
        if word["embedding"] is None:
            continue
        name = word["name"]
        if name not in neighbours_by_word:
            neighbours_by_word[name] = [
                (similar_word[0], similar_word[1])
                for similar_word in model.most_similar(name, topn=k)
            ]
        all_similar_words.append({
            "word": name,
            # Fresh list per record so that mutating one record cannot affect another.
            "similar_words": list(neighbours_by_word[name]),
            "label": word["label"]
        })
    return all_similar_words

In [21]:
# Three most similar word2vec vocabulary entries for every embedded word.
w2v_similar_words = k_similar_words(model_w2v, all_words_w2v, 3)

# Word embeddings - FastText

In [8]:
# Load the fastText model file named in the output of the download cell ('cc.pl.300.bin').
ft = fasttext.load_model('cc.pl.300.bin')

In [10]:
all_words_ft = []

for post in posts:
    text = post["text"]
    clean_text = preprocess_text(nlp, text)
    # The label offsets are assumed to index the raw post["text"] they were annotated
    # on, so the spans are cut from the raw text. Each span is then preprocessed the
    # same way as the post so its tokens are comparable with the cleaned words.
    labeled_tokens = [
        (set(preprocess_text(nlp, span).split()), sentiment)
        for span, sentiment in get_labels(text, post["label"])
    ]

    # split() without an argument yields no tokens for an empty cleaned text,
    # instead of a single empty-string "word".
    for word in clean_text.split():
        new_word = {"name": word, "label": None, "embedding": None}
        if word in ft:
            new_word["embedding"] = ft[word]

        # Whole-token membership (not substring search); when several spans contain
        # the word, the last one wins, as before.
        for span_tokens, sentiment in labeled_tokens:
            if word in span_tokens:
                new_word["label"] = sentiment
        all_words_ft.append(new_word)

In [14]:
# Project the fastText word vectors with t-SNE and plot them, coloured by each word's label.
df = get_tsne_results(all_words_ft)
fig = px.scatter(df, x=0, y=1, hover_data=df.columns, color="sentiment")
fig.show()

In [24]:
def k_nearest_words(model, all_words, k):
    """Collect the ``k`` nearest neighbours for every embedded word.

    ``model.get_nearest_neighbors`` is queried once per distinct word and the
    result is reused for repeated occurrences, since the same word appears
    many times in ``all_words``.

    Args:
        model: Object exposing ``get_nearest_neighbors(word, k=k)``; each
            returned item is indexed as ``item[0]`` / ``item[1]``.
        all_words: Iterable of dicts with ``"name"``, ``"label"`` and
            ``"embedding"`` keys; entries with ``embedding is None`` are skipped.
        k: Number of neighbours to request.

    Returns:
        list[dict]: One dict per embedded word occurrence with keys ``"word"``,
        ``"nearest_words"`` and ``"label"``. Each neighbour is stored with its
        two elements swapped relative to what the model returned
        (``(item[1], item[0])``).
    """
    neighbours_by_word = {}
    all_nearest_words = []
    for word in all_words:
        if word["embedding"] is None:
            continue
        name = word["name"]
        if name not in neighbours_by_word:
            neighbours_by_word[name] = [
                (nearest_word[1], nearest_word[0])
                for nearest_word in model.get_nearest_neighbors(name, k=k)
            ]
        all_nearest_words.append({
            "word": name,
            # Fresh list per record so that mutating one record cannot affect another.
            "nearest_words": list(neighbours_by_word[name]),
            "label": word["label"]
        })
    return all_nearest_words

In [25]:
# Three nearest fastText neighbours for every embedded word.
ft_similar_words = k_nearest_words(ft, all_words_ft, 3)

# Sentence embeddings - TF-IDF

In [9]:
# One cleaned, lemmatised string per post; these strings are the TF-IDF documents.
sentences = [preprocess_text(nlp, post["text"]) for post in posts]
tfidf = TfidfVectorizer()
# The result is converted with .toarray() before t-SNE / HDBSCAN in the cells below.
vectorized = tfidf.fit_transform(sentences)

In [10]:
# NOTE: this rebinds `sentence_sentiment`, which an earlier cell assigned to the list of label names.
# One sentiment per post: "Mowa nienawiści" when any of the post's labels carries that name,
# otherwise "Neutralny".
sentence_sentiment = [
    "Mowa nienawiści"
    if "Mowa nienawiści" in [label[2] for label in post["label"]]
    else "Neutralny"
    for post in posts
]

In [11]:
# Seeded so that the projection (and the scatter plot built from it) is reproducible across runs.
tsne = TSNE(random_state=42)
clustered = tsne.fit_transform(vectorized.toarray())

# Coordinates plus the raw post text and the per-post sentiment for hovering / colouring.
df = pd.DataFrame(clustered)
df["label"] = [post["text"] for post in posts]
df["sentiment"] = sentence_sentiment

In [12]:
# Scatter of the TF-IDF t-SNE coordinates, coloured by per-post sentiment.
fig = px.scatter(df, x=0, y=1, hover_data=df.columns, color="sentiment")
fig.show()

In [25]:
# Assign each TF-IDF vector to one of 5 k-means clusters (seeded via random_state).
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(vectorized)

In [26]:
# HDBSCAN clustering of the same TF-IDF vectors; the matrix is passed as a dense array here.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=2)
hdbscan_labels = hdbscan_clusterer.fit_predict(vectorized.toarray())

In [27]:
# Seeded 2-D t-SNE projection of the dense TF-IDF matrix; used as plot coordinates in the next cell.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
tsne_results = tsne.fit_transform(vectorized.toarray())

In [28]:
# t-SNE coordinates (columns 0 and 1) together with both cluster assignments,
# the cleaned sentence and the per-post sentiment.
df = pd.DataFrame(tsne_results, columns=[0, 1]).assign(
    kmeans_cluster=kmeans_labels,
    hdbscan_cluster=hdbscan_labels,
    sentence=sentences,
    sentiment=sentence_sentiment,
)

In [46]:
# TF-IDF t-SNE scatter coloured by k-means cluster; hovering shows the cleaned sentence.
fig_kmeans = px.scatter(df, x=0, y=1, hover_data=['sentence'], color="kmeans_cluster")
fig_kmeans.show()

In [45]:
# TF-IDF t-SNE scatter coloured by HDBSCAN cluster; hovering shows the cleaned sentence.
fig_hdbscan = px.scatter(df, x=0, y=1, hover_data=['sentence'], color="hdbscan_cluster")
fig_hdbscan.show()

# Sentence embeddings - BERT

In [31]:
# Combines Polish word embeddings with a multilingual BERT document embedding.
# NOTE(review): the next cell reads only sentence.get_embedding(); WordEmbeddings presumably
# populates token-level vectors, so it may not contribute to that value — confirm against flair's docs.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('pl'),
    TransformerDocumentEmbeddings('bert-base-multilingual-cased')
])

In [32]:
embeddings = []
for post in posts:
    sentence = Sentence(preprocess_text(nlp, post["text"]))

    stacked_embeddings.embed(sentence)
    # get_embedding() is expected to return a torch tensor. Detach it and move it to
    # the CPU before converting, so the conversion also works when the model runs on
    # a GPU or the tensor still tracks gradients (np.stack on such tensors fails).
    embeddings.append(sentence.get_embedding().detach().cpu().numpy())

# One row per post.
embeddings = np.stack(embeddings)

In [33]:
# NOTE: this rebinds `sentence_sentiment`, which an earlier cell assigned to the list of label names.
# One sentiment per post: "Mowa nienawiści" when any of the post's labels carries that name,
# otherwise "Neutralny".
sentence_sentiment = [
    "Mowa nienawiści"
    if "Mowa nienawiści" in [label[2] for label in post["label"]]
    else "Neutralny"
    for post in posts
]

In [34]:
# Seeded so that the projection (and the scatter plot built from it) is reproducible across runs.
tsne = TSNE(random_state=42)
clustered = tsne.fit_transform(embeddings)

# Coordinates plus the raw post text and the per-post sentiment for hovering / colouring.
df = pd.DataFrame(clustered)
df["label"] = [post["text"] for post in posts]
df["sentiment"] = sentence_sentiment

In [35]:
# Scatter of the sentence-embedding t-SNE coordinates, coloured by per-post sentiment.
fig = px.scatter(df, x=0, y=1, hover_data=df.columns, color="sentiment")
fig.show()

In [36]:
# Assign each sentence embedding to one of 5 k-means clusters (seeded via random_state).
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(embeddings)

In [37]:
# HDBSCAN clustering of the same sentence embeddings.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=2)
hdbscan_labels = hdbscan_clusterer.fit_predict(embeddings)

In [38]:
# Seeded 2-D t-SNE projection of the sentence embeddings; used as plot coordinates in the next cell.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
tsne_results = tsne.fit_transform(embeddings)

In [39]:
# t-SNE coordinates (columns 0 and 1) together with both cluster assignments,
# the cleaned sentence and the per-post sentiment. `sentences` is the list of
# cleaned post texts built in the TF-IDF section.
df = pd.DataFrame(tsne_results, columns=[0, 1]).assign(
    kmeans_cluster=kmeans_labels,
    hdbscan_cluster=hdbscan_labels,
    sentence=sentences,
    sentiment=sentence_sentiment,
)

In [44]:
# Sentence-embedding t-SNE scatter coloured by k-means cluster; hovering shows the cleaned sentence.
fig_kmeans = px.scatter(df, x=0, y=1, hover_data=['sentence'], color="kmeans_cluster")
fig_kmeans.show()

In [42]:
# Sentence-embedding t-SNE scatter coloured by HDBSCAN cluster; hovering shows the cleaned sentence.
fig_hdbscan = px.scatter(df, x=0, y=1, hover_data=['sentence'], color="hdbscan_cluster")
fig_hdbscan.show()