In [6]:
import json
from pathlib import Path

from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

# Data prep

In [7]:
# Collect every JSONL annotation file under the ground-truth directory.
# Sorting makes the order of `posts` (and everything derived from it)
# independent of the file-system enumeration order.
json_files = sorted(Path("./data/ground_truth").rglob("*.jsonl"))

posts = []

for file in json_files:
    # The posts contain non-ASCII (Polish) text, so the encoding is fixed
    # explicitly instead of relying on the platform default.
    with open(file, 'r', encoding='utf-8') as f:
        # One JSON document per line; blank lines (e.g. a trailing newline
        # at the end of the file) are skipped rather than passed to json.loads.
        data = [json.loads(line) for line in f if line.strip()]
        posts.extend(data)

In [8]:
# Label names that get_labels excludes when it collects word-level spans.
sentence_sentiment = [
    'Mowa nienawiści',
    'Neutralny',
    'Hate',
    'Neutral',
]

In [9]:
def get_labels(text, labels,
               sentence_labels=('Mowa nienawiści', 'Neutralny', 'Hate', 'Neutral')):
    """Return the labelled fragments of ``text`` that are not sentence-level labels.

    Args:
        text: The post text that the label offsets refer to.
        labels: Iterable of annotations indexed as ``(start, end, label_name)``;
            ``text[start:end]`` is the annotated fragment.
        sentence_labels: Label names to skip. The default is held by the
            function itself instead of being read from the notebook global
            ``sentence_sentiment``, because later cells rebind that name to a
            per-post list of "Hate"/"Neutral" values, which would silently
            change the filter whenever this function is called afterwards.

    Returns:
        A list of ``(fragment_text, label_name)`` tuples, in annotation order.
    """
    return [
        (text[label[0]: label[1]], label[2])
        for label in labels
        if label[2] not in sentence_labels
    ]

In [10]:
def get_tsne_results(all_words, random_state=42):
    """Project word embeddings with t-SNE and return them as a DataFrame.

    Args:
        all_words: Iterable of dicts with the keys ``"name"``, ``"label"`` and
            ``"embedding"``. Entries whose ``"embedding"`` is ``None`` are
            ignored.
        random_state: Seed passed to ``TSNE`` so that repeated runs produce
            the same layout.

    Returns:
        A DataFrame holding the t-SNE coordinates (one row per embedded word)
        plus a ``"label"`` column with the word text and a ``"sentiment"``
        column with the word's label.

    Raises:
        ValueError: If no entry in ``all_words`` has an embedding.
    """
    # Filter once so that the coordinates and both metadata columns are
    # guaranteed to be built from the same rows in the same order.
    embedded_words = [word for word in all_words if word["embedding"] is not None]
    if not embedded_words:
        raise ValueError("get_tsne_results: no words with an embedding to project")

    emb_tensor = np.stack([word["embedding"] for word in embedded_words])

    tsne = TSNE(random_state=random_state)
    clustered = tsne.fit_transform(emb_tensor)

    df = pd.DataFrame(clustered)
    df["label"] = [word["name"] for word in embedded_words]
    df["sentiment"] = [word["label"] for word in embedded_words]

    return df

# Word embeddings - Word2vec

In [11]:
# Word2vec vectors stored in the text (non-binary) word2vec format.
file_path = 'nkjp+wiki-forms-all-100-skipg-ns.txt.gz'

model = KeyedVectors.load_word2vec_format(
    file_path,
    binary=False,
)

KeyboardInterrupt: 

In [7]:
# Build one record per whitespace-separated word: its text, its Word2vec
# vector (None when the word is not in the model) and its word-level label.
all_words = []

for post in posts:
    text = post["text"]
    labels = get_labels(post["text"], post["label"])
    # Split every labelled fragment the same way as the post itself, so a
    # word is compared against whole tokens of the fragment.
    labelled_tokens = [(set(fragment.split(" ")), name) for fragment, name in labels]
    words = text.split(" ")

    for word in words:
        new_word = {"name": word, "label": None, "embedding": None}
        if word in model:
            new_word["embedding"] = model[word]

        # Whole-token match. The previous `word in fragment` was a substring
        # test on a string, so short words (and "") matched inside unrelated
        # labelled tokens. When several labels match, the last one wins.
        for tokens, name in labelled_tokens:
            if word in tokens:
                new_word["label"] = name
        all_words.append(new_word)

In [8]:
# Run t-SNE over the words collected above that have an embedding.
df = get_tsne_results(all_words)

In [9]:
# A title is set so the figure can be told apart from the other t-SNE plots
# in this notebook when it is viewed on its own.
fig = px.scatter(
    df,
    x=0,
    y=1,
    hover_data=df.columns,
    color="sentiment",
    title="t-SNE of Word2vec word embeddings",
)
fig.show()

# Word embeddings - FastText

In [12]:
from flair.embeddings import WordEmbeddings
from flair.data import Sentence

# NOTE: this rebinds `model`, replacing the KeyedVectors object loaded in the
# Word2vec section; re-run that load cell before re-running the Word2vec loop.
model = WordEmbeddings('pl')

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Build one record per flair token: its text, its embedding and its
# word-level label.
all_words = []

for post in posts:
    text = post["text"]
    labels = get_labels(post["text"], post["label"])
    # Tokenise every labelled fragment with the same flair tokenizer as the
    # post, so tokens are compared against whole tokens of the fragment.
    labelled_tokens = [
        ({fragment_token.text for fragment_token in Sentence(fragment)}, name)
        for fragment, name in labels
        if fragment.strip()
    ]
    sentence = Sentence(text)
    model.embed(sentence)

    for token in sentence:
        new_word = {
            "name": token.text,
            "label": None,
            # Stored as a CPU numpy array so the later np.stack call does not
            # depend on the device the embedding tensor lives on.
            "embedding": token.embedding.detach().cpu().numpy(),
        }

        # Whole-token match. The previous `token.text in fragment` was a
        # substring test on a string, so short tokens matched inside unrelated
        # labelled tokens. When several labels match, the last one wins.
        for tokens, name in labelled_tokens:
            if token.text in tokens:
                new_word["label"] = name
        all_words.append(new_word)

In [14]:
# Run t-SNE over the FastText token embeddings and plot the result. A title is
# set so the figure can be told apart from the other t-SNE plots in this
# notebook when it is viewed on its own.
df = get_tsne_results(all_words)
fig = px.scatter(
    df,
    x=0,
    y=1,
    hover_data=df.columns,
    color="sentiment",
    title="t-SNE of FastText word embeddings",
)
fig.show()

# Sentence embeddings - TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Fit TF-IDF on the raw text of every post and transform it in one step.
tfidf = TfidfVectorizer()
sentences = [post["text"] for post in posts]
vectorized = tfidf.fit_transform(sentences)

In [17]:
# One "Hate"/"Neutral" value per post, in the order of `posts`.
# The data-prep cell lists sentence-level labels in both Polish and English,
# so both hate spellings are checked; testing only "Hate" would mark posts
# annotated with 'Mowa nienawiści' as neutral.
hate_labels = ("Hate", "Mowa nienawiści")

sentence_sentiment = []

for post in posts:
    labels = [label[2] for label in post["label"]]
    if any(name in hate_labels for name in labels):
        sentence_sentiment.append("Hate")
    else:
        sentence_sentiment.append("Neutral")

In [18]:
# t-SNE is stochastic; a fixed seed keeps the layout identical across runs.
tsne = TSNE(random_state=42)
# The TF-IDF matrix is converted to a dense array before being passed to t-SNE.
clustered = tsne.fit_transform(vectorized.toarray())

# One row per post: t-SNE coordinates, the post text and its sentence label.
df = pd.DataFrame(clustered)
df["label"] = [post["text"] for post in posts]
df["sentiment"] = sentence_sentiment

In [19]:
# A title is set so the figure can be told apart from the other t-SNE plots
# in this notebook when it is viewed on its own.
fig = px.scatter(
    df,
    x=0,
    y=1,
    hover_data=df.columns,
    color="sentiment",
    title="t-SNE of TF-IDF sentence vectors",
)
fig.show()

# Sentence embeddings - BERT

In [20]:
from flair.data import Sentence
from flair.embeddings import (
    StackedEmbeddings,
    TransformerDocumentEmbeddings,
    WordEmbeddings,
)

# Combine the 'pl' word embeddings with a multilingual BERT document embedding.
stacked_embeddings = StackedEmbeddings(
    [
        WordEmbeddings('pl'),
        TransformerDocumentEmbeddings('bert-base-multilingual-cased'),
    ]
)

In [31]:
# Embed every post and collect one vector per post.
# The vectors are gathered under a separate name so that `embeddings` is only
# ever bound to the final stacked array.
post_vectors = []
for post in posts:
    sentence = Sentence(post["text"])

    stacked_embeddings.embed(sentence)
    embedding = sentence.get_embedding()
    # Convert to a CPU numpy array explicitly: np.stack cannot consume tensors
    # that live on a GPU or that still require gradients.
    post_vectors.append(embedding.detach().cpu().numpy())

embeddings = np.stack(post_vectors)

In [33]:
# One "Hate"/"Neutral" value per post, in the order of `posts`.
# The data-prep cell lists sentence-level labels in both Polish and English,
# so both hate spellings are checked; testing only "Hate" would mark posts
# annotated with 'Mowa nienawiści' as neutral.
hate_labels = ("Hate", "Mowa nienawiści")

sentence_sentiment = []

for post in posts:
    labels = [label[2] for label in post["label"]]
    if any(name in hate_labels for name in labels):
        sentence_sentiment.append("Hate")
    else:
        sentence_sentiment.append("Neutral")

In [34]:
# t-SNE is stochastic; a fixed seed keeps the layout identical across runs.
tsne = TSNE(random_state=42)
clustered = tsne.fit_transform(embeddings)

# One row per post: t-SNE coordinates, the post text and its sentence label.
df = pd.DataFrame(clustered)
df["label"] = [post["text"] for post in posts]
df["sentiment"] = sentence_sentiment

In [36]:
# A title is set so the figure can be told apart from the other t-SNE plots
# in this notebook when it is viewed on its own.
fig = px.scatter(
    df,
    x=0,
    y=1,
    hover_data=df.columns,
    color="sentiment",
    title="t-SNE of BERT sentence embeddings",
)
fig.show()