In [13]:
import json
import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.mixture import GaussianMixture as GMM

# Set in the first cell, before the model is loaded or any text is encoded.
# NOTE(review): presumably this turns off the Hugging Face tokenizers' internal
# parallelism to avoid its fork-related warning — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [14]:
# Load the cleaned comments from disk.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so the file
# is decoded the same way on every platform instead of using the locale default.
with open('../data/cleaned_comments.json', encoding='utf-8') as comments_file:
    comments = json.load(comments_file)

In [15]:
# Wrap the loaded comments in a single-column frame; the column is named 'text'.
df = pd.DataFrame(dict(text=comments))

In [16]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model.
# The first run downloads the model to this machine; later runs load it from
# the local copy instead of downloading again.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [17]:
# convert text into embeddings
# Encode every comment in one batched call rather than invoking model.encode()
# once per row, which pays the per-call overhead for each comment.
sentence_vectors = model.encode(df['text'].tolist())
# Re-wrap the result as a Series of 1-D vectors that keeps df's index and the
# 'text' name, i.e. the same structure the per-row .map() produced.
embeddings = pd.Series(list(sentence_vectors), index=df.index, name='text')

In [18]:
# converts embeddings into a dataframe
# Stack the per-comment vectors into one 2-D array in a single step instead of
# building a separate pd.Series for every row via .apply(pd.Series).
# The result has one row per comment (original index kept) and one
# integer-labelled column per embedding dimension.
# NOTE: this cell rebinds `embeddings` from a Series to a DataFrame, so it must
# be run exactly once after the encoding cell.
embeddings = pd.DataFrame(np.vstack(embeddings.tolist()), index=embeddings.index)

In [19]:
# `embeddings` is already a DataFrame here; this binds it to the name the
# clustering cell reads.
embeddings_df = pd.DataFrame(embeddings)

In [20]:
# Configure a 10-component Gaussian mixture with diagonal covariances.
# random_state is fixed so repeated runs start from the same initialisation.
gmm = GMM(
    n_components=10,
    covariance_type='diag',
    max_iter=2000,
    random_state=0,
)
# Fit the mixture on the embedding matrix (fit() updates `gmm` in place).
gmm.fit(embeddings_df)
# Assign each comment to its most likely component.
labels = gmm.predict(embeddings_df)

In [21]:
# Attach the predicted cluster label to each comment. This adds (or overwrites)
# the 'label' column on `df` in place.
df['label'] = labels
# Write the text and label columns to CSV; index=False leaves the row index out of the file.
df.to_csv('../data/clustered_comments.csv', index=False)

In [22]:
# Show the data frame with its cluster labels (the rendered output elides the
# middle rows of a long frame).
df

Unnamed: 0,text,label
0,i normally dont stay places long but i will tr...,2
1,i really need help the police force every wher...,2
2,crooked judge just wants trump to pay for the ...,0
3,this fool senile judge will look cute in his o...,7
4,where are the victims they are willing to dest...,9
...,...,...
3896,thats why trump wants laura trump to have a pl...,1
3897,please donate to a homeless bone spurs veteran,6
3898,such bs and 1 sided ruling none of this will s...,2
3899,horrid week for republicans lovin it,3
