In [91]:
from sentence_transformers import SentenceTransformer
import csv
import pandas as pd
from sklearn.mixture import GaussianMixture as GMM
import os

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [92]:
# Read the liked-comments CSV export for the CNN State of the Union video into a DataFrame.
comments_csv = '../comments/comments2/CNN-Full Speech: President Biden’s 2024 State of the Union address_liked.csv'
with open(comments_csv, mode='r') as csv_file:
    df = pd.read_csv(csv_file)
# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,comment,likes_count,stance_llama_8b
0,you cant love your country only when you win,1627,LIBERAL
1,you cant only love your country when you win t...,1050,LIBERAL
2,the number of times mike johnson nodded in agr...,702,OTHER
3,dont pay congress until they do their job,650,CONSERVATIVE
4,unions build middle class not billionaires ind...,543,LIBERAL
...,...,...,...
2870,they pumped him full of dementia medication so...,1,CONSERVATIVE
2871,well that was certainly revealing if i ever do...,1,LIBERAL
2872,i wouldnt be mentioning let alone boasting abo...,1,LIBERAL
2873,,1,OTHER


In [93]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer used to embed the comments.
# The first run downloads the model to this device; later runs load it from the device.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [94]:
# Discard every row that has a missing value in any column
# (for example a row whose comment text is empty in the CSV).
df = df.dropna(axis=0, how='any')

In [None]:
# Encode all comments in a single batched call instead of one model.encode() call per row;
# encode() accepts a list of sentences and returns one vector per sentence, in input order.
comment_vectors = model.encode(df['comment'].tolist())
# Keep the shape the following cells expect: a Series named 'comment', indexed like df,
# whose values are the individual embedding vectors.
embeddings = pd.Series(list(comment_vectors), index=df.index, name='comment')

In [None]:
# Expand the Series of embedding vectors into a wide DataFrame:
# one row per comment, one column per embedding dimension.
embeddings = embeddings.to_frame()['comment'].apply(pd.Series)

In [None]:
# Expose the expanded embeddings under the name the clustering and plotting cells use.
embeddings_df = pd.DataFrame(data=embeddings)

In [None]:
def cluster_x(embeddings_df, df, n_clusters, file_name):
    """Cluster the embeddings with a Gaussian mixture and save the labelled comments.

    Args:
        embeddings_df: feature matrix the mixture model is fitted on (one row per comment).
        df: frame the labels are attached to; it is copied, not modified.
            Its rows must be in the same order as ``embeddings_df`` because the
            labels are assigned positionally.
        n_clusters: number of mixture components.
        file_name: base name of the output file; the CSV is written to
            ``../clustered_data/cluster/<file_name>_<n_clusters>.csv``.

    Returns:
        A copy of ``df`` with an extra ``label`` column, sorted by ``label``.
        Sorting changes the row order but keeps the original index values, so
        align on the index when combining the result with ``embeddings_df``.
    """
    gmm = GMM(n_components=n_clusters, covariance_type='diag', max_iter=2000, random_state=0).fit(embeddings_df)
    labels = gmm.predict(embeddings_df)
    df_x = df.copy()
    df_x['label'] = labels
    df_x = df_x.sort_values(by='label')
    out_path = '../clustered_data/cluster/' + file_name + '_' + str(n_clusters) + '.csv'
    # to_csv does not create missing directories; make sure the target folder exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df_x.to_csv(out_path, index=False)
    return df_x

In [None]:
# Cluster the same embeddings at several granularities; each call also writes its own CSV.
speech_file_name = 'CNN-Full Speech: President Biden’s 2024 State of the Union address_liked'
df_2, df_5, df_10, df_20, df_30 = [
    cluster_x(embeddings_df, df, k, speech_file_name) for k in (2, 5, 10, 20, 30)
]

In [None]:
# Turn every column label into its string form, so the plotting helpers can add
# string-named columns without ending up with mixed label types.
embeddings_df.columns = [str(column) for column in embeddings_df.columns]

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns


def plot_tsne_no_labels(df, title, random_state=0):
    """Project the feature columns of ``df`` to 2-D with t-SNE and draw an uncoloured scatter plot.

    Args:
        df: DataFrame of features. As before, the coordinates are written back
            onto it as ``tsne-2d-one`` / ``tsne-2d-two``.
        title: plot title.
        random_state: seed passed to TSNE.

    The embedding is fitted with two components, because only two axes are
    drawn. Columns left behind by an earlier call are excluded from the fit, so
    repeated calls do not feed old coordinates back in as features.
    """
    output_cols = ['tsne-2d-one', 'tsne-2d-two']
    feature_cols = [c for c in df.columns if c not in output_cols]
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm against the installed version.
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=random_state)
    tsne_results = tsne.fit_transform(df[feature_cols])
    df['tsne-2d-one'] = tsne_results[:,0]
    df['tsne-2d-two'] = tsne_results[:,1]
    plt.figure(figsize=(16,10))
    sns.scatterplot(
        x="tsne-2d-one", y="tsne-2d-two",
        data=df,
        legend=None,
        alpha=0.3
    )
    plt.title(title)
    plt.show()


def plot_tsne(embeddings_df, labels, title, random_state=0):
    """Project the embeddings to 2-D with t-SNE and draw a scatter plot coloured by cluster label.

    Args:
        embeddings_df: DataFrame of embedding features; it is not modified.
        labels: cluster label per row. A pandas Series is aligned to
            ``embeddings_df`` by index (through column assignment), so it may be
            in a different row order; other sequences are used positionally.
        title: plot title.
        random_state: seed passed to TSNE.

    t-SNE is fitted on the embedding columns only. The label column is added
    afterwards so that the cluster assignment cannot influence the layout. Two
    components are fitted because only two axes are drawn.
    """
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm against the installed version.
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=random_state)
    tsne_results = tsne.fit_transform(embeddings_df)
    df = embeddings_df.copy()
    df['label'] = labels
    df['tsne-2d-one'] = tsne_results[:,0]
    df['tsne-2d-two'] = tsne_results[:,1]
    plt.figure(figsize=(16,10))
    sns.scatterplot(
        x="tsne-2d-one", y="tsne-2d-two",
        # Colour from the aligned column so each point gets its own row's label.
        hue="label",
        palette=sns.color_palette("hsv", len(set(labels))),
        data=df,
        legend="full",
        alpha=0.3
    )
    plt.title(title)
    plt.show()

In [None]:
# Draw the 2-D t-SNE scatter once for each clustering granularity.
for clustered, plot_title in (
    (df_2, '2 clusters'),
    (df_5, '5 clusters'),
    (df_10, '10 clusters'),
    (df_20, '20 clusters'),
    (df_30, '30 clusters'),
):
    plot_tsne(embeddings_df, clustered['label'], plot_title)

In [None]:
# 3d plot
from mpl_toolkits.mplot3d import Axes3D

def plot_tsne_3d(df, labels, title, random_state=0):
    """Project the feature columns of ``df`` to 3-D with t-SNE and draw a static 3-D scatter plot.

    Args:
        df: DataFrame of features. As before, the coordinates are written back
            onto it as ``tsne-3d-one`` / ``tsne-3d-two`` / ``tsne-3d-three``.
        labels: cluster label per row, used for the point colours. A pandas
            Series is reordered to ``df``'s index first, because matplotlib
            consumes the values positionally and the label frames produced by
            ``cluster_x`` are sorted by label.
        title: plot title.
        random_state: seed passed to TSNE.

    Columns left behind by an earlier call are excluded from the fit, so
    repeated calls on the same frame do not feed old coordinates back in as
    features.
    """
    output_cols = ['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']
    feature_cols = [c for c in df.columns if c not in output_cols]
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm against the installed version.
    tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300, random_state=random_state)
    tsne_results = tsne.fit_transform(df[feature_cols])
    df['tsne-3d-one'] = tsne_results[:,0]
    df['tsne-3d-two'] = tsne_results[:,1]
    df['tsne-3d-three'] = tsne_results[:,2]
    if isinstance(labels, pd.Series):
        # Same index alignment that plot_tsne gets from column assignment.
        labels = labels.reindex(df.index)
    fig = plt.figure(figsize=(16,10))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['tsne-3d-one'], df['tsne-3d-two'], df['tsne-3d-three'], c=labels, cmap='hsv', alpha=0.3, s=10)
    plt.title(title)
    plt.show()
    
# Draw the static 3-D t-SNE scatter once for each clustering granularity.
for clustered, plot_title in (
    (df_2, '2 clusters'),
    (df_5, '5 clusters'),
    (df_10, '10 clusters'),
    (df_20, '20 clusters'),
    (df_30, '30 clusters'),
):
    plot_tsne_3d(embeddings_df, clustered['label'], plot_title)

In [None]:
# Append the comment text as an extra column next to the embedding columns (aligned on the index).
embeddings_comment = pd.concat(objs=[embeddings_df, df['comment']], axis='columns')
embeddings_comment

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE

def plot_tsne_3d_interactive(df, labels, title, random_state=0):
    """Project the embeddings to 3-D with t-SNE and show an interactive Plotly scatter plot.

    Args:
        df: DataFrame holding the embedding columns plus a ``comment`` column
            (shown on hover). As before, the coordinates are written back onto it
            as ``tsne-3d-one`` / ``tsne-3d-two`` / ``tsne-3d-three``.
        labels: cluster label per row, used for the point colours. A pandas
            Series is reordered to ``df``'s index first, so labels taken from a
            frame in a different row order (e.g. sorted by label) still colour
            the right points.
        title: figure title.
        random_state: seed passed to TSNE.

    The features are selected by name: every column except ``comment`` and the
    three output columns. Selecting "all but the last column" stops working
    once the output columns have been appended by a previous call.
    """
    output_cols = ['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']
    feature_cols = [c for c in df.columns if c != 'comment' and c not in output_cols]
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm against the installed version.
    tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300, random_state=random_state)
    tsne_results = tsne.fit_transform(df[feature_cols])
    df['tsne-3d-one'] = tsne_results[:,0]
    df['tsne-3d-two'] = tsne_results[:,1]
    df['tsne-3d-three'] = tsne_results[:,2]

    if isinstance(labels, pd.Series):
        # Put the labels in df's row order before handing them to Plotly.
        labels = labels.reindex(df.index)

    fig = px.scatter_3d(
        df, x='tsne-3d-one', y='tsne-3d-two', z='tsne-3d-three',
        color=labels, title=title, opacity=0.7,
        # show the comment text when the mouse is over a point
        hover_data={'comment': True}
    )

    # small markers keep the dense cloud readable
    fig.update_traces(marker=dict(size=3))
    fig.update_layout(
        scene=dict(
            xaxis_title='tsne-3d-one',
            yaxis_title='tsne-3d-two',
            zaxis_title='tsne-3d-three'
        ),
        width=1200,
        height=800
    )

    fig.show()

# Example usage: one interactive figure per clustering granularity.
for clustered, plot_title in (
    (df_2, '2 clusters'),
    (df_5, '5 clusters'),
    (df_10, '10 clusters'),
    (df_20, '20 clusters'),
    (df_30, '30 clusters'),
):
    plot_tsne_3d_interactive(embeddings_comment, clustered['label'], plot_title)