In [None]:
import pandas as pd
import pickle

In [None]:
# Folder that is scanned for the raw CSV files (relative to the working directory).
dataset = "datasets/"

In [2]:
import os
import pandas as pd

def load_and_process_data(folder_path):
    """Load every CSV in ``folder_path`` and reduce it to the rows of interest.

    The files are concatenated into one frame, which is then narrowed in
    stages: rows whose ``lang`` equals ``"en"``, rows whose ``post_type`` is
    neither ``'0'`` nor ``'1'``, and finally one row per ``item_id``.
    Value counts and the frame shape after each stage are printed as a
    running diagnostic.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose name
            ends in ``.csv``.

    Returns:
        The filtered ``pandas.DataFrame``. Index labels are the ones assigned
        by the concatenation; the index is not reset after filtering.

    Raises:
        ValueError: If ``folder_path`` contains no ``.csv`` file.
    """
    # os.listdir returns entries in arbitrary order. Sorting pins the
    # concatenation order so that keep='first' below always retains the same
    # row for a duplicated item_id, run after run.
    csv_files = sorted(
        os.path.join(folder_path, file)
        for file in os.listdir(folder_path)
        if file.endswith('.csv')
    )
    print(len(csv_files))
    print(csv_files)
    if not csv_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No .csv files found in {folder_path!r}")

    df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    print(df["lang"].value_counts())
    print(df["post_type"].value_counts())
    print(df.shape)

    df = df[df["lang"] == "en"]
    print(df.shape)

    # NOTE(review): post_type is compared against the *strings* '0' and '1'.
    # If pandas parses the column as integers these two filters match nothing
    # -- confirm the column dtype against the printed value counts.
    df = df[df["post_type"] != '0']
    print(df.shape)
    df = df[df["post_type"] != '1']
    print(df.shape)

    df = df.drop_duplicates(subset='item_id', keep='first')
    print(df.shape)
    return df


In [None]:
# Build the filtered frame from every CSV found under the dataset folder.
df = load_and_process_data(folder_path=dataset)

In [None]:
# One row per distinct post text. Rows with missing content are dropped first:
# the tokenizer applied to this column later only accepts strings, so a NaN
# would abort the (long) embedding loop part-way through.
df_unique = (
    df[['content']]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)


In [10]:
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# Checkpoint name shared by the tokenizer and the encoder.
model_name = 'bert-base-uncased'

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
# Module.to() moves the parameters and returns the module itself, so the
# load and the device transfer can be chained.
model = BertModel.from_pretrained(model_name).to(device)

def get_embedding(text):
    """Embed ``text`` as the mean of the encoder's ``last_hidden_state``.

    The text is tokenized with truncation enabled, run through the
    module-level ``model`` on ``device`` without gradient tracking, and the
    resulting ``last_hidden_state`` is averaged over dim 1.

    Args:
        text: Input passed straight to the module-level ``tokenizer``.

    Returns:
        A flattened NumPy array holding the averaged hidden state.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # dim 1 is presumably the token axis of (batch, tokens, hidden) -- TODO confirm.
    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy().flatten()

# Embed every unique text, one call per text; tqdm renders the progress bar.
embeddings = [get_embedding(text) for text in tqdm(df_unique['content'])]


embedding_file = "content_embeddings.pkl"
# Persist (text, vector) pairs. The context manager guarantees the handle is
# flushed and closed; passing a bare open(...) to pickle.dump leaves that to
# garbage collection.
with open(embedding_file, 'wb') as fh:
    pickle.dump(list(zip(df_unique['content'], embeddings)), fh)

print(f"Embeddings saved to {embedding_file}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 338602/338602 [39:05<00:00, 144.37it/s]


Embeddings saved to content_embeddings.pkl


In [1]:
import numpy as np
import pickle
embedding_file = "content_embeddings.pkl"
# NOTE(security): unpickling can execute arbitrary code. Only load a file
# that was produced by this notebook, never one from an untrusted source.
# The context manager closes the handle as soon as the load finishes.
with open(embedding_file, 'rb') as fh:
    content_embeddings = pickle.load(fh)


# Each entry is unpacked as a (text, vector) pair: stack the vectors into one
# array and keep the texts in the same order so row i of X matches
# content_texts[i].
X = np.array([emb for _, emb in content_embeddings])
content_texts = [text for text, _ in content_embeddings]

print(f"Loaded embeddings from {embedding_file}")


Loaded embeddings from content_embeddings.pkl


In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA

# Project the embeddings onto 3 principal components before clustering.
pca = PCA(n_components=3, random_state=42)
X_reduced = pca.fit_transform(X)

# Number of clusters; random_state is fixed so the run is repeatable.
best_k = 60
kmeans = MiniBatchKMeans(n_clusters=best_k, random_state=42, batch_size=1024, n_init='auto')
labels = kmeans.fit_predict(X_reduced)

# labels[i] is the cluster assigned to content_texts[i].
df_clustered = pd.DataFrame({'content': content_texts, 'cluster': labels})

cluster_file = "final_clustered_tweets.pkl"
# Context manager so the file is flushed and closed deterministically.
with open(cluster_file, 'wb') as fh:
    pickle.dump(df_clustered, fh)

# For every cluster keep the (up to) 100 texts whose reduced embedding lies
# closest to the cluster centroid.
top_texts = []
for cluster_num in range(best_k):
    cluster_indices = np.where(labels == cluster_num)[0]
    cluster_embeddings = X_reduced[cluster_indices]
    centroid = kmeans.cluster_centers_[cluster_num].reshape(1, -1)
    # Euclidean distance of each member to its centroid in the reduced space.
    distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    # Map the 100 smallest distances back to global row indices, so only the
    # selected texts are looked up instead of copying the whole cluster.
    nearest_indices = cluster_indices[np.argsort(distances)[:100]]
    closest_texts = [content_texts[i] for i in nearest_indices]
    print(f"Cluster {cluster_num}: {len(closest_texts)} texts selected")
    top_texts.extend(
        {'cluster': cluster_num, 'content': text} for text in closest_texts
    )

df_top_texts = pd.DataFrame(top_texts)
top_texts_file = "top_100_texts_per_cluster.pkl"
# Context manager so the file is flushed and closed deterministically.
with open(top_texts_file, 'wb') as fh:
    pickle.dump(df_top_texts, fh)