# Step4: Analyzing Twitter Data for Event-Related Clusters

In this notebook, we will analyze a dataset of tweets to identify event-related clusters. The primary steps include loading the data, preprocessing the tweets, determining the relatedness of clusters, extracting relevant n-grams, and predicting the relatedness of a test set using BERT embeddings.

### Process Overview
- Load the pre-clustered tweets and manually labeled cluster samples.
- Calculate the percentage of related tweets in each cluster.
- Generate a summary table for related and non-related clusters, including cluster size, text length, and word count statistics.
- Extract n-grams from related clusters using CountVectorizer to build the codebook.
- Save n-grams information to a CSV file and create a codebook for manual labeling.
- Use BERT embeddings to predict the relatedness of a test set and evaluate the model performance.

**Note**: Make sure to manually label the generated codebook and n-grams CSV file before moving on to further steps.

In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import swifter
import torch
from IPython.display import display, Markdown
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertModel

from source.lm_classifier.main import pipeline
from source.lm_classifier.main import predict
from source.utils import enhanced_stop_words
from utils import preprocess_tweet, get_pretrained_model_and_tokenizer

In [None]:
# Load the pre-clustered tweets (pickled DataFrame) and the manually labeled cluster samples.
# NOTE(review): pickle.load can execute arbitrary code while deserializing; only load
# pickle files that were produced by this project's own earlier steps.
with open('../data/intermediate/input/step_4_clustered_tweets_with_embeddings.pkl', 'rb') as tweets_file:
    # the context manager closes the file handle even if unpickling fails
    tweets_df = pickle.load(tweets_file)
labeled_clusters_df = pd.read_csv('../data/intermediate/input/step_4_cluster_samples_manually_labeled.csv')

In [None]:
# Per cluster: share of labeled sample tweets marked as related
# (sum of is_related divided by the number of sampled tweet ids).
per_cluster = labeled_clusters_df.groupby('cluster').agg({'is_related': 'sum', 'id': 'count'})
related_share = per_cluster['is_related'] / per_cluster['id']
# A cluster counts as related only when strictly more than half of its samples are related.
cluster_relatedness_df = (related_share > 0.5).rename('is_related').reset_index()

In [None]:
# Summarize related vs. non-related clusters:
# number of clusters, number of tweets, min/max/avg cluster size,
# min/max/avg text length and min/max/avg word count.

# Vectorized string accessors replace the per-row lambdas and tolerate missing text (NaN stays NaN).
tweets_df['text_length'] = tweets_df['text'].str.len()
# Split on any whitespace run: splitting on a single ' ' counted empty strings produced by
# repeated spaces, treated newline-separated words as one word, and reported 1 word for ''.
tweets_df['word_count'] = tweets_df['text'].str.split().str.len()
tweets_df['cluster_size'] = tweets_df['cluster'].map(tweets_df['cluster'].value_counts())

# Left merge keeps every tweet; tweets whose cluster has no manual label get a missing
# is_related and are therefore left out of the grouped summary below.
summary_df = tweets_df.merge(cluster_relatedness_df, on='cluster', how='left')
summary_df = (
    summary_df
    .groupby('is_related')
    .agg({
        'cluster': 'nunique',
        'id': 'count',
        'cluster_size': ['min', 'max', 'mean'],
        'text_length': ['min', 'max', 'mean'],
        'word_count': ['min', 'max', 'mean'],
    })
    .reset_index()
)

# render the transposed summary as a markdown table
display(Markdown(summary_df.T.to_markdown()))

In [None]:
# keep only tweets that belong to clusters judged as related
related_cluster_ids = cluster_relatedness_df.loc[cluster_relatedness_df['is_related'], 'cluster']
related_tweets_df = tweets_df[tweets_df['cluster'].isin(related_cluster_ids)]

# Extract 1- to 3-grams and keep those with a document frequency of at least 0.001 (0.1% of
# the tweets) and at most 0.85 (85% of the tweets) to drop very rare and very common n-grams.
vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=0.001, max_df=0.85, stop_words=enhanced_stop_words)
v_fit = vectorizer.fit_transform(related_tweets_df['processed_text'].tolist())

# Document frequency = number of tweets that contain the n-gram at least once.
# Summing the raw count matrix would instead give total occurrences, and densifying it with
# toarray() allocates a full (tweets x n-grams) array; counting non-zero entries stays sparse.
document_frequency = np.asarray((v_fit > 0).sum(axis=0)).ravel()
n_grams_df = pd.DataFrame({
    'n_gram': vectorizer.get_feature_names_out(),
    'document_frequency': document_frequency,
}).sort_values('document_frequency', ascending=False)
n_grams_df.to_csv('../data/intermediate/output/step_4_n_grams.csv', index=False)

# Empty columns to be filled in by hand when labeling the codebook.
n_grams_df['synonym'] = None
n_grams_df['category'] = None

# The codebook is edited manually after it is generated, so never overwrite an existing file:
# re-running this cell would otherwise silently wipe the manual labels.
codebook_path = Path('../data/intermediate/input/step_5_codebook_manually_labeled.csv')
if codebook_path.exists():
    print(f"'{codebook_path}' already exists; leaving it untouched. Delete it to regenerate an empty codebook.")
else:
    n_grams_df[['n_gram', 'synonym', 'category']].to_csv(codebook_path, index=False)


In [None]:
# Pretrained encoder used to embed the test set; options are bert, roberta, sbert, sroberta
model_name = 'sbert'
model, tokenizer = get_pretrained_model_and_tokenizer(model_name)

# Pick the compute device in order of preference: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the labeled test set and preprocess every tweet before embedding it.
test_set = pd.read_csv('../data/test_set_for_relatedness.csv')
test_set['processed_text'] = test_set['text'].swifter.apply(preprocess_tweet)
processed_texts = test_set['processed_text'].tolist()

# Tokenize the whole test set as one padded batch and move it to the selected device.
inputs = tokenizer(processed_texts, return_tensors="pt", padding=True, truncation=True).to(device)
model = model.to(device)

# Inference only: eval() disables dropout so embeddings are deterministic, and no_grad()
# avoids building an autograd graph for the whole batch (which only costs memory here).
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Sentence embedding = mean of the token vectors of the last hidden layer.
# NOTE(review): the mean runs over every position, including padding added by padding=True,
# so an embedding depends on the longest tweet in the batch. Masking with
# inputs['attention_mask'] would fix that, but only switch if the cluster embeddings in
# tweets_df were pooled the same way -- confirm against the step that produced them.
test_set['embedding'] = outputs['last_hidden_state'].to('cpu').mean(dim=1).detach().numpy().tolist()


In [None]:
# Cluster center = element-wise mean of the embeddings of the tweets in the cluster (kept as a dict).
tweets_df['embedding'] = tweets_df['embedding'].apply(lambda x: np.array(x))
cluster_centers = tweets_df.groupby('cluster').agg({'embedding': 'mean'}).to_dict()['embedding']
closest_cluster_count = 7
thresholds = [4./closest_cluster_count, 5./closest_cluster_count]


def _l2_normalize(matrix):
    """Scale every row of a 2-D array to unit length; all-zero rows are returned unchanged."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0, 1.0, norms)


# Rank clusters by cosine similarity between the tweet embedding and the cluster centers.
# Both sides are L2-normalized first: a raw dot product is not cosine similarity and would
# favor centers with a large norm regardless of direction.
cluster_ids = list(cluster_centers.keys())
center_matrix = _l2_normalize(np.vstack([cluster_centers[cluster_id] for cluster_id in cluster_ids]).astype(float))
test_matrix = _l2_normalize(np.array(test_set['embedding'].tolist(), dtype=float))
similarities = test_matrix @ center_matrix.T  # shape: (test tweets, clusters)
# stable sort keeps the original cluster order among equally similar centers
nearest = np.argsort(-similarities, axis=1, kind='stable')[:, :closest_cluster_count]
test_set['cluster'] = [[cluster_ids[i] for i in row] for row in nearest]

# Share of related clusters among the closest ones. This does not depend on the threshold,
# so it is computed once instead of once per threshold. Clusters without a manual label are
# absent from cluster_relatedness_df and are ignored; if none is labeled the share is NaN,
# which compares False against any threshold (i.e. predicted as not related).
related_share = test_set['cluster'].apply(
    lambda ids: cluster_relatedness_df.loc[cluster_relatedness_df['cluster'].isin(ids), 'is_related'].mean()
)

for threshold in thresholds:
    test_set['is_related_prediction'] = (related_share >= threshold).astype(int)

    # precision, recall, accuracy and f1 from the label and is_related_prediction columns
    is_positive = test_set['label'] == 1
    is_predicted_positive = test_set['is_related_prediction'] == 1
    true_positives = int((is_positive & is_predicted_positive).sum())
    predicted_positives = int(is_predicted_positive.sum())
    actual_positives = int(is_positive.sum())
    correct = int((test_set['label'] == test_set['is_related_prediction']).sum())

    # Guard every ratio: a strict threshold can yield zero predicted positives, which
    # previously aborted the loop with ZeroDivisionError.
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    accuracy = correct / test_set.shape[0] if test_set.shape[0] else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    print(f'Closest cluster count: {closest_cluster_count}, Threshold: {threshold}, Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}, F1: {f1}')

In [None]:
# Build the training set: every clustered tweet labeled 1 when its cluster is related, else 0.
related_cluster_ids = cluster_relatedness_df.loc[cluster_relatedness_df['is_related'], 'cluster']
# .copy() detaches the slice from tweets_df so adding a column neither raises
# SettingWithCopyWarning nor risks writing into a view of the original frame.
dataset = tweets_df[['id', 'text', 'cluster']].copy()
dataset['label'] = dataset['cluster'].isin(related_cluster_ids).astype(int)

# Shuffle with a fixed seed so that Restart & Run All reproduces the same row order.
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
dataset = dataset[['id', 'text', 'label']]
# lm-based classifier is implemented on top of following repo: https://github.com/raufer/text-classification-pytorch

modelname = 'vanillabert' # options are vanillabert, sbert, roberta, sroberta
config = {
    'num-epochs-pretrain': 5,
    'num-epochs-train': 5,
    'learning-rate-pretrain': 9e-4,
    'learning-rate-train': 2e-5,
    'dropout-ratio': 0.4,
    'threshold': 0.5, # 0.5 or 0.95
}

# NOTE(review): `model` rebinds the name that held the embedding model in an earlier cell,
# so re-running the embedding cell after this one would use the classifier instead.
# NOTE(review): output_dir is relative to the working directory ('data/outputs'), unlike the
# '../data/...' paths used by the other cells -- confirm this is the intended location.
model, y_true, y_pred, output_path, train_dataset, val_dataset, test_dataset = pipeline(
    datapath=dataset,
    modelname=modelname,
    output_dir='data/outputs',
    config=config
)

score = f1_score(y_true, y_pred, average='weighted')
print(f"weighted f1-score '{score}'")


In [None]:
# Re-read the raw test set so it carries none of the columns added by earlier cells,
# then score it with the fine-tuned classifier stored at output_path.
test_set = pd.read_csv('../data/test_set_for_relatedness.csv')
predict(
    modelname,
    test_set,
    output_path,
    threshold=config['threshold'],
)