# Step 6: Generating Paragraphs for Topic Modeled Tweets

In this section, we'll generate paragraphs for a set of tweets based on topic modeling results. The tweets have been preprocessed, clustered, and labeled manually. The main steps include:

#### 1. Cluster Relatedness Calculation
- Read the codebook and clustered tweets with embeddings.
- Determine the relatedness of each cluster to the event by calculating the fraction of its manually labeled sample tweets that are related; a cluster is treated as related when this fraction exceeds 0.5.
- Create a DataFrame of clusters and their relatedness.

#### 2. Vectorization and Topic Modeling
- Vectorize the text of related tweets using CountVectorizer with n-grams.
- Use a guided topic modeling approach with seed words from the codebook to categorize tweets into topics.

#### 3. Topic Predictions for Test Set
- Read a test set for topic modeling.
- Tokenize and vectorize the test set tweets.
- Predict the top 5 categories for each tweet based on the trained topic model.

#### 4. Embedding and Paragraph Generation
- Embed both the tweets and the predefined sentences with a pretrained transformer model (BERT by default; RoBERTa, SBERT, and SRoBERTa can also be selected).
- Generate paragraphs for each tweet based on the predicted categories.
- Calculate the average cosine similarity between consecutive selected sentences in the generated paragraphs.

#### 5. Output
- Save the original tweets and generated paragraphs to a CSV file for further analysis.

**Note**: The resulting CSV file, 'step_6_paragraphs.csv', contains the original tweets and the generated paragraphs.


In [None]:
import pandas as pd
import torch
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from lda import guidedlda
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer

from utils import enhanced_stop_words, preprocess_tweet, calculate_topic_modeling_score, \
    get_pretrained_model_and_tokenizer
import numpy as np

In [None]:
# Load the manually labeled codebook, the clustered tweets (with embeddings)
# and the manually labeled sample of tweets drawn from each cluster.
codebook = pd.read_csv('../data/intermediate/input/step_5_codebook_manually_labeled.csv')
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by the earlier steps of this pipeline. The context manager
# guarantees the file handle is closed once the DataFrame has been read.
with open('../data/intermediate/input/step_4_clustered_tweets_with_embeddings.pkl', 'rb') as tweets_file:
    tweets_df = pickle.load(tweets_file)
labeled_clusters_df = pd.read_csv('../data/intermediate/input/step_4_cluster_samples_manually_labeled.csv')
# for each cluster sum is_related column and divide by the number of tweets in the cluster
# to get the percentage of tweets that are related to the event
cluster_relatedness_df = labeled_clusters_df.groupby('cluster').agg({'is_related': 'sum', 'id': 'count'}).reset_index()
cluster_relatedness_df['relatedness'] = cluster_relatedness_df['is_related'] / cluster_relatedness_df['id']
# A cluster counts as related when more than half of its labeled sample is related.
cluster_relatedness_df['is_related'] = cluster_relatedness_df['relatedness'] > 0.5
cluster_relatedness_df = cluster_relatedness_df[['cluster', 'is_related']]
# Keep only the tweets that belong to a related cluster.
related_tweets_df = tweets_df[tweets_df['cluster'].isin(cluster_relatedness_df[cluster_relatedness_df['is_related']]['cluster'].tolist())]

In [None]:
# Codebook lookup tables:
#   codebook_dict    -> category name : list of its n-grams
#   category_ids     -> category name : integer id (order of first appearance)
#   category_ids_inv -> integer id    : category name
unique_categories = codebook['category'].unique()
codebook_dict = {
    category: codebook.loc[codebook['category'] == category, 'n_gram'].tolist()
    for category in unique_categories
}
category_ids = {category: position for position, category in enumerate(unique_categories)}
category_ids_inv = {position: category for category, position in category_ids.items()}

# Bag-of-n-grams (uni- to tri-grams) representation of the related tweets.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    min_df=0.001,
    max_df=0.85,
    stop_words=enhanced_stop_words,
)
v_fit = vectorizer.fit_transform(related_tweets_df['processed_text'].tolist())
# Vocabulary term -> column index in the document-term matrix.
word2id = {term: column for column, term in enumerate(vectorizer.get_feature_names_out())}

# Vocabulary column index -> seeded category id. Codebook n-grams that are
# missing from the fitted vocabulary are ignored.
seed_topics = {
    word2id[term]: category_ids[category]
    for category, terms in codebook_dict.items()
    for term in terms
    if term in word2id
}

In [None]:
# Settings for the guided LDA run.
TOPIC_NUMBER = 10
NITER = 25
ALPHA = .3
ETA = .05
CONF = 1
IN_OR_OUT = 1
TOP_N_WORDS = 20
window_sizes = [0, 1, 2, 5]

# Fit the guided topic model on the related tweets, nudged by the seed words.
model = guidedlda.GuidedLDA(n_topics=TOPIC_NUMBER, n_iter=NITER, random_state=0, alpha=ALPHA, eta=ETA)
model.fit(v_fit, seed_topics=seed_topics, seed_confidence=CONF)


def _vectorize_single(text):
    """Return the document-term row for one preprocessed tweet."""
    return vectorizer.transform([text])


def _topic_distribution(doc_vector, topic_model=model):
    """Return the per-topic scores the fitted topic model assigns to one tweet."""
    return topic_model.transform(doc_vector)[0]


def _ranked_category(distribution, rank):
    """Return the label of the topic with the `rank`-th highest score."""
    topic_id = np.argsort(distribution)[-rank]
    # Topic ids without a codebook category fall back to a "temp<k>" label.
    # NOTE(review): the offset 7 presumably equals the number of codebook
    # categories - confirm before changing the codebook.
    return category_ids_inv.get(topic_id, f"temp{topic_id-7}")


# Preprocess and vectorize the test set, then score every tweet against the topics.
test_set_df = pd.read_csv('../data/test_set_for_topic_modeling.csv')
test_set_df['processed_text'] = test_set_df['text'].apply(preprocess_tweet)
test_set_df['vectorized_text'] = test_set_df['processed_text'].apply(_vectorize_single)
test_set_df['topic_predictions'] = test_set_df['vectorized_text'].apply(_topic_distribution)
# Top-5 topics per tweet, stored as cat1_pred ... cat5_pred (best first).
for rank in range(1, 6):
    test_set_df[f'cat{rank}_pred'] = test_set_df['topic_predictions'].apply(_ranked_category, args=(rank,))


In [None]:
predefined_sentences_df = pd.read_csv('../data/predefined_sentences.csv')
strategy = "source_tweet"  # source_tweet or previous_sentence
model_name = 'bert'  # options are bert, roberta, sbert, sroberta
model, tokenizer = get_pretrained_model_and_tokenizer(model_name)
# Prefer Apple MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)


def _embed_text(text):
    """Return a 1-D embedding of `text`: the mean of the model's token vectors."""
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded)['last_hidden_state']
    # For Hugging Face encoders `last_hidden_state` is (batch, tokens, hidden).
    # Pool over the token axis (dim=1) and drop the size-1 batch axis; pooling
    # over dim=0 would only collapse the batch axis and leave a
    # (tokens, hidden) matrix that cannot be compared across texts.
    return hidden_states.mean(dim=1).squeeze(0).cpu().numpy()


def _cosine_similarity(first, second):
    """Return the cosine similarity of two 1-D vectors."""
    return np.dot(first, second) / (np.linalg.norm(first) * np.linalg.norm(second))


predefined_sentences_df['processed_sentence'] = predefined_sentences_df['sentence'].apply(preprocess_tweet)
predefined_sentences_df['embedding'] = predefined_sentences_df['processed_sentence'].apply(_embed_text)

test_set_df['embedding'] = test_set_df['processed_text'].apply(_embed_text)

# Split the candidate sentences by category once instead of re-filtering the
# whole frame for every tweet and every rank.
sentences_by_category = {
    category: group for category, group in predefined_sentences_df.groupby('category')
}

paragraphs = []
paragraph_cos_sim = []
for _, row in test_set_df.iterrows():
    selected_sentences = []
    # for each category get the highest cosine similarity sentences from predefined sentences df with the same category
    for i in range(1, 6):
        cat = row[f'cat{i}_pred']
        candidates = sentences_by_category.get(cat)
        if candidates is None:
            # No predefined sentence exists for this category (e.g. a fallback
            # "temp..." topic); np.argmax on an empty list would raise.
            continue
        if strategy == "previous_sentence" and selected_sentences:
            # Chain the paragraph: compare against the sentence picked last.
            reference = selected_sentences[-1]['embedding']
        else:
            reference = row['embedding']
        cos_sim = [_cosine_similarity(embedding, reference) for embedding in candidates['embedding']]
        selected_sentences.append(candidates.iloc[int(np.argmax(cos_sim))])

    # calculate cosine similarity between the selected sentences
    selected_sentences_cos_sims = [
        _cosine_similarity(current['embedding'], following['embedding'])
        for current, following in zip(selected_sentences, selected_sentences[1:])
    ]

    paragraphs.append(". ".join(sentence['sentence'] for sentence in selected_sentences))
    # Paragraphs with fewer than two sentences have no sentence pair; leaving
    # them out keeps NaN from poisoning the overall average.
    if selected_sentences_cos_sims:
        paragraph_cos_sim.append(np.mean(selected_sentences_cos_sims))

# Average coherence over all paragraphs that have at least one sentence pair.
paragraph_cos_sim = np.mean(paragraph_cos_sim) if paragraph_cos_sim else float('nan')

test_set_df['paragraph'] = paragraphs

test_set_df[['text', 'paragraph']].to_csv('../data/intermediate/output/step_6_paragraphs.csv', index=False)