# Trial 2

In [5]:
import re
from datetime import datetime

import nltk
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer



In [6]:
# Read the cleaned corpus and pull out the two columns the topic model needs
# as plain Python lists (documents and their year labels).
df = pd.read_csv('../../data/interim/cleaned.csv')

# NOTE(review): the original cell carried a "# Filter" comment, but no rows are
# filtered here -- every row of the CSV is passed on.
texts = df['text'].to_list()
timestamps = df['year'].to_list()



In [None]:
# Define and fit the topic model.
# All names used below (CountVectorizer, ClassTfidfTransformer, BERTopic,
# KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance) are imported in the
# import cell at the top of the notebook instead of being re-imported here.

# Topic representations: one main model plus two named "aspects".
main_representation_model = KeyBERTInspired()
aspect_representation_model1 = PartOfSpeech("en_core_web_lg")
# A list of representation models (KeyBERT-style candidates followed by MMR).
aspect_representation_model2 = [
    KeyBERTInspired(top_n_words=30),
    MaximalMarginalRelevance(diversity=.2),
]

# The "Aspect1"/"Aspect2" keys are the column names later requested from
# `get_topic_info()` when summarising topics.
representation_model = {
    "Main": main_representation_model,
    "Aspect1": aspect_representation_model1,
    "Aspect2": aspect_representation_model2,
}

vectorizer_model = CountVectorizer(min_df=5)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

topic_model = BERTopic(
    nr_topics='auto',
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# Fit on the full corpus; `topics` holds one topic id per document.
topics, ini_probs = topic_model.fit_transform(texts)

# Dynamic topic modeling: topic representations per timestamp.
# NOTE: a later cell recomputes `topics_over_time` with nr_bins=20 and
# overwrites this result.
topics_over_time = topic_model.topics_over_time(texts, timestamps)


In [18]:
def get_topic_stats(topic_model, extra_cols=()):
    """Summarise topic sizes, largest topic first.

    Parameters
    ----------
    topic_model
        Any object whose ``get_topic_info()`` returns a DataFrame containing at
        least the columns 'Topic', 'Count', 'Name' and 'Representation'.
    extra_cols : str or iterable of str, optional
        Additional columns of the topic-info table to append to the result
        (e.g. 'Aspect1'). A single column name may be given as a bare string.

    Returns
    -------
    pandas.DataFrame
        Columns 'Topic', 'Count', 'Share', 'CumulativeShare', 'Name',
        'Representation' followed by ``extra_cols``. 'Share' is each row's
        percentage of the summed 'Count' over every row returned by
        ``get_topic_info()``; 'CumulativeShare' is the running total of that
        percentage in descending-count order.

    Raises
    ------
    KeyError
        If a required or requested column is missing from the topic-info table.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(extra_cols, str):
        extra_cols = [extra_cols]

    topic_info = topic_model.get_topic_info().sort_values('Count', ascending=False)
    total_count = topic_info['Count'].sum()
    topic_info['Share'] = 100. * topic_info['Count'] / total_count
    topic_info['CumulativeShare'] = 100. * topic_info['Count'].cumsum() / total_count

    base_cols = ['Topic', 'Count', 'Share', 'CumulativeShare',
                 'Name', 'Representation']
    # list() accepts tuples, lists and other iterables alike.
    return topic_info[base_cols + list(extra_cols)]

# Show the ten largest topics together with both aspect representations,
# using the topic id as the row index.
(
    get_topic_stats(topic_model, ['Aspect1', 'Aspect2'])
    .head(10)
    .set_index('Topic')
)

Unnamed: 0_level_0,Count,Share,CumulativeShare,Name,Representation,Aspect1,Aspect2
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,6803,64.373581,64.373581,0_nations_republic_general_africa,"[nations, republic, general, africa, national,...","[nations, united, international, will, world, ...","[nations, republic, general, africa, organizat..."
-1,2960,28.009084,92.382665,-1_nations_organization_cooperation_general,"[nations, organization, cooperation, general, ...","[united, nations, international, will, world, ...","[nations, organization, cooperation, efforts, ..."
1,96,0.908403,93.291067,1_palestinians_palestinian_palestine_israeli,"[palestinians, palestinian, palestine, israeli...","[peace, people, will, international, state, wo...","[palestinians, palestinian, palestine, israeli..."
2,72,0.681302,93.972369,2_terrorism_terrorists_terrorist_afghanistan,"[terrorism, terrorists, terrorist, afghanistan...","[terrorism, united, nations, international, wi...","[terrorism, terrorists, terrorist, afghanistan..."
3,60,0.567752,94.540121,3_mongolian_mongolia_turkmenistan_republic,"[mongolian, mongolia, turkmenistan, republic, ...","[united, international, nations, security, peo...","[mongolian, mongolia, turkmenistan, nations, k..."
4,46,0.435276,94.975397,4_nations_disarmament_canadians_united,"[nations, disarmament, canadians, united, effo...","[to, nations, united, will, more, security, wo...","[nations, disarmament, canadians, united, effo..."
5,33,0.312263,95.287661,5_cuba_cuban_rico_imperialism,"[cuba, cuban, rico, imperialism, nations, poli...","[united, government, people, peoples, will, co...","[cuba, cuban, rico, imperialism, nations, poli..."
6,32,0.302801,95.590462,6_ukraine_ukrainian_humanitarian_russia,"[ukraine, ukrainian, humanitarian, russia, nat...","[war, security, global, will, nations, interna...","[ukraine, humanitarian, nations, war, russias,..."
7,30,0.283876,95.874338,7_philippines_nations_asean_manila,"[philippines, nations, asean, manila, presiden...","[nations, for, united, world, will, developmen...","[philippines, nations, asean, manila, agenda, ..."
8,30,0.283876,96.158213,8_ireland_irish_nations_peacekeeping,"[ireland, irish, nations, peacekeeping, united...","[nations, will, nuclear, conflict, more, peace...","[ireland, irish, nations, peacekeeping, united..."


In [None]:
# Keep the full topic-info table in `freq` and preview its first ten rows.
freq = topic_model.get_topic_info()
freq.head(10)

In [None]:
# Inspect the representation stored for topic id 10.
topic_model.get_topic(topic=10)

In [21]:
# Recompute the topics-over-time table, this time grouping the timestamps
# into 20 bins and enabling both tuning options.
topics_over_time = topic_model.topics_over_time(
    docs=texts,
    timestamps=timestamps,
    nr_bins=20,
    global_tuning=True,
    evolution_tuning=True,
)

In [None]:
# Bar-chart view of the topics: 16 topics, 10 words each.
topic_model.visualize_barchart(
    n_words=10,
    top_n_topics=16,
)

In [None]:
# This shows which topics are related to one another.
topic_model.visualize_heatmap(
    n_clusters=20,
)

# Topics per class 

In [None]:
# Class-based topic modeling: the classes talk about the same topics, but the
# way they talk about them differs.
# NOTE(review): `classes` is not defined in any cell visible in this notebook;
# presumably it is one label per document, aligned with `texts` -- confirm and
# define it before running this cell.
class_topics = topic_model.topics_per_class(docs=texts, classes=classes)

In [None]:
# Plot the per-class topic frequencies. The plotting method belongs to the
# fitted model; `class_topics` is only the table returned by
# `topics_per_class`, so it is passed in as the argument rather than used as
# the receiver.
topic_model.visualize_topics_per_class(
    class_topics,
    top_n_topics=10,
    normalize_frequency=True,
)