# Topic modelling using BERTopic

## Libraries/data required

In [1]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Read the data and perform preprocessing

# Location of the article-summary CSV. The default is the original author's
# machine-specific path; set the ARTICLES_CSV environment variable to run the
# notebook on another machine without editing this cell.
ARTICLES_CSV = os.environ.get("ARTICLES_CSV") or "/Users/abdalrhman/Documents/DC/data/articles_summary_cleaned.csv"
# Alternative input used by another contributor:
# "/Users/xuyou/Desktop/JBG060-DC3-23-24-Group24/geodata/results/articles_with_counties_regions.csv"

df = pd.read_csv(ARTICLES_CSV, parse_dates=["date"]) # Read data into 'df' dataframe
print(df.shape) # Print dataframe shape

# NLTK resources; 'stop_words' is only referenced by the disabled preprocessing sketch below
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

# Disabled token-level preprocessing (lower-casing, keeping alphabetic tokens, dropping stop words):
#def preprocess_text(text):
#    tokens = word_tokenize(text)
#    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
#    return tokens

#df['clean_summary'] = df['summary'].apply(preprocess_text).replace(",", "").replace("[", "").replace("]", "")
docs = df["summary"].tolist() # Create a list containing all article summaries

df.head() # Show first 5 dataframe entries

(11780, 7)


[nltk_data] Downloading package stopwords to /Users/xuyou/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/xuyou/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,summary,date,location_article,lat,lng,county,region
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125,Juba,Central Equatoria
1,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125,Juba,Central Equatoria
2,The article discusses the Elders' letter of co...,2011-07-07,Juba,4.859363,31.57125,Juba,Central Equatoria
3,The article discusses a mock parade conducted ...,2011-07-01,Juba,4.859363,31.57125,Juba,Central Equatoria
4,The article discusses the South Sudan governme...,2011-07-03,Juba,4.859363,31.57125,Juba,Central Equatoria


In [21]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# It keeps two disabled experiments: restricting the articles to those whose
# 'location_article' is Juba, and splitting the DataFrame into one group per
# publication year.

#juba_data = df[df['location_article'] == 'Juba']
#juba_docs = juba_data["summary"].tolist()
#juba_data.head()

#df['date'] = pd.to_datetime(df['date'])

# Group the data by year
#yearly_data = {}

#for year, group in df.groupby(df['date'].dt.year):
#    print(year)
#    yearly_data[year] = group

# Print the first few rows of data for each year
#for year, data in yearly_data.items():
#    print(f"Year {year}:")
#    print(data)
#    print("\n")



## Fitting BERTopic

This might take a while on a CPU. In the background, a pre-trained language model, called the sentence embedder, converts each article summary into a vector in a semantic vector space. We then reduce the dimensionality of these vectors and perform clustering in the resulting space.

In [3]:
# Directory/file name under which the fitted model is cached between runs
model_path = 'southsudan_model_new'

if not os.path.exists(model_path):
    # No cached model yet: initialize BERTopic, fit it on the article summaries, and cache it
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True)

    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    bertopic.save(model_path) # Save the trained model as "southsudan_model_new"
else:
    # Reuse the previously saved model instead of refitting
    bertopic = BERTopic.load(model_path)

Batches:   0%|          | 0/369 [00:00<?, ?it/s]

2023-10-08 19:11:18,941 - BERTopic - Transformed documents to Embeddings
2023-10-08 19:11:28,153 - BERTopic - Reduced dimensionality
2023-10-08 19:11:38,784 - BERTopic - Clustered reduced embeddings


In [23]:
#Due to the modularity of the model, there is a lot of randomness that hinders reproducibility of the model.
#To fight this, you can for example set the random state in the dimensionality reduction step via the following lines,
#or explore a different approach.

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

In [4]:
# Reload the saved model from disk
bertopic = BERTopic.load("southsudan_model_new")

# Define the topic_id of the topic you want to inspect
topic_id = -1  # Replace with the desired topic_id (-1 is BERTopic's outlier topic)

# get_topic returns the topic's top words as (word, score) pairs, not a centroid vector
topic_words = bertopic.get_topic(topic_id)
centroid = topic_words  # alias kept for backward compatibility with the old, misleading name

print(f"Top words for Topic {topic_id}:")
print(topic_words)

Centroid for Topic -1:
[('the', 0.005088037502521287), ('and', 0.004949737321922009), ('of', 0.004869391782556451), ('to', 0.004807628697517097), ('in', 0.004724811576508257), ('sudan', 0.004590262882544427), ('south', 0.00453458233629906), ('article', 0.004435010722540989), ('discusses', 0.004202700507521112), ('for', 0.004201320288934422)]


In [5]:
# Use each article's publication date as the timestamp of its summary
timestamps = df["date"].tolist()

# Compute the topic representations over time for the fitted model
topics_over_time = bertopic.topics_over_time(docs, timestamps)

# Plot the 10 most frequent topics over time with normalized frequencies
bertopic.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
    normalize_frequency=True,
)

2857it [00:26, 108.77it/s]


## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [6]:
bertopic.visualize_documents(docs) # Create an interactive plot of the documents and their topics; this may take a while

In [7]:
# Show bar charts of the top word scores per topic, using BERTopic's default settings
bertopic.visualize_barchart()

### Creating smaller topics

Within our list of topics, we find the topics that are semantically closest to a set of keywords for each of the following themes:

"Hunger", "Refugees", "Humanitarian", "Conflict", and "Vulnerability".

**Feel free to change this approach!**

In [8]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve a list of the top n number of relevant topics to the provided (list of) keyword(s)


    Parameters:
        bertopic_model: a (fitted) BERTopic model object; only its
                    find_topics(keyword, top_n=...) method is used

        keywords:   a string containing one or multiple keywords to match against,

                    This can also be a list in the form of ['keyword(s)', keyword(s), ...]

                    In this case a maximum of top_n topics will be found per list element
                    and subsetted to the top_n most relevant topics.

                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s)
                    and not the relevancy to the combined list of keywords.

                    In other words, topics that appear in the output might be significantly related to a
                    particular element in the list of keywords but not so to any other element,

                    while topics that do not appear in the output might be significantly related to the
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!

        top_n: an integer indicating the number of desired relevant topics to be retrieved


        Return: a list of at most top_n (topic_id, relevancy) tuples, sorted on descending
                relevancy; a topic matched by several keywords appears once, with its
                greatest relevancy
    '''

    if isinstance(keywords, str): keywords = [keywords] # If a single string is provided convert it to list type

    best_relevancy = dict() # Maps topic_id -> greatest relevancy seen over all keywords

    for keyword in keywords: # Iterate through list of keywords

        # Find the top n number of topics related to the current keyword(s)
        topic_ids, relevancies = bertopic_model.find_topics(keyword, top_n=top_n)

        # Keep only the greatest relevancy per topic in case of duplicate topics
        for topic_id, relevancy in zip(topic_ids, relevancies):
            if topic_id not in best_relevancy or relevancy > best_relevancy[topic_id]:
                best_relevancy[topic_id] = relevancy

    # Sort the unique topics on DESCENDING ORDER of relevancy
    ranked_topics = sorted(best_relevancy.items(), key=lambda x: x[1], reverse=True)

    # Honour the requested top_n (previously the result was always cut at 10 entries)
    return ranked_topics[:top_n]

In [9]:
# Select the 10 topics most relevant to the hunger-related keywords
hunger_keywords = ['hunger', 'food insecurity', 'harvests']
relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=hunger_keywords, top_n=10)

# Split the topic IDs off from the (topic_id, relevancy) pairs
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per selected topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Flag each article whose assigned topic is among the selected topics
selected_topics = set(topic_ids)
df["hunger"] = [assigned in selected_topics for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the selected topics
topic_info = bertopic.get_topic_info().set_index('Topic')
topic_info.loc[topic_ids]

3 0.47876966
96 0.4418324
95 0.43022704
184 0.42918918
22 0.36800718
143 0.3477329
74 0.31587836
5 0.29336277
144 0.28190637
165 0.26834887


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,168,3_food_million_fao_famine,"[food, million, fao, famine, hunger, insecurit...",[The article discusses the efforts of FAO to s...
96,26,96_malnutrition_children_unicef_nutrition,"[malnutrition, children, unicef, nutrition, br...",[The article discusses the severe acute malnut...
95,26,95_wfp_food_million_world,"[wfp, food, million, world, assistance, contri...",[The article discusses the U.S. government's a...
184,11,184_wfp_airdrops_food_maban,"[wfp, airdrops, food, maban, drops, yida, air,...",[The article discusses the United Nations Worl...
22,72,22_agriculture_agricultural_farmers_food,"[agriculture, agricultural, farmers, food, far...",[The article discusses the need for cooperatio...
143,16,143_refugees_maban_refugee_food,"[refugees, maban, refugee, food, unhcr, camps,...",[The article discusses the UNHCR and WFP calli...
74,34,74_prices_price_inflation_beverages,"[prices, price, inflation, beverages, goods, t...",[The article discusses a decrease in inflation...
5,157,5_million_humanitarian_aid_assistance,"[million, humanitarian, aid, assistance, fundi...",[The article discusses the commitment of the U...
144,16,144_livestock_animal_cattle_veterinary,"[livestock, animal, cattle, veterinary, diseas...",[The article discusses the efforts of the Repu...
165,14,165_riek_machar_salva_president,"[riek, machar, salva, president, ssdn, situati...",[The article discusses the peace deal between ...


In [10]:
# Select the 10 topics most relevant to the hunger-related keywords, now including 'food'.
# This assigns df["hunger"] again, replacing any values already stored in that column.
hunger_keywords = ['hunger', 'food insecurity', 'harvests', 'food']
relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=hunger_keywords, top_n=10)

# Split the topic IDs off from the (topic_id, relevancy) pairs
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per selected topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Flag each article whose assigned topic is among the selected topics
selected_topics = set(topic_ids)
df["hunger"] = [assigned in selected_topics for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the selected topics
topic_info = bertopic.get_topic_info().set_index('Topic')
topic_info.loc[topic_ids]

3 0.47876966
96 0.4418324
95 0.43022704
184 0.42918918
22 0.36800718
143 0.3477329
74 0.32729772
5 0.29336277
144 0.28190637
165 0.26834887


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,168,3_food_million_fao_famine,"[food, million, fao, famine, hunger, insecurit...",[The article discusses the efforts of FAO to s...
96,26,96_malnutrition_children_unicef_nutrition,"[malnutrition, children, unicef, nutrition, br...",[The article discusses the severe acute malnut...
95,26,95_wfp_food_million_world,"[wfp, food, million, world, assistance, contri...",[The article discusses the U.S. government's a...
184,11,184_wfp_airdrops_food_maban,"[wfp, airdrops, food, maban, drops, yida, air,...",[The article discusses the United Nations Worl...
22,72,22_agriculture_agricultural_farmers_food,"[agriculture, agricultural, farmers, food, far...",[The article discusses the need for cooperatio...
143,16,143_refugees_maban_refugee_food,"[refugees, maban, refugee, food, unhcr, camps,...",[The article discusses the UNHCR and WFP calli...
74,34,74_prices_price_inflation_beverages,"[prices, price, inflation, beverages, goods, t...",[The article discusses a decrease in inflation...
5,157,5_million_humanitarian_aid_assistance,"[million, humanitarian, aid, assistance, fundi...",[The article discusses the commitment of the U...
144,16,144_livestock_animal_cattle_veterinary,"[livestock, animal, cattle, veterinary, diseas...",[The article discusses the efforts of the Repu...
165,14,165_riek_machar_salva_president,"[riek, machar, salva, president, ssdn, situati...",[The article discusses the peace deal between ...


In [11]:
# Select the 10 topics most relevant to the refugee/displacement keywords
refugee_keywords = ['refugees', 'displaced', 'refugee']
relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=refugee_keywords, top_n=10)

# Split the topic IDs off from the (topic_id, relevancy) pairs
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per selected topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Flag each article whose assigned topic is among the selected topics
selected_topics = set(topic_ids)
df["refugees"] = [assigned in selected_topics for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the selected topics
topic_info = bertopic.get_topic_info().set_index('Topic')
topic_info.loc[topic_ids]

7 0.6575956
17 0.58660877
143 0.5830115
141 0.5629349
183 0.5345123
56 0.5205677
168 0.5083705
147 0.50255495
5 0.49242938
46 0.48294407


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,152,7_refugees_unhcr_refugee_nile,"[refugees, unhcr, refugee, nile, yida, camp, c...",[The article discusses the shortage of humanit...
17,83,17_displaced_idps_people_un,"[displaced, idps, people, un, internally, cond...",[The article discusses the high number of inte...
143,16,143_refugees_maban_refugee_food,"[refugees, maban, refugee, food, unhcr, camps,...",[The article discusses the UNHCR and WFP calli...
141,16,141_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, refuge, displac...",[The article discusses new fighting in South S...
183,11,183_bentiu_base_dire_water,"[bentiu, base, dire, water, flooding, diseases...",[The article discusses the horrific living con...
56,39,56_returnees_repatriation_renk_idps,"[returnees, repatriation, renk, idps, iom, due...","[The article discusses the arrival of over 3,0..."
168,13,168_malakal_site_poc_shilluk,"[malakal, site, poc, shilluk, base, msf, prote...",[The article discusses the fighting that erupt...
147,15,147_kenyans_evacuation_kenyan_nigerians,"[kenyans, evacuation, kenyan, nigerians, niger...",[The article discusses the evacuation of Kenya...
5,157,5_million_humanitarian_aid_assistance,"[million, humanitarian, aid, assistance, fundi...",[The article discusses the commitment of the U...
46,46,46_pibor_humanitarian_jonglei_aid,"[pibor, humanitarian, jonglei, aid, affected, ...",[The article discusses the ongoing humanitaria...


In [12]:
# Select the 10 topics most relevant to the humanitarian-related keywords
humanitarian_keywords = ['humanitarian', 'health', 'aid', 'rights','development', 'education']
relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=humanitarian_keywords, top_n=10)

# Split the topic IDs off from the (topic_id, relevancy) pairs
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per selected topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Flag each article whose assigned topic is among the selected topics
selected_topics = set(topic_ids)
df["humanitarian"] = [assigned in selected_topics for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the selected topics
topic_info = bertopic.get_topic_info().set_index('Topic')
topic_info.loc[topic_ids]

5 0.6439732
46 0.61934125
34 0.6069746
133 0.60627955
17 0.60167134
183 0.5979451
92 0.5876294
141 0.58454514
180 0.57476205
7 0.5734812


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,157,5_million_humanitarian_aid_assistance,"[million, humanitarian, aid, assistance, fundi...",[The article discusses the commitment of the U...
46,46,46_pibor_humanitarian_jonglei_aid,"[pibor, humanitarian, jonglei, aid, affected, ...",[The article discusses the ongoing humanitaria...
34,56,34_workers_aid_humanitarian_killing,"[workers, aid, humanitarian, killing, worker, ...",[The article discusses the killing of at least...
133,19,133_aid_workers_ngos_humanitarian,"[aid, workers, ngos, humanitarian, fees, permi...",[The article discusses the passing of a bill i...
17,83,17_displaced_idps_people_un,"[displaced, idps, people, un, internally, cond...",[The article discusses the high number of inte...
183,11,183_bentiu_base_dire_water,"[bentiu, base, dire, water, flooding, diseases...",[The article discusses the horrific living con...
92,27,92_red_cross_icrc_crescent,"[red, cross, icrc, crescent, medical, patients...",[The article discusses the humanitarian work o...
141,16,141_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, refuge, displac...",[The article discusses new fighting in South S...
180,11,180_malakal_unmiss_civilians_un,"[malakal, unmiss, civilians, un, internally, s...",[The article discusses violent clashes between...
7,152,7_refugees_unhcr_refugee_nile,"[refugees, unhcr, refugee, nile, yida, camp, c...",[The article discusses the shortage of humanit...


In [13]:
# Select the 10 topics most relevant to the conflict-related keywords
conflict_keywords = ['conflict', 'fighting', 'murder', 'war', 'violence', 'death', 'attacks']
relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=conflict_keywords, top_n=10)

# Split the topic IDs off from the (topic_id, relevancy) pairs
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per selected topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Flag each article whose assigned topic is among the selected topics
selected_topics = set(topic_ids)
df["conflict"] = [assigned in selected_topics for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the selected topics
topic_info = bertopic.get_topic_info().set_index('Topic')
topic_info.loc[topic_ids]



181 0.43535787
29 0.43058836
86 0.42488897
35 0.41107562
82 0.403863
130 0.38709348
14 0.38104296
163 0.37946022
68 0.379444
10 0.37843966


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
181,11,181_positions_clashes_violating_opposition,"[positions, clashes, violating, opposition, ce...",[The article discusses accusations from the So...
29,64,29_sexual_women_violence_girls,"[sexual, women, violence, girls, rape, genderb...",[The article discusses the persistent and ende...
86,29,86_jonglei_conflicts_tribes_state,"[jonglei, conflicts, tribes, state, peace, dan...",[The article discusses a political reconciliat...
35,55,35_border_heglig_between_disputed,"[border, heglig, between, disputed, sudan, bot...",[The article discusses accusations by South Su...
82,29,82_gunfire_heavy_presidential_fighting,"[gunfire, heavy, presidential, fighting, juba,...",[The article discusses heavy gunfire at Juba i...
130,19,130_ceasefire_agreement_cessation_signing,"[ceasefire, agreement, cessation, signing, hos...",[The article discusses how South Sudan's gover...
14,103,14_peace_peacebuilding_society_civil,"[peace, peacebuilding, society, civil, agreeme...",[The article discusses the importance of civil...
163,14,163_conflict_nuer_displaced_tribe,"[conflict, nuer, displaced, tribe, dinka, riek...",[The article discusses the ongoing conflict in...
68,35,68_bentiu_killings_civilians_killed,"[bentiu, killings, civilians, killed, nuer, et...",[The article discusses the United Nations Miss...
10,134,10_igad_intergovernmental_authority_talks,"[igad, intergovernmental, authority, talks, pe...",[The article discusses South Sudanese Presiden...


In [14]:
# Select the 10 topics most relevant to the keywords used for the 'vulnerability' flag
vulnerability_keywords = ['oil', 'rebel_army', 'violence', 'reconciliation_peace', 'forces', 'security' , 'surveillance', 'ebola']
relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=vulnerability_keywords, top_n=10)

# Split the topic IDs off from the (topic_id, relevancy) pairs
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per selected topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Flag each article whose assigned topic is among the selected topics
selected_topics = set(topic_ids)
df["vulnerability"] = [assigned in selected_topics for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the selected topics
topic_info = bertopic.get_topic_info().set_index('Topic')
topic_info.loc[topic_ids]



117 0.6414301
88 0.58671093
174 0.5035976
132 0.4933989
44 0.48809347
14 0.48462236
72 0.47128707
114 0.46779856
151 0.46724653
51 0.45608723


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
117,22,117_ebola_idsr_health_surveillance,"[ebola, idsr, health, surveillance, disease, l...",[The article discusses the Ministry of Health ...
88,27,88_reconciliation_healing_process_committee,"[reconciliation, healing, process, committee, ...",[The article discusses a 28-day reconciliation...
174,12,174_ssla_rebel_group_movementarmy,"[ssla, rebel, group, movementarmy, amnesty, ss...",[The article discusses the peace declaration m...
132,19,132_lra_kony_lords_resistance,"[lra, kony, lords, resistance, drc, task, afri...",[The article discusses a joint mission by Afri...
44,50,44_oil_production_petroleum_barrels,"[oil, production, petroleum, barrels, energy, ...",[The article discusses South Sudan's plans to ...
14,103,14_peace_peacebuilding_society_civil,"[peace, peacebuilding, society, civil, agreeme...",[The article discusses the importance of civil...
72,35,72_rebels_bentiu_control_unity,"[rebels, bentiu, control, unity, mayom, rebel,...",[The article discusses the South Sudanese army...
114,22,114_amnesty_militia_groups_olony,"[amnesty, militia, groups, olony, uliny, johns...",[The article discusses former militia leader G...
151,15,151_ssla_spla_attack_forces,"[ssla, spla, attack, forces, jonglei, loyal, s...",[The article discusses allegations by the SSLA...
51,44,51_signing_darfur_revolutionary_peace,"[signing, darfur, revolutionary, peace, front,...",[The article discusses the arrival of Sudanese...


In [15]:
# Reload the geocoded articles so the topic flags are attached to a fresh copy of the data
original_df = pd.read_csv("geodata/results/articles_with_counties_regions.csv", parse_dates=["date"])

topic_columns = ["hunger", "refugees", "humanitarian", "conflict", "vulnerability"]

# Keep one row of flags per distinct summary text. The join key is the free-text
# summary, so a repeated summary on the right-hand side would otherwise multiply
# the matching rows of original_df in the merge below.
topic_flags = df[["summary"] + topic_columns].drop_duplicates(subset="summary")

# Combine article summaries with the newly created features
df = original_df.merge(topic_flags, how="left", on="summary")

# A left merge against unique keys must keep exactly one output row per input article
assert len(df) == len(original_df), "merge on 'summary' changed the number of articles"

df.to_csv("data/articles_topics.csv", index=False) # Save DataFrame to articles_topics.csv

In [16]:
# Total number of articles
print(len(df))

# Number of articles for which every topic flag is explicitly False
flag_columns = ["hunger", "refugees", "humanitarian", "conflict", "vulnerability"]
no_category = df[flag_columns].eq(False).all(axis=1)
print(len(df[no_category]))

11780
10013


There are a lot of articles that do not get sorted into any of the categories. So, feel free to change or expand this approach!