## Import Libraries

In [None]:
import os
import pandas as pd
import requests
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
import re
from zipfile import BadZipFile
import pyLDAvis
import pyLDAvis.gensim_models

## Define Country List and Timeframe

In [None]:
# Load the country lookup table. The 'Alpha-2 code' column holds two-letter
# country codes; 'IS_Africa' is 1 for countries to keep and 0 for countries
# to exclude.
# keep_default_na=False stops pandas from parsing the literal code "NA"
# (Namibia) as a missing value; only genuinely empty cells become NaN.
country_list = pd.read_excel('Country List.xlsx', keep_default_na=False, na_values=[''])
# Rows without a code are dropped so the lists contain strings only (a NaN
# entry would break the '|'.join(...) used to build the filter pattern).
blacklist_countries = country_list[country_list['IS_Africa'] == 0]['Alpha-2 code'].dropna().tolist()
sub_saharan_countries = country_list[country_list['IS_Africa'] == 1]['Alpha-2 code'].dropna().tolist()

# Study window (both endpoints inclusive)
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 6, 30)

# One 'YYYYMMDD' string per calendar day in the window
date_list = [
    day.strftime('%Y%m%d')
    for day in (
        start_date + timedelta(days=offset)
        for offset in range((end_date - start_date).days + 1)
    )
]

# Six 'HHMMSS' timestamps per day, four hours apart starting at midnight
time_list = [f"{hour:02d}0000" for hour in range(0, 24, 4)]

## Load and Clean Data

In [None]:
# Function to download and process GDELT GKG data for a specific date
def process_gdelt_gkg(date, time):
    """Download (if not cached) and parse one GDELT GKG snapshot.

    The archive is cached in the working directory as ``{date}{time}.zip`` and
    re-used on later calls instead of being downloaded again.

    Args:
        date: Snapshot date as a 'YYYYMMDD' string.
        time: Snapshot time as an 'HHMMSS' string.

    Returns:
        A DataFrame with integer column labels (the file has no header row),
        or an empty DataFrame when the snapshot cannot be downloaded or the
        cached archive is not a valid zip file.
    """
    url = f"http://data.gdeltproject.org/gdeltv2/{date}{time}.gkg.csv.zip"
    zip_path = f"{date}{time}.zip"

    # Download only when the archive is not already cached
    if not os.path.exists(zip_path):
        try:
            # A timeout keeps one stalled connection from hanging the whole run
            response = requests.get(url, timeout=60)
            # Without this check an HTTP error page would be saved as the
            # ".zip" and then re-used from the cache on every later run.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Download failed for {url}: {exc}. Skipping this file.")
            return pd.DataFrame()

        # Write to a temporary name and rename afterwards so an interrupted
        # write never leaves a truncated archive under the cache name.
        partial_path = f"{zip_path}.part"
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, zip_path)

    # Read the zip file
    try:
        return pd.read_csv(zip_path, compression='zip', header=None, delimiter='\t', encoding='latin-1')
    except BadZipFile:
        print(f"BadZipFile error encountered for {zip_path}. Skipping this file.")
        return pd.DataFrame()  # Return an empty DataFrame if a BadZipFile error occurs

# Collect one DataFrame per (date, time) snapshot and concatenate them once at
# the end. Calling pd.concat inside the loop would re-copy every previously
# accumulated row on each iteration.
snapshot_frames = []
for date in date_list:
    for time in time_list:
        daily_data = process_gdelt_gkg(date, time)
        snapshot_frames.append(daily_data)

# pd.concat raises on an empty list, so fall back to an empty DataFrame
all_data = pd.concat(snapshot_frames, ignore_index=True) if snapshot_frames else pd.DataFrame()

In [None]:
# Drop records with no value in column 9
# (presumably the GKG locations field — confirm against the GKG codebook)
all_data.dropna(subset=[9], how='all', inplace=True)

# Keep only the records whose column 9 mentions none of the blacklisted
# country codes. Note this excludes non-African mentions; it does not require
# that an African country is mentioned.
# NOTE(review): codes are matched as plain substrings anywhere in the field,
# so a code can also match inside a longer token — confirm this is intended.
if blacklist_countries:
    # Escape each code so it is always matched literally
    blacklist_pattern = '|'.join(re.escape(code) for code in blacklist_countries)
    mentions_blacklisted = all_data[9].str.contains(blacklist_pattern)
    # .copy() makes the result an independent frame, so adding columns to it
    # later does not trigger SettingWithCopyWarning
    filtered_data = all_data[~mentions_blacklisted].copy()
else:
    # An empty blacklist would produce the pattern '', which matches every
    # row and would discard all data; nothing is excluded instead.
    filtered_data = all_data.copy()

## News Title LDA Analysis

In [None]:
def extract_page_title(text):
    """Return the text inside the first <PAGE_TITLE>...</PAGE_TITLE> element.

    Args:
        text: Raw field contents. Non-string values (e.g. NaN for an empty
            cell, or None) are accepted and treated as "no title".

    Returns:
        The title string, or None when *text* is not a string or contains no
        PAGE_TITLE element.
    """
    # Empty cells arrive as float NaN; re.search would raise TypeError on them
    if not isinstance(text, str):
        return None
    match = re.search(r'<PAGE_TITLE>(.*?)</PAGE_TITLE>', text)
    return match.group(1) if match else None

# Pull the headline out of column 26 for every record; records without a
# <PAGE_TITLE> element get None, which the fillna below turns into ''.
filtered_data['Page_Title'] = filtered_data[26].map(extract_page_title)
filtered_data = filtered_data.fillna(value='')

stop_words = set(stopwords.words('english'))


def _normalise_title(title):
    """Lower-case *title* and keep only alphabetic, non-stopword tokens."""
    kept_tokens = []
    for token in word_tokenize(title.lower()):
        if token.isalpha() and token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


filtered_data['Processed_Title'] = filtered_data['Page_Title'].apply(_normalise_title)

# Bag-of-words representation: one token list and one sparse vector per title
texts = list(map(str.split, filtered_data['Processed_Title']))
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# Fit a 10-topic LDA model and list the top words of every topic
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, passes=15)
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")

In [None]:
# Render the fitted topic model as an interactive pyLDAvis panel inside the
# notebook; topic positions are computed with the 'mmds' layout option.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
    mds='mmds',
)
pyLDAvis.display(vis)

In [None]:
# For every title, pick the topic id that carries the highest probability
dominant_topics = []
for doc in corpus:
    best_topic_id, _best_probability = max(lda_model[doc], key=lambda pair: pair[1])
    dominant_topics.append(best_topic_id)

# Store the winning topic next to each record
filtered_data['Dominant_Topic'] = dominant_topics

# Function to print sample titles for each topic
def print_sample_titles_per_topic(df, topic_num, sample_size=5):
    """Print up to *sample_size* randomly chosen titles assigned to a topic.

    Args:
        df: DataFrame with 'Dominant_Topic' and 'Page_Title' columns.
        topic_num: Zero-based topic id; the printed header shows it one-based.
        sample_size: Maximum number of titles to print. Fewer are printed when
            the topic has fewer titles; none when it has no titles at all.
    """
    print(f"\nSample titles for Topic {topic_num+1}:")
    topic_titles = df.loc[df['Dominant_Topic'] == topic_num, 'Page_Title']

    # Series.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the number of titles available.
    n_samples = min(sample_size, len(topic_titles))
    if n_samples == 0:
        return

    # Fixed seed so the same titles are shown on every run
    for title in topic_titles.sample(n=n_samples, random_state=1).tolist():
        print(f"- {title}")

# Show a handful of example headlines for every topic the model learned
for topic in range(lda_model.num_topics):
    print_sample_titles_per_topic(df=filtered_data, topic_num=topic)

## GKG Analysis

In [None]:
# Split the ';'-separated values of column 7 into one column per theme.
# Rows with fewer themes are padded with None, which stack() skips below.
expanded_data = pd.DataFrame(filtered_data[7].str.split(';').tolist(), index=filtered_data.index)

# Count how often each theme occurs across all records
theme_counts = expanded_data.stack().value_counts()

# Convert to DataFrame
theme_counts_df = pd.DataFrame(theme_counts).reset_index()
theme_counts_df.columns = ['Theme', 'Frequency']

# Remove the empty-string "theme" produced by splitting (e.g. from a trailing
# ';' or an empty cell). It is matched by value rather than dropped by
# position, because it is not guaranteed to be the most frequent entry.
theme_counts_df = theme_counts_df[theme_counts_df['Theme'] != ''].reset_index(drop=True)

In [None]:
# Bar chart of the ten most frequent themes (theme_counts_df is already
# ordered by descending frequency, so the first ten rows are the top ten)
top_themes = theme_counts_df.iloc[:10]

plt.figure(figsize=(10, 6))
plt.bar(x=top_themes['Theme'], height=top_themes['Frequency'])
plt.title('Top 10 Themes in Sub-Saharan Africa (Jan 2024 - Jun 2024)')
plt.xlabel('Theme')
plt.ylabel('Frequency')
# Theme codes are long, so draw the tick labels vertically
plt.xticks(rotation=90)
plt.show()