In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models
from collections import defaultdict


# Load the cleaned article summaries; the "date" column is parsed into
# datetimes while reading.
df = pd.read_csv(
    "/Users/abdalrhman/Documents/DC/data/articles_summary_cleaned.csv",
    parse_dates=["date"],
)

# Fetch the NLTK resources used during preprocessing: the stop-word corpus
# and the "punkt" tokenizer data.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize *text* into lower-cased, alphabetic, non-stop-word tokens.

    Args:
        text: The raw text to clean. Anything that is not a ``str`` (for
            example the float NaN pandas uses for a missing cell) is
            treated as an empty document.

    Returns:
        A list of lower-cased tokens in their original order. Tokens that
        are not purely alphabetic, or whose lower-cased form is in the
        module-level ``stop_words`` set, are dropped. Returns ``[]`` for
        non-string input.
    """
    # word_tokenize only accepts strings; without this guard a single
    # missing summary would abort the whole DataFrame.apply call.
    if not isinstance(text, str):
        return []

    # Keep purely alphabetic tokens, lower-casing each one exactly once.
    lowered = (word.lower() for word in word_tokenize(text) if word.isalpha())
    return [word for word in lowered if word not in stop_words]

# Tokenize every summary once; each row of 'clean_summary' holds a token list.
df['clean_summary'] = df['summary'].apply(preprocess_text)

# Grouping by year and building LDA models
year_of_row = df['date'].dt.year
# Rows whose date could not be parsed have a NaN year; leave them out so no
# attempt is made to train a model on an empty selection.
years = year_of_row.dropna().unique()
topic_models = {}

# groupby() splits the frame in a single pass and ignores NaN keys.
# sort=False keeps the years in order of first appearance, the same order
# unique() reports them in.
for year, year_df in df.groupby(year_of_row, sort=False):
    # The year column becomes float when any date is missing; normalise the
    # key so it is stored (and later printed) as a plain integer.
    year = int(year)

    # Create a dictionary and a corpus for the year
    dictionary = corpora.Dictionary(year_df['clean_summary'])
    if len(dictionary) == 0:
        # LdaModel cannot be trained without any terms; skip the year
        # instead of letting one empty year abort the whole run.
        print(f"Skipping {year}: no tokens left after preprocessing")
        continue
    corpus = [dictionary.doc2bow(text) for text in year_df['clean_summary']]

    # Train LDA model
    # NOTE(review): no random_state is passed, so the topics are presumably
    # not reproducible between runs -- confirm whether a fixed seed is wanted.
    lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)
    topic_models[year] = lda_model

# Ask every yearly model for up to 10 topics, each described by 5 words.
# Each entry is a (topic_id, "weight*word + ...") tuple.
top_topics_per_year = {
    year: lda_model.print_topics(num_topics=10, num_words=5)
    for year, lda_model in topic_models.items()
}

# Report the topics year by year, one topic tuple per line.
for year, topics in top_topics_per_year.items():
    heading = f"Top 10 Topics for {year}:"
    print(heading)
    for topic in topics:
        print(topic)
    # "\n" plus print's own newline leaves two blank lines between years.
    print("\n")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abdalrhman/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abdalrhman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Top 10 Topics for 2011:
(0, '0.037*"sudan" + 0.033*"south" + 0.025*"article" + 0.018*"discusses" + 0.017*"state"')
(1, '0.029*"south" + 0.028*"sudan" + 0.027*"article" + 0.019*"discusses" + 0.011*"government"')
(2, '0.032*"south" + 0.030*"sudan" + 0.028*"president" + 0.021*"article" + 0.018*"discusses"')
(3, '0.028*"sudan" + 0.022*"article" + 0.021*"south" + 0.017*"discusses" + 0.008*"republic"')
(4, '0.025*"sudan" + 0.025*"south" + 0.023*"article" + 0.015*"discusses" + 0.009*"also"')
(5, '0.061*"sudan" + 0.041*"south" + 0.021*"article" + 0.018*"oil" + 0.016*"discusses"')
(6, '0.017*"sudan" + 0.013*"south" + 0.011*"article" + 0.009*"discusses" + 0.007*"international"')
(7, '0.045*"sudan" + 0.036*"south" + 0.016*"article" + 0.016*"rebel" + 0.014*"army"')
(8, '0.028*"sudan" + 0.025*"article" + 0.025*"south" + 0.018*"discusses" + 0.009*"juba"')
(9, '0.050*"south" + 0.050*"sudan" + 0.022*"article" + 0.018*"discusses" + 0.010*"development"')


Top 10 Topics for 2012:
(0, '0.026*"sudan" + 0.