In [1]:
import gensim
import pandas as pd
import csv
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_numeric
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string
from gensim.utils import simple_preprocess
import os
import nltk
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import re
from gensim.models import CoherenceModel
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.gensim_models

# Fetch the NLTK resources the preprocessing step relies on
# (each call is a no-op when the package is already cached locally).
nltk.download('stopwords')  # English stop-word list read via stopwords.words('english')
nltk.download('punkt')      # tokenizer models used by word_tokenize on older NLTK releases
# Recent NLTK releases (3.9 and later) load the tokenizer from 'punkt_tab'
# instead of the pickled 'punkt' models; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
# Kept last so the cell's displayed value is still the result of the wordnet download.
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diegojimenez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/diegojimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/diegojimenez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Read the article dataset stored in "updated_dataframe_with_clusters_word2vec.csv"
clustered_articles_csv = "/Users/diegojimenez/Documents/GitHub/computational_ds/data/output/updated_dataframe_with_clusters_word2vec.csv"
combined_df = pd.read_csv(clustered_articles_csv)


In [None]:
## Generate the dataframe with the scraped files
# NOTE(review): this cell rebinds `combined_df`, and `folder_path` is the same
# directory the clustered CSV is read from in the preceding cell, so running it
# would also pull that derived file into the concatenation. Confirm whether this
# cell is still meant to be part of a full run.

# Specify the folder path containing the scraped files
folder_path = '/Users/diegojimenez/Documents/GitHub/computational_ds/data/output'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined frame identical on every run and every machine.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder holds no CSV files.
if not csv_files:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Initialize an empty list to store DataFrames
dataframes = []

# Read every CSV file into its own DataFrame
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all DataFrames into one, renumbering the index from 0
combined_df = pd.concat(dataframes, ignore_index=True)

In [3]:
# Function that adds a cleaned, tokenised version of a text column to the dataframe
def preprocess_text(df, text_column, custom_stopwords, stop_words=None):
    """Tokenise, filter and lemmatise ``df[text_column]`` into a new column.

    The result is stored in ``df["<text_column>_cleaned"]`` as one list of
    tokens per row: the surviving unigrams followed by every adjacent pair of
    those unigrams joined with a single space (bigrams).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text. It is modified in place (the new column is
        assigned on this object) and also returned.
    text_column : str
        Name of the column containing the raw text.
    custom_stopwords : iterable of str or None
        Extra lowercase words to drop in addition to ``stop_words``.
    stop_words : iterable of str, optional
        Base stop-word list. Defaults to NLTK's English stop words, so the
        function no longer depends on a module-level ``stop_words`` variable
        having been defined before it is called.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cleaned column added.

    Notes
    -----
    Rows whose value is not a ``str`` (e.g. NaN) yield an empty list.
    """
    clean_text_column = f"{text_column}_cleaned"
    lemmatizer = WordNetLemmatizer()

    if stop_words is None:
        stop_words = stopwords.words('english')
    # A single set gives O(1) membership tests instead of scanning a list per token.
    blocked = set(stop_words) | set(custom_stopwords or ())

    # Function for cleaning a single text
    def clean_single_text(text):
        if not isinstance(text, str):
            return []

        tokens = []
        for token in word_tokenize(text.lower()):
            # isalpha() already rejects purely numeric tokens, so no separate
            # isdigit() check is needed.
            if not token.isalpha():
                continue
            # Filter before lemmatising: the lemmatiser can turn a stop word
            # into a form that is no longer in the list (e.g. "was" -> "wa").
            if token in blocked:
                continue
            lemma = lemmatizer.lemmatize(token)
            # Filter again after lemmatising so inflected forms of a blocked
            # word are removed too (e.g. "years" -> "year").
            if lemma in blocked:
                continue
            tokens.append(lemma)

        # Add bigrams (adjacent token pairs joined by a space) to the list of tokens
        tokens += [' '.join(bigram) for bigram in bigrams(tokens)]

        return tokens

    # Apply the cleaning function to the specified column
    df[clean_text_column] = df[text_column].apply(clean_single_text)

    return df

# NLTK's English stop words, plus extra corpus-specific terms to drop
stop_words = set(stopwords.words('english'))
custom_stopwords = [
    "reuters", "fox", "cnn",
    "was", "has", "us", "year",
]

# Adds the "Text_cleaned" column; preprocess_text assigns it on combined_df
# itself and returns that same frame.
df_clean_text = preprocess_text(
    df=combined_df,
    text_column="Text",
    custom_stopwords=custom_stopwords,
)

In [4]:
# Preview the first five rows (head's default) to check the cleaned column
df_clean_text.head()

Unnamed: 0,ID,Headline,Date,Text,Organization,Link,Cluster,Text_cleaned
0,1,China says it’s building new homegrown aircraf...,"Updated 1:09 PM EST, Fri January 1, 2016",Story highlights The U.S. military has long be...,CNN,https://edition.cnn.com/2015/12/31/asia/china-...,619,"[story, highlight, military, long, believed, c..."
1,2,"Hillary Clinton emails: Kissinger, Photoshop a...","Updated 9:34 PM EST, Thu December 31, 2015",Story highlights The State Department released...,CNN,https://edition.cnn.com/2015/12/31/politics/cl...,-1,"[story, highlight, state, department, released..."
2,3,How the stars rang in 2016,"Published 9:04 AM EST, Fri January 1, 2016",Story highlights Some stars hit exotic locatio...,CNN,https://edition.cnn.com/2016/01/01/entertainme...,-1,"[story, highlight, star, hit, exotic, location..."
3,4,Smoke still wafting from Dubai’s luxury Addres...,"Updated 4:55 PM EST, Fri January 1, 2016",Story highlights NEW: Fire has been contained ...,CNN,https://edition.cnn.com/2016/01/01/middleeast/...,482,"[story, highlight, new, fire, contained, guest..."
4,5,Super PACs backing Cruz to launch TV ad blitz,"Published 8:36 PM EST, Thu December 31, 2015",Story highlights Ads supporting Cruz set to ai...,CNN,https://edition.cnn.com/2015/12/31/politics/cr...,1428,"[story, highlight, ad, supporting, cruz, set, ..."


In [5]:
# Build the vocabulary from the cleaned token lists, then prune it: keep only
# tokens that appear in at least 20 documents and in at most 20% of them
tokenized_docs = df_clean_text["Text_cleaned"]
dic = corpora.Dictionary(tokenized_docs)
dic.filter_extremes(no_below=20, no_above=0.2)

# Convert every document to its bag-of-words representation
corpus = list(map(dic.doc2bow, tokenized_docs))
len(corpus)

446967

In [6]:
# Candidate topic counts to compare when fitting LDA
param_grid = dict(num_topics=[10, 15, 20])


In [8]:
# Fixed seed so the fitted topics, the coherence scores and therefore the
# selected "best" model are the same on every run (LDA training is randomly
# initialised when no random_state is given).
LDA_RANDOM_STATE = 42

best_lda_model = None
best_score = float('-inf')
model_coherence_scores = []        # (num_topics, coherence score) per fitted model
document_topic_probabilities = []  # (num_topics, [(doc index, topic distribution), ...]) per model

## Iterate over the specified range of number of topics
for num_topics in param_grid['num_topics']:
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dic,
        passes=5,
        random_state=LDA_RANDOM_STATE,
    )

    # Coherence score (c_v) for the current model, computed on the cleaned token lists
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=df_clean_text["Text_cleaned"],
        dictionary=dic,
        coherence='c_v',
    )
    score = coherence_model.get_coherence()
    model_coherence_scores.append((num_topics, score))

    # Topic distribution of every document under this model, keyed by the
    # document's position in `corpus`. get_document_topics returns
    # (topic id, probability) pairs and omits topics below the model's
    # minimum probability, so the lists can be shorter than num_topics.
    model_probabilities = [
        (doc_index, lda_model.get_document_topics(bow))
        for doc_index, bow in enumerate(corpus)
    ]
    document_topic_probabilities.append((num_topics, model_probabilities))

    # Update the best model if the current model has a higher coherence score
    if score > best_score:
        best_score = score
        best_lda_model = lda_model

# Save coherence scores to CSV
coherence_csv_path = 'coherence_scores_3.csv'
with open(coherence_csv_path, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Num_Topics', 'Coherence_Score'])
    csv_writer.writerows(model_coherence_scores)

# Save document-topic probabilities to CSV (one row per model/document pair;
# the distribution is written as the string form of the list of pairs)
doc_topic_prob_csv_path = 'document_topic_probabilities_3.csv'
with open(doc_topic_prob_csv_path, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Num_Topics', 'Doc_ID', 'Topic_Distribution'])
    for num_topics, model_probabilities in document_topic_probabilities:
        for doc_id, topic_distribution in model_probabilities:
            csv_writer.writerow([num_topics, doc_id, topic_distribution])

In [9]:
# Prepare the pyLDAvis view of the best-scoring LDA model, write it to an
# HTML file, and render it as this cell's output
visualization_html_path = 'best_lda_visualization_3.html'
vis = pyLDAvis.gensim_models.prepare(best_lda_model, corpus, dic)
pyLDAvis.save_html(vis, visualization_html_path)
pyLDAvis.display(vis)