In [1]:
import pandas as pd
import pickle

# Restore the cleaned articles object from its pickle file (later cells use it
# as a pandas DataFrame).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("data/articles_cleaned.pkl", mode="rb") as articles_pkl:
    articles = pickle.load(articles_pkl)


In [2]:
# Load the pickled list of LDA models and keep only the entry at index 2.
# NOTE(review): presumably this is the 25-topic model (file name says 15-30 by 5,
# and topicModel_names below has 25 entries) - confirm.
with open("pickleFiles/LDA_modelList_15_30_by_5.pkl", mode="rb") as models_pkl:
    topicModel = pickle.load(models_pkl)[2]

In [3]:
from gensim.corpora import Dictionary

def prepareForLDA(list_tokenized, stopwords = None):
    """
    Prepares a dictionary and a corpus for LDA
    Parameters:
        - list_tokenized: a list of tokenized documents (i.e. a list of lists of tokens)
        - stopwords: iterable of words that still need to be excluded from the
          list_tokenized (default: None, i.e. nothing is excluded)
    Returns:
        - dictionary: gensim Dictionary built from the filtered documents
        - corpus: list with one bag-of-words (dictionary.doc2bow) per document,
          in the same order as list_tokenized
    """
    # Materialise the stopwords once: set membership is O(1) per token, and a
    # one-shot iterable (e.g. a generator) is not exhausted after the first token.
    excluded = frozenset(stopwords) if stopwords is not None else frozenset()

    # make a new list_tokenized without the stopwords
    list_stopwordsExcluded = [
        [word for word in doc if word not in excluded]
        for doc in list_tokenized
    ]

    dictionary = Dictionary(list_stopwordsExcluded) # get the vocabulary

    corpus = [dictionary.doc2bow(text) for text in list_stopwordsExcluded]

    return dictionary, corpus


# Build the LDA vocabulary from the cleaned articles and store each article's
# bag-of-words in a new 'corpusLDA' column.
articles_dictionaryLDA, lda_corpus = prepareForLDA(articles['cleaned_article'])
articles['corpusLDA'] = lda_corpus


In [5]:
# Manually assigned names for the topic ids of the model object.
# Ids that could not be identified are labelled '<id> - NaN'.
topicModel_names = {
    0: 'sports',
    1: 'war',
    2: 'EUPolitics',
    3: 'crimes',
    4: '4 - NaN',
    5: 'israelPalestine',
    6: 'immigration',
    7: 'showArts',
    8: '8 - NaN',
    9: '9 - NaN',
    10: 'spaceTravel',
    11: 'elections',
    12: 'instAbuse',
    13: 'protest',
    14: 'terrorism',
    15: '15 - NaN',
    16: '16 - NaN',
    17: 'economy',
    18: '18 - NaN',
    19: '19 - NaN',
    20: 'USPolitics',
    21: 'britishBrexit',
    22: 'covid',
    23: 'climateChange',
    24: 'naturalDisasters',
}


In [6]:
# compute the named topic distribution of a single article

def topics(currentArticle):
    """
    Returns the named topic distribution of one article
    Parameters:
        - currentArticle: positional (iloc) row number of the article in `articles`
    Returns:
        - a dict mapping every manually identified topic name to its weight;
          topics the model does not report at the 0.05 minimum probability stay 0,
          and topics named '... NaN' (not manually identified) are left out
    """
    # bow representation of the current article
    article_bow = articles['corpusLDA'].iloc[currentArticle]

    # (topic id, weight) pairs reported by the model for this article
    id_weights = topicModel.get_document_topics(article_bow, minimum_probability = 0.05)

    # translate ids to names, skipping topics that could not be manually
    # identified (and thus are not included in the interest data)
    named_weights = {}
    for topic_id, weight in id_weights:
        topic_name = topicModel_names[topic_id]
        if 'NaN' not in topic_name:
            named_weights[topic_name] = weight

    # start every identified topic at 0, then overlay the reported weights
    distribution = dict.fromkeys(
        ['EUPolitics', 'crimes', 'israelPalestine', 'immigration', 'sports', 'war',
         'climateChange', 'showArts', 'covid', 'britishBrexit', 'instAbuse',
         'spaceTravel', 'protest', 'terrorism', 'USPolitics', 'naturalDisasters',
         'elections', 'economy'],
        0,
    )
    distribution.update(named_weights)

    return distribution


# create a topics dataframe for all articles: one row per article (same index as
# `articles`), one column per name in topicModel_names.
# The frame is built in one go from the per-article dicts instead of being
# pre-filled with integer zeros and overwritten row by row: that avoids the
# int -> float upcast on assignment and the slow per-row .iloc writes.
# (The former module-level `global` statement was a no-op and is dropped.)
topic_rows = [topics(idx) for idx in range(len(articles))]

articles_topics = pd.DataFrame(
    topic_rows,
    index = articles.index,
    columns = list(topicModel_names.values()),
).astype(float)
# Columns for the unidentified '... NaN' topics stay empty (NaN), because
# topics() never returns values for them.

In [None]:
# Write the per-article topic distributions to disk as CSV.
articles_topics.to_csv(path_or_buf='data/articles_topics.csv')