In [2]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from pprint import pprint


In [3]:
df = pd.read_csv("data/processed_data.csv", index_col=0)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,words_clipped,words_clipped_headline
0,https://www.huffpost.com/entry/covid-boosters-...,million american roll sleev omicrontarget covi...,U.S. NEWS,health expert said earli predict demand match ...,"Carla K. Johnson, AP",2022-09-23,29,11
2,https://www.huffpost.com/entry/funniest-tweets...,funniest tweet cat dog week sept,COMEDY,dog understand eaten,Elyse Wanshel,2022-09-23,12,13
3,https://www.huffpost.com/entry/funniest-parent...,funniest tweet parent week sept,PARENTING,accident grownup toothpast toddler toothbrush ...,Caroline Bologna,2022-09-23,25,9
1,https://www.huffpost.com/entry/american-airlin...,american airlin flyer charg ban life punch fli...,U.S. NEWS,subdu passeng crew fled aircraft confront acco...,Mary Papenfuss,2022-09-23,28,13
4,https://www.huffpost.com/entry/amy-cooper-lose...,woman call cop black birdwatch lose lawsuit ex...,U.S. NEWS,ami cooper accus invest firm franklin templeto...,Nina Golgowski,2022-09-22,25,11


In [17]:
## Tokenization again

data = list(df['headline'].values)

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [20]:
# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]
[[('american', 1), ('booster', 1), ('covid', 1), ('million', 1), ('omicrontarget', 1), ('roll', 1), ('sleev', 1)]]


In [26]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           passes=5,
                                           alpha='auto',
                                           per_word_topics=True)

In [27]:
lda_model.save("model/model_lda_100.model")

In [28]:
# Load model
from gensim.models.ldamodel import LdaModel
lda = LdaModel.load("model/model_lda_100.model")

In [29]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.025*"travel" + 0.024*"poll" + 0.018*"mother" + 0.017*"spring" + '
  '0.017*"famili" + 0.013*"man" + 0.012*"school" + 0.012*"right" + '
  '0.012*"black" + 0.011*"nation"'),
 (1,
  '0.043*"way" + 0.041*"look" + 0.037*"home" + 0.028*"tip" + 0.023*"like" + '
  '0.021*"babi" + 0.020*"need" + 0.017*"know" + 0.014*"model" + 0.013*"hair"'),
 (2,
  '0.237*"photo" + 0.096*"video" + 0.022*"recip" + 0.020*"dress" + 0.017*"get" '
  '+ 0.013*"eat" + 0.008*"evolut" + 0.008*"realli" + 0.008*"perfect" + '
  '0.007*"break"'),
 (3,
  '0.033*"marriag" + 0.023*"good" + 0.019*"guid" + 0.018*"obama" + 0.015*"dad" '
  '+ 0.014*"valentin" + 0.013*"step" + 0.012*"come" + 0.011*"fear" + '
  '0.011*"watch"'),
 (4,
  '0.035*"studi" + 0.034*"fashion" + 0.027*"child" + 0.021*"find" + '
  '0.018*"talk" + 0.017*"cancer" + 0.017*"design" + 0.016*"fall" + '
  '0.015*"show" + 0.015*"weight"'),
 (5,
  '0.040*"love" + 0.022*"live" + 0.021*"beauti" + 0.020*"star" + 0.015*"super" '
  '+ 0.015*"heart" + 0.014*"noth"

In [42]:
for i, row in enumerate(lda_model[corpus]):
    print(i,"row:", row)
    break

0 row: ([(0, 0.18569507), (1, 0.07249465), (2, 0.082396485), (3, 0.11860351), (4, 0.07902971), (5, 0.06865962), (6, 0.07279318), (7, 0.098926), (8, 0.1464891), (9, 0.074912645)], [(0, [8]), (1, [0]), (2, [9]), (3, [0]), (4, []), (5, [3]), (6, [])], [(0, [(8, 0.9999357)]), (1, [(0, 0.9946801)]), (2, [(9, 0.02032671)]), (3, [(0, 0.99991685)]), (4, []), (5, [(3, 0.99953455)]), (6, [])])


In [54]:
sorted(row, key=lambda x: (x[1]), reverse=True)

TypeError: '<' not supported between instances of 'int' and 'tuple'

In [55]:
def format_topics_sentences(ldamodel, corpus, texts, dates):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True,
                )
            else:
                break
    sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]

    # Add original text to the end of the output
    contents = pd.Series(texts)

    sent_topics_df = pd.concat([sent_topics_df, contents, pd.Series(dates)], axis=1)
    return sent_topics_df

In [67]:
sent_topics_df = pd.DataFrame()

# Get main topic in each document
for i, row in enumerate(lda_model[corpus]):
    row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  
            wp = lda_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df._append(
                pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                ignore_index=True,
            )
        else:
            break

# Add original text to the end of the output
contents = pd.Series(data)

sent_topics_df = pd.concat([sent_topics_df, contents, pd.Series(list(df["date"].values))], axis=1)

sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords", "headline", "date"]

In [68]:
sent_topics_df

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,headline,date
0,0,0.1857,"travel, poll, mother, spring, famili, man, sch...",million american roll sleev omicrontarget covi...,2022-09-23
1,7,0.3813,"day, new, week, kid, life, world, parent, food...",funniest tweet cat dog week sept,2022-09-23
2,7,0.3496,"day, new, week, kid, life, world, parent, food...",funniest tweet parent week sept,2022-09-23
3,0,0.2628,"travel, poll, mother, spring, famili, man, sch...",american airlin flyer charg ban life punch fli...,2022-09-23
4,0,0.2166,"travel, poll, mother, spring, famili, man, sch...",woman call cop black birdwatch lose lawsuit ex...,2022-09-22
...,...,...,...,...,...
187017,2,0.2253,"photo, video, recip, dress, get, eat, evolut, ...",daili correspond clip week al madrig biggest m...,2012-01-28
187018,9,0.1640,"wed, style, health, idea, hotel, coupl, reason...",mitt romney mad florida edit video,2012-01-28
187019,2,0.1442,"photo, video, recip, dress, get, eat, evolut, ...",amaz generat photo,2012-01-28
187020,0,0.2993,"travel, poll, mother, spring, famili, man, sch...",russian cargo ship dock intern space station,2012-01-28


In [69]:
sent_topics_df.to_csv('data/lda_data.csv',index=False)