In [2]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from pprint import pprint

### LDA - Topic Modelling

In [3]:
df = pd.read_csv("data/cleaned_data.csv")
df = df.sort_values(by=['date'], ascending=False)

df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,words_clipped,words_clipped_headline,year
187020,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,29,11,2022
187019,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,25,9,2022
187018,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,28,13,2022
187017,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,12,13,2022
187016,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,25,11,2022


In [4]:
processed_df = pd.read_csv("data/processed_data.csv", index_col=0)
processed_df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,words_clipped,words_clipped_headline,year
0,https://www.huffpost.com/entry/covid-boosters-...,million american roll sleev omicrontarget covi...,U.S. NEWS,health expert said earli predict demand match ...,"Carla K. Johnson, AP",2022-09-23,29,11,2022
3,https://www.huffpost.com/entry/funniest-parent...,funniest tweet parent week sept,PARENTING,accident grownup toothpast toddler toothbrush ...,Caroline Bologna,2022-09-23,25,9,2022
1,https://www.huffpost.com/entry/american-airlin...,american airlin flyer charg ban life punch fli...,U.S. NEWS,subdu passeng crew fled aircraft confront acco...,Mary Papenfuss,2022-09-23,28,13,2022
2,https://www.huffpost.com/entry/funniest-tweets...,funniest tweet cat dog week sept,COMEDY,dog understand eaten,Elyse Wanshel,2022-09-23,12,13,2022
4,https://www.huffpost.com/entry/amy-cooper-lose...,woman call cop black birdwatch lose lawsuit ex...,U.S. NEWS,ami cooper accus invest firm franklin templeto...,Nina Golgowski,2022-09-22,25,11,2022


In [5]:
## Tokenization again but already remove stopword, lemmization and stemming

data = list(processed_df['headline'].values)

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data_words = list(sent_to_words(data))

In [20]:
# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]
[[('american', 1), ('booster', 1), ('covid', 1), ('million', 1), ('omicrontarget', 1), ('roll', 1), ('sleev', 1)]]


In [106]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5,
                                           passes=15)

In [107]:
lda_model.save("model/model_lda.model")

In [108]:
# Load model
from gensim.models.ldamodel import LdaModel
lda = LdaModel.load("model/model_lda.model")

In [127]:
lda_model_test = LdaModel.load("model/model_lda.model")

In [109]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.015*"studi" + 0.012*"poll" + 0.010*"health" + 0.009*"idea" + 0.009*"new" '
  '+ 0.008*"network" + 0.008*"guid" + 0.007*"hous" + 0.006*"diy" + '
  '0.006*"plan"'),
 (1,
  '0.013*"parent" + 0.012*"tip" + 0.012*"dress" + 0.011*"kid" + 0.010*"thing" '
  '+ 0.010*"life" + 0.010*"child" + 0.009*"video" + 0.008*"cancer" + '
  '0.008*"day"'),
 (2,
  '0.079*"photo" + 0.019*"video" + 0.017*"week" + 0.016*"new" + 0.016*"day" + '
  '0.016*"divorc" + 0.014*"style" + 0.011*"best" + 0.011*"recip" + '
  '0.011*"love"'),
 (3,
  '0.019*"video" + 0.014*"marriag" + 0.009*"hotel" + 0.008*"healthi" + '
  '0.007*"craft" + 0.006*"school" + 0.006*"wed" + 0.006*"happi" + 0.005*"date" '
  '+ 0.005*"teen"'),
 (4,
  '0.039*"photo" + 0.018*"wed" + 0.016*"fashion" + 0.012*"way" + 0.011*"day" + '
  '0.009*"need" + 0.009*"talk" + 0.008*"video" + 0.008*"mom" + 0.008*"make"')]


In [112]:
sent_topics_df = pd.DataFrame()

# Get main topic in each document
for i, row in enumerate(lda_model[corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  
            wp = lda_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df._append(
                pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                ignore_index=True,
            )
        else:
            break

# Add original text to the end of the output
contents = pd.Series(list(df['headline'].values))

sent_topics_df = pd.concat([sent_topics_df, contents, pd.Series(list(df["category"].values)), pd.Series(list(df["date"].values))], axis=1)

sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords", "headline", "category", "date"]

In [113]:
sent_topics_df

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,headline,category,date
0,4,0.3399,"photo, wed, fashion, way, day, need, talk, vid...",Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,2022-09-23
1,1,0.7432,"parent, tip, dress, kid, thing, life, child, v...",23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,2022-09-23
2,1,0.6273,"parent, tip, dress, kid, thing, life, child, v...",The Funniest Tweets From Parents This Week (Se...,PARENTING,2022-09-23
3,1,0.4880,"parent, tip, dress, kid, thing, life, child, v...","American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,2022-09-23
4,3,0.5315,"video, marriag, hotel, healthi, craft, school,...",Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,2022-09-22
...,...,...,...,...,...,...
187017,2,0.5463,"photo, video, week, new, day, divorc, style, b...",Daily Show Correspondent Clip Of The Week: Al ...,COMEDY,2012-01-28
187018,0,0.6329,"studi, poll, health, idea, new, network, guid,...",Mitt Romney Madness: Florida Edition (VIDEO),COMEDY,2012-01-28
187019,1,0.5500,"parent, tip, dress, kid, thing, life, child, v...",7 Amazing Name Generators (PHOTOS),COMEDY,2012-01-28
187020,2,0.3509,"photo, video, week, new, day, divorc, style, b...",Russian Cargo Ship Docks At International Spac...,SCIENCE,2012-01-28


In [115]:
sent_topics_df.to_csv('data/lda_data.csv',index=False)

# Plotly Chart

In [125]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from collections import Counter


In [157]:
df_dominant_topic = pd.read_csv('data/lda_data.csv')
df_dominant_topic

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,headline,category,date
0,4,0.3399,"photo, wed, fashion, way, day, need, talk, vid...",Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,2022-09-23
1,1,0.7432,"parent, tip, dress, kid, thing, life, child, v...",23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,2022-09-23
2,1,0.6273,"parent, tip, dress, kid, thing, life, child, v...",The Funniest Tweets From Parents This Week (Se...,PARENTING,2022-09-23
3,1,0.4880,"parent, tip, dress, kid, thing, life, child, v...","American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,2022-09-23
4,3,0.5315,"video, marriag, hotel, healthi, craft, school,...",Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,2022-09-22
...,...,...,...,...,...,...
187017,2,0.5463,"photo, video, week, new, day, divorc, style, b...",Daily Show Correspondent Clip Of The Week: Al ...,COMEDY,2012-01-28
187018,0,0.6329,"studi, poll, health, idea, new, network, guid,...",Mitt Romney Madness: Florida Edition (VIDEO),COMEDY,2012-01-28
187019,1,0.5500,"parent, tip, dress, kid, thing, life, child, v...",7 Amazing Name Generators (PHOTOS),COMEDY,2012-01-28
187020,2,0.3509,"photo, video, week, new, day, divorc, style, b...",Russian Cargo Ship Docks At International Spac...,SCIENCE,2012-01-28


In [160]:
df_dominant_topic_groupby = df_dominant_topic.groupby(by=['Dominant_Topic','Topic_Keywords','category'])['headline'].count().reset_index()
df_dominant_topic_groupby.columns = ['Dominant_Topic','Topic_Keywords','Category', 'Count']
df_dominant_topic_groupby

Unnamed: 0,Dominant_Topic,Topic_Keywords,Category,Count
0,0,"studi, poll, health, idea, new, network, guid,...",ARTS,115
1,0,"studi, poll, health, idea, new, network, guid,...",ARTS & CULTURE,267
2,0,"studi, poll, health, idea, new, network, guid,...",BLACK VOICES,793
3,0,"studi, poll, health, idea, new, network, guid,...",BUSINESS,2037
4,0,"studi, poll, health, idea, new, network, guid,...",COLLEGE,263
...,...,...,...,...
205,4,"photo, wed, fashion, way, day, need, talk, vid...",WEIRD NEWS,370
206,4,"photo, wed, fashion, way, day, need, talk, vid...",WELLNESS,3915
207,4,"photo, wed, fashion, way, day, need, talk, vid...",WOMEN,677
208,4,"photo, wed, fashion, way, day, need, talk, vid...",WORLD NEWS,486


In [118]:
top_category = (
    df.groupby("category")["link"]
    .count()
    .sort_values(ascending=False)[:20]
    .index
)

top_category

Index(['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY',
       'PARENTING', 'FOOD & DRINK', 'QUEER VOICES', 'HEALTHY LIVING',
       'BUSINESS', 'COMEDY', 'SPORTS', 'HOME & LIVING', 'BLACK VOICES',
       'THE WORLDPOST', 'WEDDINGS', 'PARENTS', 'DIVORCE', 'WORLD NEWS',
       'WOMEN'],
      dtype='object', name='category')

In [119]:
top_category_dominant_topic_df = df_dominant_topic_groupby[df_dominant_topic_groupby["Category"].isin(top_category)]
top_category_dominant_topic_df

Unnamed: 0,Dominant_Topic,Topic_Keywords,Category,Count
2,0,"studi, poll, health, idea, new, network, guid,...",BLACK VOICES,793
3,0,"studi, poll, health, idea, new, network, guid,...",BUSINESS,2037
5,0,"studi, poll, health, idea, new, network, guid,...",COMEDY,1940
8,0,"studi, poll, health, idea, new, network, guid,...",DIVORCE,382
10,0,"studi, poll, health, idea, new, network, guid,...",ENTERTAINMENT,2055
...,...,...,...,...
202,4,"photo, wed, fashion, way, day, need, talk, vid...",TRAVEL,1674
204,4,"photo, wed, fashion, way, day, need, talk, vid...",WEDDINGS,1227
206,4,"photo, wed, fashion, way, day, need, talk, vid...",WELLNESS,3915
207,4,"photo, wed, fashion, way, day, need, talk, vid...",WOMEN,677


## (1) Word Count and Importance of Topic Keywords


In [165]:
topics = lda_model.show_topics(formatted=False)
data_flat = [w for w_list in data_words for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i, weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Create subplot
fig = make_subplots(rows=1, cols=5, subplot_titles=['Topic: ' + str(i) for i in range(5)],
                    shared_yaxes=True, vertical_spacing=0.2)

for i in range(5):
    subplot_df = df.loc[df.topic_id == i, :]
    
    # Word Count Bar with text outside and numbers on bars
    fig.add_trace(
        go.Bar(x=subplot_df['word'], y=subplot_df['word_count'], name='Word Count',
               text=subplot_df.apply(lambda row: f"Word Count: {row['word_count']}, Importance: {row['importance']:.3f}", axis=1),
               hoverinfo='text+x+y',  # Set hover information
               textposition='outside'),  # Set text position outside the bars
        row=1, col=i + 1
    )

# Update layout
fig.update_layout(
    title_text='Word Count and Importance of Topic Keywords',
    showlegend=False
)

fig.show()

## (2) Topic Volum across Top Category

In [120]:
fig = px.bar(top_category_dominant_topic_df, 
             x='Dominant_Topic', 
             y='Count',
             text='Count',
             hover_data=['Topic_Keywords'],
             color='Category',
             labels={'Count':'Number of Articles','Dominant_Topic':'Dominant Topic'},
             barmode="group",
             )

fig.update_traces(textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

In [122]:
top_category_dominant_topic_df['Dominant_Topic'] = top_category_dominant_topic_df['Dominant_Topic'].astype(str)

fig = px.bar(top_category_dominant_topic_df, 
             x='Category', 
             y='Count',
             text='Count',
             hover_data=['Topic_Keywords'],
             color='Dominant_Topic',
             labels={'Count':'Number of Articles'},
             barmode="stack",
            #  facet_col="Dominant_Topic",
            #  category_orders={"Dominant_Topic": ["0", "1", "2", "3", "4"]}
             )

fig.update_traces(textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## (3) Dominant Topic-Category Distribution

In [178]:
df_dominant_topic = pd.read_csv('data/lda_data.csv')
df_dominant_topic

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,headline,category,date
0,4,0.3399,"photo, wed, fashion, way, day, need, talk, vid...",Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,2022-09-23
1,1,0.7432,"parent, tip, dress, kid, thing, life, child, v...",23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,2022-09-23
2,1,0.6273,"parent, tip, dress, kid, thing, life, child, v...",The Funniest Tweets From Parents This Week (Se...,PARENTING,2022-09-23
3,1,0.4880,"parent, tip, dress, kid, thing, life, child, v...","American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,2022-09-23
4,3,0.5315,"video, marriag, hotel, healthi, craft, school,...",Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,2022-09-22
...,...,...,...,...,...,...
187017,2,0.5463,"photo, video, week, new, day, divorc, style, b...",Daily Show Correspondent Clip Of The Week: Al ...,COMEDY,2012-01-28
187018,0,0.6329,"studi, poll, health, idea, new, network, guid,...",Mitt Romney Madness: Florida Edition (VIDEO),COMEDY,2012-01-28
187019,1,0.5500,"parent, tip, dress, kid, thing, life, child, v...",7 Amazing Name Generators (PHOTOS),COMEDY,2012-01-28
187020,2,0.3509,"photo, video, week, new, day, divorc, style, b...",Russian Cargo Ship Docks At International Spac...,SCIENCE,2012-01-28


In [204]:
df_dominant_topic_corr = df_dominant_topic_groupby[['Dominant_Topic','Category','Count']]
df_dominant_topic_corr = df_dominant_topic_corr[df_dominant_topic_groupby["Category"]!='POLITICS']

In [205]:
heatmap_data = df_dominant_topic_corr.pivot(index='Dominant_Topic', columns='Category', values='Count')
heatmap_data

Category,ARTS,ARTS & CULTURE,BLACK VOICES,BUSINESS,COLLEGE,COMEDY,CRIME,CULTURE & ARTS,DIVORCE,EDUCATION,...,TECH,THE WORLDPOST,TRAVEL,U.S. NEWS,WEDDINGS,WEIRD NEWS,WELLNESS,WOMEN,WORLD NEWS,WORLDPOST
Dominant_Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,115,267,793,2037,263,1940,281,114,382,238,...,550,1102,1337,346,349,325,3641,643,928,394
1,202,288,921,787,198,634,372,185,598,151,...,432,430,2229,197,439,559,4766,718,344,201
2,278,369,748,638,95,873,155,364,1119,87,...,372,385,2761,152,971,461,2841,433,430,206
3,131,157,1078,675,259,509,1866,227,634,273,...,353,1346,1386,575,661,544,2504,634,1102,186
4,137,240,579,978,106,630,154,170,668,152,...,305,394,1674,107,1227,370,3915,677,486,255


In [207]:
# px.imshow(heatmap_data, text_auto=True, aspect="auto")

fig = px.imshow(heatmap_data, color_continuous_scale='Teal', origin='lower', text_auto=True, aspect="auto")

fig.update_yaxes(title='Dominate Topic')
fig.update_xaxes(title='Category (Excluding POLICTICS)')
fig.show()