# Source: adapted from [this Medium article](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0)

> LDA is a generative probabilistic model that assumes each topic is a mixture over an underlying set of words, and each document is a mixture over a set of topic probabilities

In [1]:
import pandas as pd
import random
import os

# Getting articles from the most-scraped domains

In [3]:
# Load the parsed articles (one row per scraped article) from the processed data folder.
articles_csv_path = '../data/processed/author-parse-articles.csv'
df = pd.read_csv(articles_csv_path)

In [4]:
# Extract the host part of each article URL (the text right after http:// or https://).
# The host ends at the first '/', '?' or '#', or at the end of the string, so a URL
# with no path (e.g. 'https://example.com') still yields a domain instead of
# dropping through to the raw-URL fallback below.
m = df['art_article_url'].str.extract(r'https?://([^/?#]+)', expand=False)
# Fall back to the raw URL when no scheme-prefixed host could be found.
m = m.fillna(df['art_article_url'])
df['art_extracted_domain'] = m

In [5]:
# Number of articles per extracted domain, most frequent first.
domain_counts = df.groupby('art_extracted_domain')['art_extracted_domain'].count()
domain_counts.sort_values(ascending=False)

art_extracted_domain
www.nytimes.com        489
www.cnn.com            412
www.theguardian.com    258
www.reuters.com        217
www.aljazeera.com      181
                      ... 
warsawinstitute.org      1
wdnyradio.com            1
weartv.com               1
weisradio.com            1
zoso.ro                  1
Name: art_extracted_domain, Length: 902, dtype: int64

In [6]:
# Names of the 50 domains with the highest article counts.
articles_per_domain = df.groupby('art_extracted_domain')['art_extracted_domain'].count()
top_domains = articles_per_domain.sort_values(ascending=False)[:50]
most_scraped = list(top_domains.index)

In [42]:
# Keep only the articles whose domain is one of the most-scraped domains.
in_most_scraped = df['art_extracted_domain'].isin(most_scraped)
df_filtered = df[in_most_scraped]

In [43]:
# Number of articles left after filtering to the most-scraped domains.
len(df_filtered.index)

4098

# Topic Modelling for One Domain

In [44]:
import random

In [45]:
# Show the domains in most_scraped (the candidates for the random pick in the next cell).
print(most_scraped)


['www.nytimes.com', 'www.cnn.com', 'www.theguardian.com', 'www.reuters.com', 'www.aljazeera.com', 'www.dailymail.co.uk', 'edition.cnn.com', 'www.rferl.org', 'www.nbcnews.com', 'www.cnbc.com', 'www.dw.com', 'abcnews.go.com', 'www.usnews.com', 'www.businessinsider.com', 'www.telegraph.co.uk', 'www.cbsnews.com', 'news.sky.com', 'apnews.com', 'www.npr.org', 'rubryka.com', 'www.voanews.com', 'thehill.com', 'nypost.com', 'www.pbs.org', 'foreignpolicy.com', 'www.politico.eu', 'hindustannewshub.com', 'www.themoscowtimes.com', 'www.axios.com', 'www.dailykos.com', 'www.latimes.com', 'www.express.co.uk', 'economictimes.indiatimes.com', 'www.foxnews.com', 'www.dailysabah.com', 'www.19fortyfive.com', 'www.thedrive.com', 'www.firstpost.com', 'www.mirror.co.uk', 'www.militarytimes.com', 'm.economictimes.com', 'www.atlanticcouncil.org', 'time.com', 'timesofindia.indiatimes.com', 'www.france24.com', 'www.scmp.com', 'www.cbc.ca', 'www.vox.com', 'charter97.org', 'www.theglobeandmail.com']


In [69]:
# Pick one news domain at random.
# random.randint(a, b) includes BOTH endpoints, so using len(most_scraped) as the
# upper bound could produce an index one past the end of the list (IndexError).
# random.randrange(n) draws from 0 .. n-1 only.
index_picked = random.randrange(len(most_scraped))
organisation = most_scraped[index_picked]
print(f'Index Picked: {index_picked}\nOrgnisation Picked: {organisation}')

# Columns carried forward for the chosen domain.
focus_columns = [
    'art_article_author',
    'art_article_date',
    'art_article_text',
    'ref_time-capture',
    'ref_full-text',
    'ref_title',
]
# Select rows and columns in a single .loc call rather than chained indexing.
df_focus = df_filtered.loc[df_filtered['art_extracted_domain'] == organisation, focus_columns]


Index Picked: 49
Orgnisation Picked: www.theglobeandmail.com


In [73]:
# Move the current row labels into a column and keep only the first five rows.
# NOTE(review): df_focus is reassigned from itself, so re-running this cell applies
# reset_index() again on the already-reset frame; re-run the previous cell first.
df_focus = df_focus.reset_index().iloc[:5]

In [74]:
# Display the rows kept for the picked domain.
df_focus

Unnamed: 0,index,art_article_author,art_article_date,art_article_text,ref_time-capture,ref_full-text,ref_title
0,886,"['Mark Mackinnon', 'Murat Yukselir', 'The Glob...",2022-03-26 06:00:00-04:00,Kyiv residents march on Feb. 12 to show a unit...,2022-03-25 08:33:32.112068,SourceOn live map\r\nTell friends\r\n18 hours ...,Evacuation train Kyiv-Ivano-Frankivsk near Vas...
1,889,['David Ljunggren'],2022-03-26 16:52:33-04:00,A view of the square outside the damaged local...,2022-03-25 08:33:36.538618,SourceOn live map\r\nTell friends\r\n18 hours ...,Many fires across Kharkiv as result of Russian...
2,1026,"['Globe Staff', 'Murat Yukselir', 'The Globe A...",2022-03-01 01:44:04-05:00,This digest has now been archived. Find the la...,2022-02-27 22:04:07.644186,SourceOn live map\r\nTell friends\r\n5 hours a...,"Slovakia announces temporary residency, includ..."
3,1088,"['Globe Staff', 'Murat Yukselir', 'The Globe A...",2022-02-27 02:04:46-05:00,This digest has now been archived. Find the la...,2022-02-27 22:05:35.139451,SourceOn live map\r\nTell friends\r\n7 hours a...,EU foreign policy chief Borrell says part of t...
4,1209,['Globe Staff'],2022-04-03 10:21:09-04:00,A woman walks amid destroyed Russian tanks in ...,2022-04-03 12:43:36.388682,,Multiple casualties as result of Russian shell...


# Preprocessing 

In [75]:
# Load the regular expression library
import re

# Punctuation characters to strip. Written as a raw string; inside a character
# class '.' is already literal, so the '\.' escape (an invalid escape sequence in
# a plain string literal) is not needed.
PUNCTUATION_PATTERN = re.compile(r'[,.!?]')

# Remove punctuation and lowercase the article text in one vectorised pass.
# The .str accessor leaves missing values as missing, whereas calling re.sub /
# .lower() on each value through .map raises as soon as a text is not a string.
df_focus['art_article_text_processed'] = (
    df_focus['art_article_text']
    .str.replace(PUNCTUATION_PATTERN, '', regex=True)
    .str.lower()
)
# Show the first rows of the processed article text
df_focus['art_article_text_processed'].head()

0    kyiv residents march on feb 12 to show a unite...
1    a view of the square outside the damaged local...
2    this digest has now been archived find the lat...
3    this digest has now been archived find the lat...
4    a woman walks amid destroyed russian tanks in ...
Name: art_article_text_processed, dtype: object

In [77]:
# Processed text of the row labelled 0, read with a single row/column lookup.
df_focus.loc[0, 'art_article_text_processed']

"kyiv residents march on feb 12 to show a united front against russian president vladimir putin who would invade the country two weeks lateranton skyba/the globe and mail\r\n\r\nthe day began as most now do with air-raid sirens over the black sea port of odesa but last friday morning’s screams were accompanied by bursts of heavy shooting seemingly just a few blocks from where we were standing\r\n\r\ni stood on the sidewalk with a group of ukrainian men and speculated nervously these weren’t the singular spaced booms of artillery that we’d become sadly accustomed to in kyiv and other cities in the three weeks since this war began these more rapid sounds suggested ukrainian anti-aircraft fire targeting something overhead or – perhaps worse – small-arms fire somewhere inside the city\r\n\r\neither way it was time to move and for the first time since russian president vladimir putin ordered his army to invade ukraine i was heading not toward another ukrainian city to another part of the wa

# WordCloud

In [79]:
# WordCloud is the class instantiated in the next cell; requires the third-party
# `wordcloud` package to be installed in the kernel's environment.
from wordcloud import WordCloud

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Join the processed article texts into one string, skipping any missing texts.
# (The column lives on df_focus; no `papers` frame is defined in this notebook.)
long_string = ','.join(df_focus['art_article_text_processed'].dropna())
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
