In [58]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

pd.set_option("display.max_columns", 20)
pd.set_option("display.width", 240)

# Bigram

In [59]:
import nltk
from collections import Counter

In [60]:
df =  pd.read_csv(
    "data/processed_data.csv", index_col=0
)

In [61]:
df['bigrams'] = df['headline'].astype('str').apply(lambda row: list(nltk.bigrams(row.split(' '))))
df

Unnamed: 0,link,headline,category,short_description,authors,date,words_clipped,words_clipped_headline,year,bigrams
0,https://www.huffpost.com/entry/covid-boosters-...,million american roll sleev omicrontarget covi...,U.S. NEWS,health expert said earli predict demand match ...,"Carla K. Johnson, AP",2022-09-23,29,11,2022,"[(million, american), (american, roll), (roll,..."
3,https://www.huffpost.com/entry/funniest-parent...,funniest tweet parent week sept,PARENTING,accident grownup toothpast toddler toothbrush ...,Caroline Bologna,2022-09-23,25,9,2022,"[(funniest, tweet), (tweet, parent), (parent, ..."
1,https://www.huffpost.com/entry/american-airlin...,american airlin flyer charg ban life punch fli...,U.S. NEWS,subdu passeng crew fled aircraft confront acco...,Mary Papenfuss,2022-09-23,28,13,2022,"[(american, airlin), (airlin, flyer), (flyer, ..."
2,https://www.huffpost.com/entry/funniest-tweets...,funniest tweet cat dog week sept,COMEDY,dog understand eaten,Elyse Wanshel,2022-09-23,12,13,2022,"[(funniest, tweet), (tweet, cat), (cat, dog), ..."
4,https://www.huffpost.com/entry/amy-cooper-lose...,woman call cop black birdwatch lose lawsuit ex...,U.S. NEWS,ami cooper accus invest firm franklin templeto...,Nina Golgowski,2022-09-22,25,11,2022,"[(woman, call), (call, cop), (cop, black), (bl..."
...,...,...,...,...,...,...,...,...,...,...
209520,https://www.huffingtonpost.comhttp://www.engad...,white hous chief technolog offic aneesh chopra...,TECH,appoint fair bit fanfar unit state chief techn...,,2012-01-28,23,10,2012,"[(white, hous), (hous, chief), (chief, technol..."
209521,https://www.huffingtonpost.com/entry/watch-top...,watch youtub video week,TECH,look popular youtub video week look bring,Catharine Smith,2012-01-28,22,9,2012,"[(watch, youtub), (youtub, video), (video, week)]"
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,rim ceo thorsten hein isignific plan blackberri,TECH,verizon wireless att promot lte devic includ s...,"Reuters, Reuters",2012-01-28,18,8,2012,"[(rim, ceo), (ceo, thorsten), (thorsten, hein)..."
209523,https://www.huffingtonpost.com/entry/maria-sha...,maria sharapova stun victoria azarenka austral...,SPORTS,afterward azarenka effus press normal credit c...,,2012-01-28,20,10,2012,"[(maria, sharapova), (sharapova, stun), (stun,..."


## (1) Bigram Count Dataframe

In [62]:
# Create all bigrams

all_bigrams = [bigram for bigrams_list in df['bigrams'] for bigram in bigrams_list]
all_bigrams

[('million', 'american'),
 ('american', 'roll'),
 ('roll', 'sleev'),
 ('sleev', 'omicrontarget'),
 ('omicrontarget', 'covid'),
 ('covid', 'booster'),
 ('funniest', 'tweet'),
 ('tweet', 'parent'),
 ('parent', 'week'),
 ('week', 'sept'),
 ('american', 'airlin'),
 ('airlin', 'flyer'),
 ('flyer', 'charg'),
 ('charg', 'ban'),
 ('ban', 'life'),
 ('life', 'punch'),
 ('punch', 'flight'),
 ('flight', 'attend'),
 ('attend', 'video'),
 ('funniest', 'tweet'),
 ('tweet', 'cat'),
 ('cat', 'dog'),
 ('dog', 'week'),
 ('week', 'sept'),
 ('woman', 'call'),
 ('call', 'cop'),
 ('cop', 'black'),
 ('black', 'birdwatch'),
 ('birdwatch', 'lose'),
 ('lose', 'lawsuit'),
 ('lawsuit', 'exemploy'),
 ('cleaner', 'dead'),
 ('dead', 'belk'),
 ('belk', 'bathroom'),
 ('bathroom', 'day'),
 ('day', 'bodi'),
 ('bodi', 'polic'),
 ('report', 'get'),
 ('get', 'ador'),
 ('ador', 'surpris'),
 ('surpris', 'boyfriend'),
 ('boyfriend', 'live'),
 ('live', 'tv'),
 ('puerto', 'rican'),
 ('rican', 'desper'),
 ('desper', 'water'),
 ('

In [63]:
# Count the occurrence of each bigram
all_bigram_counts = Counter(all_bigrams)

all_bigram_counts

Counter({('donald', 'trump'): 4594,
         ('hillari', 'clinton'): 1321,
         ('new', 'york'): 1213,
         ('white', 'hous'): 867,
         ('berni', 'sander'): 699,
         ('suprem', 'court'): 677,
         ('health', 'care'): 653,
         ('climat', 'chang'): 587,
         ('new', 'year'): 528,
         ('look', 'like'): 459,
         ('fashion', 'week'): 426,
         ('studi', 'find'): 423,
         ('super', 'bowl'): 421,
         ('morn', 'email'): 408,
         ('stephen', 'colbert'): 401,
         ('north', 'korea'): 396,
         ('kim', 'kardashian'): 367,
         ('ted', 'cruz'): 361,
         ('need', 'know'): 345,
         ('fox', 'news'): 336,
         ('taylor', 'swift'): 331,
         ('valentin', 'day'): 331,
         ('mother', 'day'): 325,
         ('paul', 'ryan'): 312,
         ('social', 'medium'): 309,
         ('photo', 'poll'): 308,
         ('sexual', 'assault'): 290,
         ('joe', 'biden'): 271,
         ('michell', 'obama'): 267,
         ('j

In [64]:
# Create a DataFrame from all_bigram_counts
df_all_bigram_counts = pd.DataFrame(list(all_bigram_counts.items()), columns=['bigram', 'count'])

df_all_bigram_counts.sort_values(by=['count'],ascending=False)

Unnamed: 0,bigram,count
1023,"(donald, trump)",4594
5936,"(hillari, clinton)",1321
1117,"(new, york)",1213
551,"(white, hous)",867
20653,"(berni, sander)",699
...,...,...
243303,"(fear, yazidi)",1
243302,"(keep, fear)",1
243301,"(repris, keep)",1
243300,"(isi, repris)",1


In [65]:
df_all_bigram_counts['bigram'] = df_all_bigram_counts['bigram'].apply(lambda x: '_'.join(x))
df_all_bigram_counts.sort_values(by=['count'],ascending=False)

Unnamed: 0,bigram,count
1023,donald_trump,4594
5936,hillari_clinton,1321
1117,new_york,1213
551,white_hous,867
20653,berni_sander,699
...,...,...
243303,fear_yazidi,1
243302,keep_fear,1
243301,repris_keep,1
243300,isi_repris,1


## (2) Bigram Count by Category Dataframe

In [66]:
# Create bigram for each Category

category_bigram_counts = {category: Counter() for category in df['category'].unique()}

# Count bigrams for each category
for index, row in df.iterrows():
    category = row['category']
    bigram_counts = Counter(row['bigrams'])
    category_bigram_counts[category] += bigram_counts

In [67]:
category_bigram_counts

{'U.S. NEWS': Counter({('new', 'york'): 24,
          ('suprem', 'court'): 14,
          ('jeffrey', 'epstein'): 13,
          ('tropic', 'storm'): 11,
          ('dead', 'injur'): 10,
          ('death', 'toll'): 10,
          ('leaf', 'dead'): 10,
          ('mass', 'shoot'): 9,
          ('los', 'angel'): 9,
          ('test', 'posit'): 9,
          ('santa', 'anita'): 9,
          ('hors', 'death'): 9,
          ('plane', 'crash'): 8,
          ('border', 'patrol'): 8,
          ('york', 'citi'): 8,
          ('california', 'wildfir'): 8,
          ('el', 'paso'): 8,
          ('offici', 'say'): 7,
          ('white', 'hous'): 7,
          ('arrest', 'alleg'): 7,
          ('sexual', 'assault'): 7,
          ('car', 'crash'): 7,
          ('southern', 'california'): 7,
          ('hate', 'crime'): 7,
          ('breonna', 'taylor'): 6,
          ('alex', 'jone'): 6,
          ('school', 'shoot'): 6,
          ('polic', 'chief'): 6,
          ('covid', 'shot'): 6,
          ('polic'

In [68]:
# Create a DataFrame with the top 100 most common bigrams for each category
top_bigrams = {category: dict(bigram_counts.most_common(200)) for category, bigram_counts in category_bigram_counts.items()}
df_top_bigrams = pd.DataFrame(top_bigrams).fillna(0)

df_top_bigrams

Unnamed: 0,Unnamed: 1,U.S. NEWS,PARENTING,COMEDY,WORLD NEWS,CULTURE & ARTS,TECH,SPORTS,ENTERTAINMENT,POLITICS,WEIRD NEWS,...,STYLE,GREEN,TASTE,HEALTHY LIVING,THE WORLDPOST,GOOD NEWS,WORLDPOST,FIFTY,ARTS,DIVORCE
new,york,24.0,13.0,11.0,0.0,13.0,17.0,43.0,52.0,205.0,6.0,...,17.0,13.0,9.0,13.0,5.0,5.0,0.0,0.0,12.0,5.0
suprem,court,14.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,560.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jeffrey,epstein,13.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tropic,storm,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dead,injur,10.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
grey,divorc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
friend,get,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
ex,photo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
men,divorc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [69]:
df_top_bigrams = df_top_bigrams.reset_index()
df_top_bigrams

Unnamed: 0,level_0,level_1,U.S. NEWS,PARENTING,COMEDY,WORLD NEWS,CULTURE & ARTS,TECH,SPORTS,ENTERTAINMENT,...,STYLE,GREEN,TASTE,HEALTHY LIVING,THE WORLDPOST,GOOD NEWS,WORLDPOST,FIFTY,ARTS,DIVORCE
0,new,york,24.0,13.0,11.0,0.0,13.0,17.0,43.0,52.0,...,17.0,13.0,9.0,13.0,5.0,5.0,0.0,0.0,12.0,5.0
1,suprem,court,14.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,jeffrey,epstein,13.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tropic,storm,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dead,injur,10.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6138,grey,divorc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
6139,friend,get,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
6140,ex,photo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
6141,men,divorc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [70]:
# Concatenate the first two columns to create a new 'bigram' column
df_top_bigrams['bigram'] = df_top_bigrams.iloc[:, 0].astype(str) + '_' + df_top_bigrams.iloc[:, 1].astype(str)
df_top_bigrams

Unnamed: 0,level_0,level_1,U.S. NEWS,PARENTING,COMEDY,WORLD NEWS,CULTURE & ARTS,TECH,SPORTS,ENTERTAINMENT,...,GREEN,TASTE,HEALTHY LIVING,THE WORLDPOST,GOOD NEWS,WORLDPOST,FIFTY,ARTS,DIVORCE,bigram
0,new,york,24.0,13.0,11.0,0.0,13.0,17.0,43.0,52.0,...,13.0,9.0,13.0,5.0,5.0,0.0,0.0,12.0,5.0,new_york
1,suprem,court,14.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,suprem_court
2,jeffrey,epstein,13.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jeffrey_epstein
3,tropic,storm,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tropic_storm
4,dead,injur,10.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,dead_injur
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6138,grey,divorc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,grey_divorc
6139,friend,get,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,friend_get
6140,ex,photo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,ex_photo
6141,men,divorc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,men_divorc


## (3) Concat two dataframe (1) + (2)

In [71]:
df_top_bigrams = df_top_bigrams.drop(columns=['level_0', 'level_1']) 

In [72]:
df_bigram_final = pd.merge(df_top_bigrams, df_all_bigram_counts, on=['bigram'])
df_bigram_final

Unnamed: 0,U.S. NEWS,PARENTING,COMEDY,WORLD NEWS,CULTURE & ARTS,TECH,SPORTS,ENTERTAINMENT,POLITICS,WEIRD NEWS,...,TASTE,HEALTHY LIVING,THE WORLDPOST,GOOD NEWS,WORLDPOST,FIFTY,ARTS,DIVORCE,bigram,count
0,24.0,13.0,11.0,0.0,13.0,17.0,43.0,52.0,205.0,6.0,...,9.0,13.0,5.0,5.0,0.0,0.0,12.0,5.0,new_york,1213
1,14.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,560.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,suprem_court,677
2,13.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jeffrey_epstein,19
3,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tropic_storm,33
4,10.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,dead_injur,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,grey_divorc,5
6139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,friend_get,6
6140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,ex_photo,5
6141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,men_divorc,5


In [73]:
# move `bigram` and `count` cols to first
count_data= df_bigram_final.pop('count')
bigram_data = df_bigram_final.pop('bigram')

df_bigram_final.insert(0, 'count', count_data)
df_bigram_final.insert(0, 'bigram', bigram_data)
df_bigram_final

Unnamed: 0,bigram,count,U.S. NEWS,PARENTING,COMEDY,WORLD NEWS,CULTURE & ARTS,TECH,SPORTS,ENTERTAINMENT,...,STYLE,GREEN,TASTE,HEALTHY LIVING,THE WORLDPOST,GOOD NEWS,WORLDPOST,FIFTY,ARTS,DIVORCE
0,new_york,1213,24.0,13.0,11.0,0.0,13.0,17.0,43.0,52.0,...,17.0,13.0,9.0,13.0,5.0,5.0,0.0,0.0,12.0,5.0
1,suprem_court,677,14.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,jeffrey_epstein,19,13.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tropic_storm,33,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dead_injur,44,10.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6138,grey_divorc,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
6139,friend_get,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
6140,ex_photo,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
6141,men_divorc,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [74]:
df_bigram_final_top_50 = df_bigram_final.sort_values(by=['count'], ascending=False)[:50]
df_bigram_final_top_50

Unnamed: 0,bigram,count,U.S. NEWS,PARENTING,COMEDY,WORLD NEWS,CULTURE & ARTS,TECH,SPORTS,ENTERTAINMENT,...,STYLE,GREEN,TASTE,HEALTHY LIVING,THE WORLDPOST,GOOD NEWS,WORLDPOST,FIFTY,ARTS,DIVORCE
393,donald_trump,4594,0.0,0.0,581.0,17.0,0.0,5.0,38.0,386.0,...,6.0,8.0,0.0,6.0,55.0,0.0,2.0,5.0,0.0,0.0
403,hillari_clinton,1321,0.0,0.0,38.0,0.0,0.0,0.0,0.0,49.0,...,8.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,new_york,1213,24.0,13.0,11.0,0.0,13.0,17.0,43.0,52.0,...,17.0,13.0,9.0,13.0,5.0,5.0,0.0,0.0,12.0,5.0
18,white_hous,867,7.0,0.0,45.0,0.0,0.0,6.0,7.0,30.0,...,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0
418,berni_sander,699,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,suprem_court,677,14.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
451,health_care,653,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,51.0,0.0,0.0,0.0,2.0,0.0,0.0
99,climat_chang,587,3.0,0.0,12.0,17.0,0.0,5.0,0.0,0.0,...,0.0,142.0,0.0,0.0,14.0,0.0,6.0,0.0,0.0,0.0
120,new_year,528,3.0,25.0,11.0,5.0,0.0,0.0,0.0,31.0,...,6.0,2.0,7.0,30.0,13.0,2.0,5.0,3.0,0.0,12.0
240,look_like,459,0.0,16.0,21.0,0.0,2.0,4.0,21.0,54.0,...,16.0,3.0,6.0,11.0,0.0,0.0,4.0,0.0,0.0,0.0


In [75]:
df_bigram_final_top_50.to_csv('data/news_bigram_data.csv', index=False)

## (4) Convert to long table

In [76]:
id_cols = ['bigram', 'count']

# Melt the DataFrame to long format
df_long_bigram_melt = pd.melt(
    df_bigram_final_top_50,
    id_vars=id_cols,
    value_vars=df_bigram_final_top_50.columns[2:],
    var_name='category',
    value_name='value'
)

df_long_bigram_melt

Unnamed: 0,bigram,count,category,value
0,donald_trump,4594,U.S. NEWS,0.0
1,hillari_clinton,1321,U.S. NEWS,0.0
2,new_york,1213,U.S. NEWS,24.0
3,white_hous,867,U.S. NEWS,7.0
4,berni_sander,699,U.S. NEWS,0.0
...,...,...,...,...
2095,week_photo,228,DIVORCE,0.0
2096,plan_parenthood,227,DIVORCE,0.0
2097,trevor_noah,226,DIVORCE,0.0
2098,marco_rubio,217,DIVORCE,0.0


In [77]:
total_rows = df_long_bigram_melt.groupby('bigram')['count'].max().reset_index()
total_rows['category'] = 'TOTAL'

df_long_bigram_total = total_rows.rename(columns={'bigram': 'bigram', 'category': 'category' ,'count': 'value'})
df_long_bigram_total = df_long_bigram_total[['bigram', 'category', 'value']]

df_long_bigram_total.sort_values(by=['value'], ascending=False, inplace=True)

df_long_bigram_total

Unnamed: 0,bigram,category,value
3,donald_trump,TOTAL,4594
9,hillari_clinton,TOTAL,1321
26,new_york,TOTAL,1213
48,white_hous,TOTAL,867
0,berni_sander,TOTAL,699
40,suprem_court,TOTAL,677
7,health_care,TOTAL,653
2,climat_chang,TOTAL,587
25,new_year,TOTAL,528
17,look_like,TOTAL,459


In [78]:
df_long_bigram_category = df_long_bigram_melt[['bigram', 'category', 'value']]
df_long_bigram = pd.concat([df_long_bigram_category, df_long_bigram_total])

df_long_bigram

Unnamed: 0,bigram,category,value
0,donald_trump,U.S. NEWS,0.0
1,hillari_clinton,U.S. NEWS,0.0
2,new_york,U.S. NEWS,24.0
3,white_hous,U.S. NEWS,7.0
4,berni_sander,U.S. NEWS,0.0
...,...,...,...
47,week_photo,TOTAL,228.0
32,plan_parenthood,TOTAL,227.0
43,trevor_noah,TOTAL,226.0
18,marco_rubio,TOTAL,217.0


In [79]:
df_long_bigram.to_csv('data/news_bigram_counts_data.csv',index=False)

# Plotly Chart

## (1) Top 10 bigram in Article Headlines


In [80]:
trace = go.Bar(
                x = df_bigram_final_top_50['count'][:10],
                y = df_bigram_final_top_50['bigram'][:10],
                text=df_bigram_final_top_50['count'][:10], textposition='outside', orientation='h')
layout = go.Layout(template= "plotly_white",title = 'TOP 10 BIGRAMS IN PUBLICATION HEADLINE' , xaxis = dict(title = 'Count', automargin=True,), yaxis = dict(title = 'Bigram',autorange="reversed"))
fig = go.Figure(data = [trace], layout = layout)
fig.show()

## (2) Comparison of bigrams for two categories

In [81]:
comp_first, comp_second = 'POLITICS','TOTAL'

comp_list = [comp_first, comp_second]
temp_df = df_long_bigram[df_long_bigram.category.isin(comp_list)]
temp_df.loc[temp_df.category == comp_list[-1], "value"] = -temp_df[
    temp_df.category == comp_list[-1]
].value.values

In [82]:
def comp_bigram_comparisons(comp_first, comp_second):
    comp_list = [comp_first, comp_second]
    temp_df = df_long_bigram[df_long_bigram.category.isin(comp_list)]
    temp_df.loc[temp_df.category == comp_list[-1], "value"] = -temp_df[temp_df.category == comp_list[-1]].value.values

    fig = px.bar(
        temp_df,
        title="Comparison: " + comp_first + " | " + comp_second,
        x="bigram",
        y="value",
        color="category",
        template="plotly_white",
        color_discrete_sequence=px.colors.qualitative.Bold,
        labels={"category": "Category:", "bigram": "Bi-Gram"},
    )
    fig.update_layout(legend=dict(x=0.1, y=1.1), legend_orientation="h")
    fig.update_yaxes(title="", showticklabels=False)
    fig.data[0]["hovertemplate"] = fig.data[0]["hovertemplate"][:-14]
    fig.show()

In [41]:
comp_bigram_comparisons('POLITICS','TOTAL')