In [1]:
import pandas as pd
import requests
import datetime
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import KMeans
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


import pandas as pd
from political_utils import clustering as cl
import warnings
warnings.filterwarnings("ignore")

In [2]:
# load a data frame with required columns
df = pd.read_pickle('resources/data/kav_clean_072119.pkl')

In [3]:
df = df[df['article']!='']  # dirty fix

In [4]:
# add number of words column
df['word_count'] = df['article'].apply(lambda x: len(x.split(' ')))

In [5]:
len(df)

4306

In [23]:
cues = [
    'senate judiciary committee',
    'the whitehouse',
    'spokesperson',
    'spokesman',
    'spokeswoman',
    'republican senators',
    'chuck grassley',
    'grassley',
    'thom tillis',
    'tillis',
    'mike crapo',
    'crapo',
    'jeff flake',
    'flake',
    'ben sasse',
    'sasse',
    'ted cruz',
    'cruz',
    'mike lee',
    'lee',
    'john cornyn',
    'cornyn',
    'lindsey graham',
    'graham',
    'orrin hatch',
    'john neely kennedy',
    'kennedy',
    'democratic senators',
    'dianne feinstein',
    'feinstein',
    'kamala harris',
    'harris',
    'mazie hirono',
    'hirono'
    'richard blumenthal',
    'blumenthal',
    'chris coons',
    'coons',
    'cory booker',
    'booker',
    'amy klobuchar',
    'klobuchar',
    'sheldon whitehouse',
    'dick durbin',
    'durbin',
    'patrick leahy',
    'republican officials',
    'donald trump',
    'trump',
    'kellyanne conway',
    'conway'
]

In [31]:
def political_cue_stats(cues, article):
    text = article.lower()
    stats = {}
    total = 0
    for c in cues:
        count = text.count(c)
        stats[c] = count
        total += count
    stats['total'] = count
    return stats

In [32]:
df['article'].iloc[0].count('Dianne Feinstein')

1

In [33]:
df['cue_stats'] = df['article'].apply(lambda x: political_cue_stats(cues, x))

In [34]:
df['total'] = df['cue_stats'].apply(lambda x: x['total'])

In [36]:
cue_df = df[df['total']>0]

In [37]:
cue_df

Unnamed: 0,publish_date,url,title,authors,media_site,article,issue,to_drop,word_count,cue_stats,total
133,9/17/18,http://feedproxy.google.com/~r/breitbart/~3/1k...,Conway: Kavanaugh Accuser Shouldn't Be 'Insult...,[Ian Hanchett],Breitbart,On Monday’s broadcast of the Fox News Channel’...,False,False,243,"{'senate judiciary committee': 0, 'the whiteho...",3
215,9/17/18,http://feedproxy.google.com/~r/breitbart/~3/K5...,Brett Kavanaugh Ready to Refute 'False Allegat...,[Charlie Spiering],Breitbart,Supreme Court Justice nominee Brett Kavanaugh ...,False,False,227,"{'senate judiciary committee': 2, 'the whiteho...",1
311,9/17/18,http://feedproxy.google.com/~r/breitbart/~3/Z1...,Kellyanne Conway: Brett Kavanaugh Accuser Shou...,[Charlie Spiering],Breitbart,The White House responded Monday to a woman wh...,False,False,212,"{'senate judiciary committee': 1, 'the whiteho...",3
358,10/1/18 6:28,https://www.cnsnews.com/news/article/susan-jon...,Now Democrats Find Fault With Scope of FBI Inv...,[Susan Jones],CNS News,Christine Blasey Ford is greeted by Sen. Mazie...,False,False,1201,"{'senate judiciary committee': 2, 'the whiteho...",7
374,10/5/18 4:30,https://www.cnsnews.com/commentary/patrick-j-b...,We Are All Deplorables Now,[Patrick J. Buchanan],CNS News,President Trump defends his Supreme Court nomi...,False,False,845,"{'senate judiciary committee': 0, 'the whiteho...",1
385,9/24/18 14:44,https://www.cnsnews.com/blog/emily-ward/new-ka...,"New Kavanaugh Accusations: Not One Eyewitness,...",[Emily Ward],CNS News,Supreme Court nominee Brett Kavanaugh. (YouTub...,False,False,721,"{'senate judiciary committee': 1, 'the whiteho...",1
468,9/24/18 17:27,https://www.conservativereview.com/news/cnns-r...,CNNäó»s resident äóÖsleaze balläó» Brian Stelt...,[Rob Eno],conservativereview.com,CNN’s resident ‘sleaze ball’ Brian Stelter act...,False,False,382,"{'senate judiciary committee': 0, 'the whiteho...",1
478,9/27/18 17:26,https://www.conservativereview.com/news/lindse...,Lindsey Graham just made jaws drop at the Kava...,[Chris Pandolfo],conservativereview.com,Lindsey Graham just made jaws drop at the Kava...,False,False,294,"{'senate judiciary committee': 1, 'the whiteho...",1
685,10/5/18 12:14,https://www.dailywire.com/news/36775/kellyanne...,Kellyanne Conway: Longest Confirmation Process...,[],Daily Wire,The fight for Brett Kavanaugh has been the lon...,False,False,260,"{'senate judiciary committee': 0, 'the whiteho...",4
726,9/17/18 12:45,https://www.dailywire.com/news/35942/watch-kel...,WATCH: Kellyanne Conway Says Kavanaugh Accuser...,[],Daily Wire,With Supreme Court nominee Brett Kavanaugh bei...,False,False,280,"{'senate judiciary committee': 2, 'the whiteho...",4


# NER

In [10]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()


In [14]:
# Example
doc = nlp(df['article'][0])
print([(X.text, X.label_) for X in doc.ents])


[('Democrats', 'NORP'), ('Democrats', 'NORP'), ('Trump', 'ORG'), ('the Carter Page FISA Court', 'ORG'), ('Dianne Feinstein', 'PERSON'), ('the Washington Post', 'ORG'), ('Christine Blasey Ford', 'PERSON'), ('Supreme Court', 'ORG'), ('Brett Kavanaugh', 'PERSON'), ('Georgetown Prep', 'GPE'), ('Mark Judge', 'PERSON'), ('1982', 'DATE'), ('15 years old', 'DATE'), ('Kavanaugh', 'ORG'), ('2012', 'DATE'), ('Kavanaugh', 'PERSON'), ('four', 'CARDINAL'), ('Feinstein', 'PERSON'), ('only two', 'CARDINAL'), ('Blasey Ford', 'PERSON'), ('six years', 'DATE'), ('1982', 'DATE'), ('2012', 'DATE'), ('2012', 'DATE'), ('Washington Post', 'ORG'), ('Kavanaugh', 'PERSON'), ('Kavanaugh', 'GPE'), ('the Supreme Court', 'ORG'), ('Kavanaugh', 'ORG'), ('Supreme Court', 'ORG'), ('Feinstein', 'PERSON'), ('one', 'CARDINAL'), ('four', 'CARDINAL'), ('the year', 'DATE'), ('Patrick J. Smyth', 'PERSON'), ('Christine Blasey Ford', 'PERSON'), ('Washington', 'GPE'), ('Smyth', 'PRODUCT'), ('the Judiciary Committee', 'ORG'), ('tod

In [17]:
def ner_person(article):
    doc = nlp(article)
    os = [(X.text, X.label_) for X in doc.ents]
    results = {}
    for o in os:
        if o[1]=='PERSON':
            if o[0] in results:
                results[o[0]]+=1
            else:
                results[o[0]]=1
    return results

In [18]:
ner_person(df['article'][0])

{'Dianne Feinstein': 1,
 'Christine Blasey Ford': 2,
 'Brett Kavanaugh': 3,
 'Mark Judge': 1,
 'Kavanaugh': 3,
 'Feinstein': 3,
 'Blasey Ford': 1,
 'Patrick J. Smyth': 1,
 'Conrad Black': 1,
 'Andrew C. McCarthy': 1,
 'Kavanagh': 1,
 'McCarthy': 1}

In [19]:
df['people'] = df['article'].apply(lambda x: ner_person(x))

In [20]:
df.to_csv('resources/data/people_in_articles.csv', index=False)

In [21]:
df.head()

Unnamed: 0,publish_date,url,title,authors,media_site,article,issue,to_drop,word_count,people
0,9/20/18,https://www.americanthinker.com/articles/2018/...,Blasey Ford Must Be Acknowledged and Then Dism...,[],americanthinker.com,She released only selected portions of her the...,False,False,1266,"{'Dianne Feinstein': 1, 'Christine Blasey Ford..."
1,9/21/18,https://www.americanthinker.com/articles/2018/...,Dear Juanita Broaddrick,[],americanthinker.com,They want an FBI investigation of an individua...,False,False,969,"{'Brett Kavanaugh': 1, 'Christine Blasey Ford'..."
2,9/25/18,https://www.americanthinker.com/articles/2018/...,Debra Katz Was Wrong about Paula Jones's Case,[],americanthinker.com,Debra Katz Was Wrong about Paula Jones's Case\...,False,False,1444,"{'Debra Katz': 2, 'Paula Jones's': 1, 'Christi..."
3,9/28/18,https://www.americanthinker.com/articles/2018/...,Fake Rape Victims Are More Fun than Real Ones,[],americanthinker.com,Fake Rape Victims Are More Fun than Real Ones\...,False,False,1151,"{'Colin Kaepernik': 1, 'Christine Blasey Ford'..."
4,9/20/18,https://www.americanthinker.com/articles/2018/...,Ford vs. Kavanaugh: There's Nothing to Investi...,[],americanthinker.com,The problem with that is that there is nothing...,False,False,934,"{'Christine Blasey Ford's': 3, 'Christine Blas..."


In [22]:
# set of all unique names
holder = set()

for d in list(df['people']):
    for k,v in d.items():
        holder.add(k)

print(holder)



In [23]:
len(holder)

7862