In [3]:
import pandas as pd
import pickle
import sqlite3
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, TruncatedSVD

In [4]:
# Load data

articles1 = pd.read_csv("articles1.csv")
articles2 = pd.read_csv("articles2.csv")
articles3 = pd.read_csv("articles3.csv")
articles = pd.concat([articles1, articles2, articles3])

political_score_dict = {"Atlantic": -1,
                        "Breitbart": 1,
                        "Business Insider": -1,
                        "Buzzfeed News": -1,
                        "CNN": -1,
                        "Fox News": 1,
                        "Guardian": -1,
                        "NPR": -1,
                        "National Review": 1,
                        "New York Post": 1,
                        "New York Times": -1,
                        "Reuters": -1,
                        "Talking Points Memo": -1,
                        "Vox": -1,
                        "Washington Post": -1}

articles['score'] = articles['publication'].apply(lambda x: political_score_dict[x])
articles.head()

# This helps to cut down the volume of data I'm working with
articles_train, articles_test = train_test_split(articles, test_size=0.5)

uci = pd.read_csv("uci-news-aggregator.csv")
uci.columns = [k.lower() for k in uci.columns]

In [None]:
for pub in set(articles.publication):
    sub_df = articles[articles.publication == pub]
    print(pub)
    print(len(sub_df))
    

In [84]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [112]:
articles_train.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,score
28337,81796,123240,Model sues ex-sugar daddy for kicking her out ...,New York Post,Julia Marsh and Laura Italiano,2017-01-13,2017.0,1.0,https://web.archive.org/web/20170115000208/htt...,A foxy Russian model is suing a married deve...,1
14897,68305,96125,Harry Reid To Republicans: You Better Not Bloc...,Talking Points Memo,,2016-02-13,2016.0,2.0,https://web.archive.org/web/20160214013155/htt...,Senate Minority Leader Harry Reid ( ) released...,-1
39140,142602,213716,How Monday’s train attack by a 17-year-old Afg...,Washington Post,Rick Noack,2016-07-19,2016.0,7.0,https://web.archive.org/web/20160720000114/htt...,Germany has so far been spared a Isla...,-1
38038,91497,136413,"She took down a mugger with MMA, then audition...",New York Post,"Carl Stroud, The Sun",2016-06-02,2016.0,6.0,http://nypost.com/2016/06/02/she-took-down-a-m...,A beauty queen in high heels fought off a mugg...,1
3973,3973,21691,N.C.A.A. Moves Championship Events From North ...,New York Times,Marc Tracy and Alan Blinder,2017-03-08,2017.0,3.0,,The N. C. A. A. responding to a contentious No...,-1


##  Do the topics seem different / are they discussing different things?

###  Do topic modeling for each publication segment, compare.

In [14]:
def get_pub_dict():
    pub_dict = {}

    for pub in set(articles_train.publication):
        pub_df = articles_train[articles_train.publication == pub]
        pub_data = zip(pub_df.date, pub_df.title, pub_df.content)
        pub_dict[pub] = pub_data
        
    return pub_dict    


def get_topics(model, feature_names, no_top_words):
    topics = []
    for _, topic in enumerate(model.components_):
        topics.append([(feature_names[i], np.round(lsa_cv.components_[_][i], 3)) for i in topic.argsort()[:-no_top_words-1:-1]])
    return topics

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   min_df = 0.05,
                                   max_df = 0.6)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   min_df = 0.05,
                                   max_df = 0.6)

In [15]:
pub_dict

{'Business Insider': <zip at 0x1a0dd4cf48>,
 'CNN': <zip at 0x1a0dd559c8>,
 'Fox News': <zip at 0x1a0dd55488>,
 'Vox': <zip at 0x1a0dd55088>,
 'Talking Points Memo': <zip at 0x1a0dd553c8>,
 'Buzzfeed News': <zip at 0x1a0dd55a88>,
 'New York Times': <zip at 0x1a0dd55f88>,
 'National Review': <zip at 0x1a0dd4ca48>,
 'Guardian': <zip at 0x1a0dd61408>,
 'Atlantic': <zip at 0x1a0dd61208>,
 'NPR': <zip at 0x1a0dd614c8>,
 'Breitbart': <zip at 0x1a0dd55f48>,
 'Washington Post': <zip at 0x1a0dd66dc8>,
 'New York Post': <zip at 0x1a0dd4c088>,
 'Reuters': <zip at 0x1a0dd66fc8>}

In [72]:
n_comp = 10
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

In [73]:
num_top_words = 10
pub_dict = get_pub_dict()
publications = pub_dict.keys()

topic_pub_dict = {}

for pub in publications:
    pub_articles = [k[2] for k in pub_dict[pub]]
    
    cv_articles_data = count_vectorizer.fit_transform(pub_articles)
    tfidf_articles_data = tfidf_vectorizer.fit_transform(pub_articles)
    
    lsa_tfidf_articles_data = lsa_tfidf.fit_transform(tfidf_articles_data)
    lsa_cv_articles_data = lsa_cv.fit_transform(cv_articles_data)
    nmf_cv_articles_data = nmf_cv.fit_transform(cv_articles_data)
    
    pub_lsa_tfidf = get_topics(lsa_tfidf, tfidf_vectorizer.get_feature_names(), num_top_words)
    pub_lsa_cv    = get_topics(lsa_cv,    count_vectorizer.get_feature_names(), num_top_words)
    pub_nmf_cv    = get_topics(nmf_cv,    count_vectorizer.get_feature_names(), num_top_words)
    
    topic_pub_dict[pub] = [pub_lsa_tfidf, pub_lsa_cv, pub_nmf_cv]
    
    
pub_df = pd.DataFrame.from_dict(topic_pub_dict, orient="index")   
pub_df.columns = ["lsa_tfidf", "lsa_cv", "nmf_cv"]

In [74]:
pub_df

Unnamed: 0,lsa_tfidf,lsa_cv,nmf_cv
New York Times,"[[(trump, 0.586), (mr trump, 0.366), (ms, 0.10...","[[(trump, 0.586), (mr trump, 0.366), (presiden...","[[(trump, 0.586), (mr trump, 0.366), (presiden..."
New York Post,"[[(trump, 0.193), (new, 0.25), (says, 0.185), ...","[[(new, 0.25), (like, 0.236), (just, 0.229), (...","[[(game, 0.09), (season, 0.081), (team, 0.082)..."
National Review,"[[(trump, 0.6), (clinton, 0.171), (obama, 0.14...","[[(trump, 0.6), (clinton, 0.171), (president, ...","[[(trump, 0.6), (donald, 0.088), (donald trump..."
Reuters,"[[(trump, 0.422), (percent, 0.217), (billion, ...","[[(trump, 0.422), (percent, 0.217), (president...","[[(trump, 0.422), (president, 0.188), (campaig..."
NPR,"[[(trump, 0.361), (said, 0.322), (president, 0...","[[(trump, 0.361), (said, 0.322), (president, 0...","[[(think, 0.144), (know, 0.126), (going, 0.135..."
Business Insider,"[[(trump, 0.414), (said, 0.393), (president, 0...","[[(trump, 0.414), (said, 0.393), (people, 0.26...","[[(people, 0.265), (think, 0.17), (like, 0.213..."
Talking Points Memo,"[[(trump, 0.692), (president, 0.206), (house, ...","[[(trump, 0.692), (president, 0.206), (people,...","[[(trump, 0.692), (donald, 0.087), (donald tru..."
CNN,"[[(trump, 0.609), (president, 0.186), (people,...","[[(trump, 0.609), (people, 0.197), (president,...","[[(trump, 0.609), (campaign, 0.11), (donald, 0..."
Buzzfeed News,"[[(trump, 0.212), (like, 0.253), (buzzfeed, 0....","[[(like, 0.253), (trump, 0.212), (just, 0.196)...","[[(like, 0.253), (just, 0.196), (says, 0.093),..."
Guardian,"[[(trump, 0.415), (like, 0.192), (new, 0.176),...","[[(trump, 0.415), (like, 0.192), (just, 0.179)...","[[(like, 0.192), (just, 0.179), (don, 0.114), ..."


In [120]:
def get_all_lsa_cv(topic_pub_dict):
    def flatten(list_of_lists):
        return [k for sublist in list_of_lists for k in sublist]
    
    all_lsa_cv = []
    for key in topic_pub_dict.keys():
        pub_topics = topic_pub_dict[key][1]
        pub_topics = flatten([k for k in pub_topics])
        topics = [k[0] for k in pub_topics]
        all_lsa_cv += topics
        
    return all_lsa_cv


all_lsa_cv = get_all_lsa_cv(topic_pub_dict)

def get_lsa_cv_word_freq(all_lsa_cv):
    word_freq = []
    for s in set(articles.score):
        sub_articles_train = articles_train[articles_train.score == s]
        all_content = " ".join(sub_articles_train.content)
        for term in all_lsa_cv:
            word_freq.append((term, s, all_content.count(term)))
    return word_freq


lsa_cv_word_freq = get_lsa_cv_word_freq(all_lsa_cv)

In [121]:
import pickle
with open("lsa_cv_word_freq.pickle", "wb") as f:
    pickle.dump(lsa_cv_word_freq, f)

In [101]:
articles_train.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
21839,75248,115206,Round Two for the Supreme Court?,National Review,John Fund,2017-02-03,2017.0,2.0,http://www.nationalreview.com/article/444571/d...,Donald Trump is known for his bluster and brag...
19703,123165,183516,Japan’s Universal accuses founder Okada of imp...,Reuters,Nathan Layne,2017-06-09,2017.0,6.0,http://www.reuters.com/article/us-universal-en...,In a statement on Universal’s website issued u...
15632,119094,175150,"In New York, Activists Prepare Bystanders To T...",NPR,Hansi Lo Wang,2016-12-22,2016.0,12.0,http://www.npr.org/2016/12/22/506583208/in-new...,If you were to witness a attack or a hate cr...
35427,138889,208982,FBI tries to figure out what San Bernardino at...,Washington Post,Mark Berman,2016-01-05,2016.0,1.0,https://web.archive.org/web/20160106000201/htt...,The SUV involved in the police shootout w...
34439,137901,207732,The definitive book about the Trump administra...,Washington Post,Richard Cohen,2017-05-22,2017.0,5.0,https://web.archive.org/web/20170523000535/htt...,"Back in 1951, Herman Wouk published the defin..."


###  Could / should I also compare with topic modeling for entire corpus?

###  Can I make a bubble plot for each publication?

In [9]:
def flatten(list_of_lists):
    return [k for sublist in list_of_lists for k in sublist]


def get_distinct_topics(model):
    distinct_topics = {}
    pub_topics = list(pub_df[model].items())
    n = len(pub_topics)
    for k in range(n):
        publication = pub_topics[k][0]
        flat_topics = flatten(pub_topics[k][1])
        unique_flat_topics = set(flat_topics)
        distinct_topics[publication] = unique_flat_topics
    return distinct_topics    

In [60]:
get_distinct_topics('lsa_tfidf')

##  Are average sentence length or average article length indicative of political opinion?

###  Calculate average sentence count / article
###  Calculate average words / article

In [37]:
def get_num_sentences(article):
    stoppers = [".", "!", "?"]
    num_sentences = 0
    for k in stoppers:
        num_sentences += article.count(k)
    return num_sentences
        
        
def get_num_words(article):
    return len(article.split(" "))


def get_avg_word_length(article):
    fillers = list(".,!?:'-()/")
    for k in fillers:
        article = article.replace(k, "")
    article = article.replace('"', '')    
    all_words = article.split(" ")
    avg_word_length = np.average([len(k) for k in all_words])
    avg_word_length = np.round(avg_word_length, 1)
    return avg_word_length
    
    
def get_adjective_count(article):
    data = nltk.word_tokenize(article)
    categories = nltk.pos_tag(data)
    return len([k[1] for k in example if "JJ" in k])

In [40]:
avg_words_sentences_dict = {}
pub_dict = get_pub_dict()

for pub in publications:
    pub_articles = [k[2] for k in pub_dict[pub]]
    
    avg_sentences = np.average([get_num_sentences(k) for k in pub_articles])
    avg_sentences = np.round(avg_sentences, 1)
    
    avg_words = np.average([get_num_words(k) for k in pub_articles])
    avg_words = np.round(avg_words, 1)
    
    avg_word_length = np.average([get_avg_word_length(k) for k in pub_articles])
    avg_word_length = np.round(avg_word_length, 1)
    
    avg_words_sentences_dict[pub] = (avg_sentences, avg_words, avg_word_length)
    
    
    
    

In [41]:
avg_words_sentences_dict

{'Reuters': (37.0, 694.1, 4.7),
 'Guardian': (45.1, 936.9, 4.6),
 'Vox': (65.5, 1445.3, 4.4),
 'Washington Post': (59.5, 1081.1, 4.5),
 'Fox News': (30.4, 539.4, 4.6),
 'New York Times': (76.0, 1192.9, 4.6),
 'CNN': (39.5, 743.9, 4.6),
 'NPR': (46.8, 799.5, 4.6),
 'National Review': (50.2, 977.2, 4.7),
 'New York Post': (25.0, 464.0, 4.4),
 'Buzzfeed News': (44.7, 917.5, 4.6),
 'Atlantic': (69.2, 1370.8, 4.6),
 'Talking Points Memo': (21.0, 377.4, 4.7),
 'Breitbart': (27.2, 525.9, 4.7),
 'Business Insider': (23.8, 533.4, 3.9)}

In [31]:
list(k)

[]

In [9]:
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,score
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,-1
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...",-1
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...",-1
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...",-1
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...",-1


### Sentiment

In [7]:
from textblob import TextBlob

In [8]:
pub_dict = get_pub_dict()
sentiment_polarity_dict = {}

for pub in pub_dict:
    sub_articles = articles_train[articles_train.publication == pub]
    sub_content = list(sub_articles.content)
    sub_sentiment = [TextBlob(k).sentiment for k in sub_content]
    avg_polarity = np.average([k[0] for k in sub_sentiment])
    avg_polarity = np.round(avg_polarity, 3)
    avg_subjectivity = np.average([k[1] for k in sub_sentiment])
    avg_subjectivity = np.round(avg_sentiment, 3)
    subjectivity_polarity_dict[pub] = (avg_polarity, avg_subjectivity)

In [9]:
title_subjectivity_polarity_dict = {}
for pub in pub_dict:
    pub_entries = list(pub_dict[pub])
    pub_titles = [k[1] for k in pub_entries]

    title_subjectivitysentiment = [TextBlob(k).sentiment for k in pub_titles if type(k) == str]
    avg_polarity = np.average([k[0] for k in title_sentiment])
    avg_polarity = np.round(avg_polarity, 3)
    avg_subjectivity = np.average([k[1] for k in title_sentiment])
    avg_subjectivity = np.round(avg_sentiment, 3)
    title_subjectivity_polarity_dict[pub] = (avg_polarity, avg_sentiment)

In [19]:
uci_title_subjectivity_polarity = []

uci_titles = list(uci.title)
for title in uci_titles:
    uci_title_subjectivity_polarity.append(TextBlob(title).sentiment)


subjectivity = [k[0] for k in uci_title_sentiment_polarity]
polarity = [k[1] for k in uci_title_sentiment_polarity]

avg_uci_subjectivity = np.average(sentiment)
avg_uci_polarity = np.average(polarity)

In [24]:
title_sentiment_polarity_dict

{'Breitbart': (0.006, 0.233),
 'CNN': (0.019, 0.203),
 'Atlantic': (0.023, 0.188),
 'Vox': (0.05, 0.322),
 'Buzzfeed News': (0.026, 0.251),
 'National Review': (0.016, 0.193),
 'Reuters': (0.022, 0.196),
 'Talking Points Memo': (0.03, 0.23),
 'NPR': (0.048, 0.254),
 'New York Post': (0.02, 0.272),
 'Washington Post': (0.022, 0.284),
 'Guardian': (0.033, 0.261),
 'New York Times': (0.111, 0.443),
 'Fox News': (0.02, 0.22),
 'Business Insider': (0.067, 0.344)}

In [22]:
avg_uci_sentiment

0.04286047509439413

In [23]:
avg_uci_polarity

0.2252681737036944

In [13]:
sentiment_polarity_dict

{'Business Insider': (0.086, 0.418),
 'CNN': (0.077, 0.414),
 'Fox News': (0.077, 0.421),
 'Vox': (0.095, 0.452),
 'Talking Points Memo': (0.074, 0.413),
 'Buzzfeed News': (0.074, 0.42),
 'New York Times': (0.08, 0.414),
 'National Review': (0.082, 0.446),
 'Guardian': (0.079, 0.43),
 'Atlantic': (0.092, 0.427),
 'NPR': (0.095, 0.43),
 'Breitbart': (0.066, 0.419),
 'Washington Post': (0.079, 0.426),
 'New York Post': (0.078, 0.429),
 'Reuters': (0.06, 0.382)}

In [15]:
import spacy
import random


def get_adj_percentage(text):
    doc = nlp(text)
    adj_subset = [k for k in doc if k.pos_ == "ADJ"]
    num_adjs = len(adj_subset)
    num_tokens = len(doc)
    perc = (num_adjs / num_tokens)
    return np.round(perc, 3) * 100


nlp = spacy.load("en_core_web_sm")
pub_dict = get_pub_dict()
adj_pub_count = {}

for pub in pub_dict:
    adj_count = 0
    pub_articles = articles[articles.publication == pub]
    pub_articles = list(pub_articles.content)
    random.shuffle(pub_articles)
    #  Picking a subset should be fine
    sub_pub_articles = pub_articles[:100]
    avg_adjs = np.average([get_adj_percentage(k) for k in sub_pub_articles])
    adj_pub_count[pub] = avg_adjs


In [16]:
adj_pub_count

{'NPR': 7.655999999999999,
 'National Review': 9.21,
 'Washington Post': 7.899,
 'Talking Points Memo': 6.824,
 'Vox': 8.654,
 'Business Insider': 6.657,
 'CNN': 7.265,
 'Buzzfeed News': 7.134999999999999,
 'Fox News': 7.206,
 'New York Times': 7.837999999999999,
 'Reuters': 7.735,
 'New York Post': 7.404,
 'Breitbart': 7.086,
 'Guardian': 8.034999999999998,
 'Atlantic': 8.405999999999999}

In [10]:
import numpy as np