In [1]:
import pandas as pd
import pickle
import sqlite3
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, TruncatedSVD

In [2]:
# Load data

articles1 = pd.read_csv("articles1.csv")
articles2 = pd.read_csv("articles2.csv")
articles3 = pd.read_csv("articles3.csv")
articles = pd.concat([articles1, articles2, articles3])

# This helps to cut down the volume of data I'm working with
articles_train, articles_test = train_test_split(articles, test_size=0.5)

uci = pd.read_csv("uci-news-aggregator.csv")
uci.columns = [k.lower() for k in uci.columns]

##  Question 1: Do topic modeling on articles - do the topics seem different / are they discussing different things?

###  Do topic modeling for each publication segment, compare.

In [57]:
def get_pub_dict():
    pub_dict = {}

    for pub in set(articles_train.publication):
        pub_df = articles_train[articles_train.publication == pub]
        pub_data = zip(pub_df.date, pub_df.title, pub_df.content)
        pub_dict[pub] = pub_data
        
    return pub_dict    


def get_topics(model, feature_names, no_top_words):
    topics = []
    for _, topic in enumerate(model.components_):
        topics.append([(feature_names[i], np.round(lsa_cv.components_[_][i], 3)) for i in topic.argsort()[:-no_top_words-1:-1]])
    return topics

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   min_df = 0.05,
                                   max_df = 0.6)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   min_df = 0.05,
                                   max_df = 0.6)

In [72]:
n_comp = 10
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

In [73]:
num_top_words = 10
pub_dict = get_pub_dict()
publications = pub_dict.keys()

topic_pub_dict = {}

for pub in publications:
    pub_articles = [k[2] for k in pub_dict[pub]]
    
    cv_articles_data = count_vectorizer.fit_transform(pub_articles)
    tfidf_articles_data = tfidf_vectorizer.fit_transform(pub_articles)
    
    lsa_tfidf_articles_data = lsa_tfidf.fit_transform(tfidf_articles_data)
    lsa_cv_articles_data = lsa_cv.fit_transform(cv_articles_data)
    nmf_cv_articles_data = nmf_cv.fit_transform(cv_articles_data)
    
    pub_lsa_tfidf = get_topics(lsa_tfidf, tfidf_vectorizer.get_feature_names(), num_top_words)
    pub_lsa_cv    = get_topics(lsa_cv,    count_vectorizer.get_feature_names(), num_top_words)
    pub_nmf_cv    = get_topics(nmf_cv,    count_vectorizer.get_feature_names(), num_top_words)
    
    topic_pub_dict[pub] = [pub_lsa_tfidf, pub_lsa_cv, pub_nmf_cv]
    
    
pub_df = pd.DataFrame.from_dict(topic_pub_dict, orient="index")   
pub_df.columns = ["lsa_tfidf", "lsa_cv", "nmf_cv"]

In [74]:
pub_df

Unnamed: 0,lsa_tfidf,lsa_cv,nmf_cv
New York Times,"[[(trump, 0.586), (mr trump, 0.366), (ms, 0.10...","[[(trump, 0.586), (mr trump, 0.366), (presiden...","[[(trump, 0.586), (mr trump, 0.366), (presiden..."
New York Post,"[[(trump, 0.193), (new, 0.25), (says, 0.185), ...","[[(new, 0.25), (like, 0.236), (just, 0.229), (...","[[(game, 0.09), (season, 0.081), (team, 0.082)..."
National Review,"[[(trump, 0.6), (clinton, 0.171), (obama, 0.14...","[[(trump, 0.6), (clinton, 0.171), (president, ...","[[(trump, 0.6), (donald, 0.088), (donald trump..."
Reuters,"[[(trump, 0.422), (percent, 0.217), (billion, ...","[[(trump, 0.422), (percent, 0.217), (president...","[[(trump, 0.422), (president, 0.188), (campaig..."
NPR,"[[(trump, 0.361), (said, 0.322), (president, 0...","[[(trump, 0.361), (said, 0.322), (president, 0...","[[(think, 0.144), (know, 0.126), (going, 0.135..."
Business Insider,"[[(trump, 0.414), (said, 0.393), (president, 0...","[[(trump, 0.414), (said, 0.393), (people, 0.26...","[[(people, 0.265), (think, 0.17), (like, 0.213..."
Talking Points Memo,"[[(trump, 0.692), (president, 0.206), (house, ...","[[(trump, 0.692), (president, 0.206), (people,...","[[(trump, 0.692), (donald, 0.087), (donald tru..."
CNN,"[[(trump, 0.609), (president, 0.186), (people,...","[[(trump, 0.609), (people, 0.197), (president,...","[[(trump, 0.609), (campaign, 0.11), (donald, 0..."
Buzzfeed News,"[[(trump, 0.212), (like, 0.253), (buzzfeed, 0....","[[(like, 0.253), (trump, 0.212), (just, 0.196)...","[[(like, 0.253), (just, 0.196), (says, 0.093),..."
Guardian,"[[(trump, 0.415), (like, 0.192), (new, 0.176),...","[[(trump, 0.415), (like, 0.192), (just, 0.179)...","[[(like, 0.192), (just, 0.179), (don, 0.114), ..."


In [75]:
topic_pub_dict["Atlantic"][1]

[[('trump', 0.838),
  ('president', 0.244),
  ('business', 0.088),
  ('organization', 0.087),
  ('election', 0.079),
  ('clinton', 0.068),
  ('donald', 0.068),
  ('campaign', 0.067),
  ('company', 0.062),
  ('donald trump', 0.06)],
 [('think', 0.167),
  ('way', 0.152),
  ('don', 0.151),
  ('clinton', 0.136),
  ('know', 0.133),
  ('reader', 0.126),
  ('did', 0.123),
  ('work', 0.119),
  ('ve', 0.117),
  ('make', 0.113)],
 [('clinton', 0.346),
  ('trump', 0.193),
  ('republican', 0.192),
  ('hillary', 0.174),
  ('donald', 0.168),
  ('donald trump', 0.165),
  ('vote', 0.16),
  ('hillary clinton', 0.15),
  ('nominee', 0.147),
  ('gop', 0.097)],
 [('reader', 0.231),
  ('think', 0.141),
  ('know', 0.098),
  ('trump', 0.098),
  ('really', 0.088),
  ('way', 0.087),
  ('don', 0.081),
  ('ve', 0.08),
  ('song', 0.073),
  ('things', 0.071)],
 [('obama', 0.232),
  ('states', 0.183),
  ('president', 0.18),
  ('clinton', 0.166),
  ('war', 0.164),
  ('united', 0.156),
  ('american', 0.154),
  ('world

###  Could / should I also compare with topic modeling for entire corpus?

###  Can I make a bubble plot for each publication?

In [9]:
def flatten(list_of_lists):
    return [k for sublist in list_of_lists for k in sublist]


def get_distinct_topics(model):
    distinct_topics = {}
    pub_topics = list(pub_df[model].items())
    n = len(pub_topics)
    for k in range(n):
        publication = pub_topics[k][0]
        flat_topics = flatten(pub_topics[k][1])
        unique_flat_topics = set(flat_topics)
        distinct_topics[publication] = unique_flat_topics
    return distinct_topics    

In [60]:
get_distinct_topics('lsa_tfidf')

##  Question 2: Are average sentence length or average article length indicative of political opinion?

###  Calculate average sentence count / article
###  Calculate average words / article

In [37]:
def get_num_sentences(article):
    stoppers = [".", "!", "?"]
    num_sentences = 0
    for k in stoppers:
        num_sentences += article.count(k)
    return num_sentences
        
        
def get_num_words(article):
    return len(article.split(" "))


def get_avg_word_length(article):
    fillers = list(".,!?:'-()/")
    for k in fillers:
        article = article.replace(k, "")
    article = article.replace('"', '')    
    all_words = article.split(" ")
    avg_word_length = np.average([len(k) for k in all_words])
    avg_word_length = np.round(avg_word_length, 1)
    return avg_word_length
    
    
def get_adjective_count(article):
    data = nltk.word_tokenize(article)
    categories = nltk.pos_tag(data)
    return len([k[1] for k in example if "JJ" in k])

In [40]:
avg_words_sentences_dict = {}
pub_dict = get_pub_dict()

for pub in publications:
    pub_articles = [k[2] for k in pub_dict[pub]]
    
    avg_sentences = np.average([get_num_sentences(k) for k in pub_articles])
    avg_sentences = np.round(avg_sentences, 1)
    
    avg_words = np.average([get_num_words(k) for k in pub_articles])
    avg_words = np.round(avg_words, 1)
    
    avg_word_length = np.average([get_avg_word_length(k) for k in pub_articles])
    avg_word_length = np.round(avg_word_length, 1)
    
    avg_words_sentences_dict[pub] = (avg_sentences, avg_words, avg_word_length)
    
    
    
    

In [41]:
avg_words_sentences_dict

{'Reuters': (37.0, 694.1, 4.7),
 'Guardian': (45.1, 936.9, 4.6),
 'Vox': (65.5, 1445.3, 4.4),
 'Washington Post': (59.5, 1081.1, 4.5),
 'Fox News': (30.4, 539.4, 4.6),
 'New York Times': (76.0, 1192.9, 4.6),
 'CNN': (39.5, 743.9, 4.6),
 'NPR': (46.8, 799.5, 4.6),
 'National Review': (50.2, 977.2, 4.7),
 'New York Post': (25.0, 464.0, 4.4),
 'Buzzfeed News': (44.7, 917.5, 4.6),
 'Atlantic': (69.2, 1370.8, 4.6),
 'Talking Points Memo': (21.0, 377.4, 4.7),
 'Breitbart': (27.2, 525.9, 4.7),
 'Business Insider': (23.8, 533.4, 3.9)}

In [31]:
list(k)

[]

### Calculate average adjectives / article

###   Cluster!
