# Analyzing Bias In News

Data Source: https://www.kaggle.com/snapcrack/all-the-news

Leaning Classification: https://www.adfontesmedia.com/¶

## Import Packages

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
files = ['/Users/carolynwang/Desktop/PathMentors/Mayuka/Newspapers/articles1.csv', 
        '/Users/carolynwang/Desktop/PathMentors/Mayuka/Newspapers/articles2.csv',
        '/Users/carolynwang/Desktop/PathMentors/Mayuka/Newspapers/articles3.csv']
df = pd.concat([pd.read_csv(f) for f in files], ignore_index = True) 

In [2]:
df.head(3)

NameError: name 'df' is not defined

## Define Functions

Stopword Filter

In [4]:
#takes in text, returns text with stopwords filtered out
def stopword_filter(content):
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    filtered_sentence = []
    if isinstance(content, list):
        for w in content:
            if w not in stop_words:
                filtered_sentence.append(w)
    return filtered_sentence

Word Finder

In [5]:
#takes in text, character needed, and boolean (ignore casing or not)
#returns number of characters found in text 
def regex_finder(content, character, ignore_case):
    if (ignore_case):
        return len(re.findall(character, content, re.I))
    else:
        return len(re.findall(character, content))

POS Proportion

In [6]:
#takes in pos
#takes in text that has been POS tagged
#returns frequency of appearance of that particular pos (ex: 0.09 for adjectives)
def pos_proportion(content, pos):
    pos_list = []
    total = [] #all the words
    if isinstance(content, list):
        for a,b in content:
            if b == pos:
                pos_list.append(a)
            total.append(a)
    return_num = len(pos_list) / len(total) if len(total) != 0 else 0
    return return_num

POS Words

In [7]:
#takes in text that has been POS tagged
#returns list of word with a particular pos (such as all adjectives)
def pos_list(content, pos):
    pos_list = []
    total = [] #all the words
    if isinstance(content, list):
        for a,b in content:
            if b == pos:
                pos_list.append(a)
            total.append(a)
    return pos_list

In [8]:
#dict classifies news outlets as left or right
#left = 1, right = 0
my_dict = {'Atlantic': 1,
           'Breitbart': 0,
           'Business Insider': 1,
           'Buzzfeed News': 1,
           'CNN': 1,
           'Fox News': 0,
           'Guardian': 1,
           'NPR': 1,
           'National Review': 0,
           'New York Post': 0,
           'New York Times': 1,
           'Reuters': 1,
           'Talking Points Memo': 1,
           'Vox': 1,
           'Washington Post': 1,
}

In [9]:
#takes in publication
#retuns L or R classification
def mapping(publication):
    if publication in my_dict.keys():
        return my_dict[publication]
    else:
        return "na"

Word Exists

In [10]:
#takes in content
#returns boolean denoting whether x from word_in)stuff exists in article?
def word_exists(x, list_of_stuff):
    return any(word in x for word in list_of_stuff)
    
    '''
    b = false
    for a in list_of_stuff:
        if (a in x):
            b = True
    return b'''

Word Count

In [11]:
#takes in list of words (x)
#takes in content
#returns number of times that the words in list x appear in the string content
def word_count(x, content):
    count = 0
    for a in x:
        count += len(re.findall(a, content, flags = re.IGNORECASE))
    return count

Precision, Recall, & Overall Accuracy Calculator

In [12]:
#prints out accuracy
def accuracy(confusion_matrix):
    TP = confusion_matrix[0,0]
    FN = confusion_matrix[0,1]
    TN = confusion_matrix[1,1]
    FP = confusion_matrix[1,0]
    print("Overall Accuracy: " + str((TP + TN) / (TP + FN + TN + FP)))
    print('Precision: ' + str(TP / (TP + FP)))
    print('Recall: ' + str(TP / (TP + FN)))
   #return TP / (TP + FP) if accuracy_type == 'precision' else TP / (TP + FN) if accuracy_type == 'recall' else (TP + TN) / (TP + FN + TN + FP)

In [13]:
#overall accuracy
def overall_accuracy(confusion_matrix):
    TP = confusion_matrix[0,0]
    FN = confusion_matrix[0,1]
    TN = confusion_matrix[1,1]
    FP = confusion_matrix[1,0] 
    return (TP + TN) / (TP + FN + TN + FP)

In [14]:
#this function assumes the dataframe has a column with the same name as the POS
# input: dataframe, pos, optional parameter: take out vocab with greater than "n" length
# create list of words of 'pos' -- iterate through each 'pos' column to take out the vocab with>=5
# output -- list of vocab of a certain pos
def list_of_vocab(df, pos, word_length = 0):
    to_be_put_in_cv = list(itertools.chain.from_iterable(df[pos]))
    list_of_vocab = [x for x in to_be_put_in_cv if len(x)>=word_length]
    return list_of_vocab

In [15]:
# input: list_of_vocab, dataset, n words
# create count vectorizer object
# list of vocab, 
# output: array with the top n words
def top_n_words_array(list_of_vocab, df, column_name = 'content_clean', n = 50):
    vectorizer = CountVectorizer(vocabulary=set(list_of_vocab), token_pattern=r"(?u)\b\w+\b")
    X = vectorizer.fit_transform(df[column_name].values)
    vocab_array = X.toarray()
    vocab_array_dataframe = pd.DataFrame(vocab_array, columns=vectorizer.get_feature_names())
    word_frequency_agg = pd.DataFrame(vocab_array_dataframe.sum()).reset_index()
    word_frequency_agg.columns = ['word','frequency']
    word_frequency_agg = word_frequency_agg.sort_values('frequency')
    top_n_frequent_words = word_frequency_agg.tail(n)['word']
    return top_n_frequent_words

In [16]:
# input: array with the top n words, dataset
# use ^^ array of top n words as vocab for the next CountVectorizer object
# output: CountVectorizer array
def final_cv_array(top_n_words_array, df, column_name = 'content_clean'):
    vectorizer = CountVectorizer(vocabulary=top_n_words_array, min_df=0, stop_words=frozenset(),token_pattern=r"(?u)\b\w+\b")
    X = vectorizer.fit_transform(df[column_name].values)
    CV_array = X.toarray()
    return CV_array

## Split into Training, Validation, & Testing

Randomize indexes in dataframe

In [17]:
import numpy as np
df_scrambled = df.sample(frac = 1)

Create three datasets

In [18]:
training = df_scrambled.iloc[0:90000, [2,3,4,5,6,7,9]]
validation = df_scrambled.iloc[90001:135000, [2,3,4,5,6,7,9]]
testing = df_scrambled.iloc[135001:150000, [2,3,4,5,6,7,9]]

In [19]:
from random import randint
training['dataset'] = 'training'
validation['dataset'] = 'validation'
testing['dataset'] = 'testing'
training.head(2)

df = pd.concat([training, validation, testing])
df.head(3)

Unnamed: 0,title,publication,author,date,year,month,content,dataset
79873,Tourists are getting pulled over for staring a...,New York Post,Lauren Tousignant,2017-02-15,2017.0,2.0,They’re drunk off a natural phenomenon. Icelan...,training
14533,"Google, Facebook, Twitter Promise to Crack Dow...",Breitbart,Jack Hadfield,2017-04-01,2017.0,4.0,"In a meeting with Amber Rudd, the British Home...",training
19786,Judge Jeanine: The Establishment Trying to Tak...,Breitbart,Trent Baker,2016-04-09,2016.0,4.0,On the Saturday broadcast of “Justice” on Fox ...,training


## Leaning

In [20]:
df['leaning'] = df['publication'].apply(lambda x: mapping(x))

In [21]:
df.head(1)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning
79873,Tourists are getting pulled over for staring a...,New York Post,Lauren Tousignant,2017-02-15,2017.0,2.0,They’re drunk off a natural phenomenon. Icelan...,training,0


## Balance Dataset

In [22]:
df = df.sample(frac = 1)

In [23]:
df.head(3)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning
42827,Bryan Cranston said what?! Five Comic-Con OMG ...,CNN,Henry Hanks,2015-07-12,2015.0,7.0,"(CNN) Every year, studios, networks and publi...",training,1
2798,‘The Angry Birds Movie’ Nests Atop the Box Off...,New York Times,Brooks Barnes,2016-05-25,2016.0,5.0,"An omnipresent, marketing campaign propelled...",training,1
116357,Citigroup fined $28.8 million for harm to home...,Reuters,Lisa Lambert,2017-01-24,2017.0,1.0,Citigroup Inc ( ) mortgage units have been fi...,validation,1


In [24]:
pwords = ['trump', 'biden', 'democrat', 'republican', 'left', 'right', 'wing', 'white house', 'election', 'voter',
'clinton', 'bush', 'president','democracy']

In [25]:
df['is_political'] = df['content'].apply(lambda x: word_exists(x, pwords))

In [26]:
df = df[df.is_political == True]

In [27]:
df_dem = df[df.leaning == 0].head(37121)
df_rep = df[df.leaning == 1].head(37121)
df = pd.concat([df_dem, df_rep]).sample(frac = 1)

In [28]:
#political articles filtered out with equal number of dem & rep publications
df.groupby('leaning').count().reset_index()

Unnamed: 0,leaning,title,publication,author,date,year,month,content,dataset,is_political
0,0,37121,37121,34768,37121,37121,37121,37121,37121,37121
1,1,37120,37121,32613,36266,36266,36266,37121,37121,37121


## Sentiment

In [29]:
from textblob import TextBlob

In [30]:
df['sentiment'] = df['content'].apply(lambda x: TextBlob(x).sentiment.polarity)

## Lowercase Content

In [31]:
df['content_lowercase'] = df['content'].str.lower()
df.head(2)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning,is_political,sentiment,content_lowercase
94841,Milwaukee ‘in turmoil’ as protesters riot afte...,New York Post,Sophia Rosenbaum,2016-08-14,2016.0,8.0,Milwaukee was burning Saturday following an ...,training,0,True,-0.154545,milwaukee was burning saturday following an ...
142565,An eavesdropping Uber driver saved his 16-year...,Washington Post,Avi Selk,2016-12-30,2016.0,12.0,Uber driver Keith Avila picked up a p...,testing,1,True,0.074346,uber driver keith avila picked up a p...


## Lemmatize Text

In [32]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

In [33]:
from nltk.tokenize import word_tokenize
df['lem_content'] = df['content_lowercase'].apply(lambda x:[lem.lemmatize(y, 'n') for y in word_tokenize(x) if y.isalnum()])

## Remove Stopwords

In [34]:
df['stopwords_removed'] = df['lem_content'].apply(lambda x: stopword_filter(x))

In [35]:
df.head(1)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning,is_political,sentiment,content_lowercase,lem_content,stopwords_removed
94841,Milwaukee ‘in turmoil’ as protesters riot afte...,New York Post,Sophia Rosenbaum,2016-08-14,2016.0,8.0,Milwaukee was burning Saturday following an ...,training,0,True,-0.154545,milwaukee was burning saturday following an ...,"[milwaukee, wa, burning, saturday, following, ...","[milwaukee, wa, burning, saturday, following, ..."


## Clean Content

In [36]:
def list_to_string(mylist):
    new_list = ''
    for element in mylist:
        new_list += element + ' '
    return new_list   

df['content_clean'] = df['stopwords_removed'].apply(lambda x: list_to_string(x))

In [37]:
df.head(3)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning,is_political,sentiment,content_lowercase,lem_content,stopwords_removed,content_clean
94841,Milwaukee ‘in turmoil’ as protesters riot afte...,New York Post,Sophia Rosenbaum,2016-08-14,2016.0,8.0,Milwaukee was burning Saturday following an ...,training,0,True,-0.154545,milwaukee was burning saturday following an ...,"[milwaukee, wa, burning, saturday, following, ...","[milwaukee, wa, burning, saturday, following, ...",milwaukee wa burning saturday following shooti...
142565,An eavesdropping Uber driver saved his 16-year...,Washington Post,Avi Selk,2016-12-30,2016.0,12.0,Uber driver Keith Avila picked up a p...,testing,1,True,0.074346,uber driver keith avila picked up a p...,"[uber, driver, keith, avila, picked, up, a, pa...","[uber, driver, keith, avila, picked, passenger...",uber driver keith avila picked passenger looke...
136099,"ACT essay scores are inexplicably low, causing...",Washington Post,Nick Anderson,2016-02-12,2016.0,2.0,Many students are in an uproar over a cha...,training,1,True,0.157549,many students are in an uproar over a cha...,"[many, student, are, in, an, uproar, over, a, ...","[many, student, uproar, change, act, ha, yield...",many student uproar change act ha yielded call...


## POS Tag

In [38]:
import nltk
df['POS_tagging'] = df['stopwords_removed'].apply(lambda x: nltk.pos_tag(x))

In [39]:
df.head(2)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning,is_political,sentiment,content_lowercase,lem_content,stopwords_removed,content_clean,POS_tagging
94841,Milwaukee ‘in turmoil’ as protesters riot afte...,New York Post,Sophia Rosenbaum,2016-08-14,2016.0,8.0,Milwaukee was burning Saturday following an ...,training,0,True,-0.154545,milwaukee was burning saturday following an ...,"[milwaukee, wa, burning, saturday, following, ...","[milwaukee, wa, burning, saturday, following, ...",milwaukee wa burning saturday following shooti...,"[(milwaukee, NN), (wa, NN), (burning, VBG), (s..."
142565,An eavesdropping Uber driver saved his 16-year...,Washington Post,Avi Selk,2016-12-30,2016.0,12.0,Uber driver Keith Avila picked up a p...,testing,1,True,0.074346,uber driver keith avila picked up a p...,"[uber, driver, keith, avila, picked, up, a, pa...","[uber, driver, keith, avila, picked, passenger...",uber driver keith avila picked passenger looke...,"[(uber, JJ), (driver, NN), (keith, NN), (avila..."


## Certain Part of Speech

In [40]:
#adjective, i.e. "small"
df['JJ'] = df['POS_tagging'].apply(lambda x: pos_list(x, 'JJ'))

In [41]:
#singular noun, i.e. "cheese"
df['NN'] = df['POS_tagging'].apply(lambda x: pos_list(x, 'NN'))

In [42]:
#base form verb, i.e. "run"
df['VB'] = df['POS_tagging'].apply(lambda x: pos_list(x, 'VB'))

In [43]:
#cardinal digit (numbers used to count "how many")
df['CD'] = df['POS_tagging'].apply(lambda x: pos_list(x, 'CD'))

In [44]:
#adj and noun
df['JJ, NN'] = df['POS_tagging'].apply(lambda x: pos_list(x, 'JJ' or 'NN'))

In [45]:
#adj and noun
df['NN, VB'] = df['POS_tagging'].apply(lambda x: pos_list(x, 'NN' or 'VB'))

In [46]:
#proportion of adjs, i.e. "small"
df['adj_proportion'] = df['POS_tagging'].apply(lambda x: pos_proportion(x, 'JJ'))

In [47]:
#proportion of singular nouns, i.e. "cheese"
df['noun_proportion'] = df['POS_tagging'].apply(lambda x: pos_proportion(x, 'NN'))

In [48]:
#proportion of base form verbs, i.e. "run"
df['verb_proportion'] = df['POS_tagging'].apply(lambda x: pos_proportion(x, 'VB'))

In [49]:
#proportion of cardinal digits (numbers used to count "how many")
df['c_digit_proportion'] = df['POS_tagging'].apply(lambda x: pos_proportion(x, 'CD'))

In [50]:
df.head(1)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning,is_political,...,JJ,NN,VB,CD,"JJ, NN","NN, VB",adj_proportion,noun_proportion,verb_proportion,c_digit_proportion
94841,Milwaukee ‘in turmoil’ as protesters riot afte...,New York Post,Sophia Rosenbaum,2016-08-14,2016.0,8.0,Milwaukee was burning Saturday following an ...,training,0,True,...,"[ignited, violent, numerous, brick, deadly, su...","[milwaukee, wa, saturday, protest, fire, build...",[get],"[100, one, two, one, five, 24]","[ignited, violent, numerous, brick, deadly, su...","[milwaukee, wa, saturday, protest, fire, build...",0.132597,0.552486,0.005525,0.033149


## Political Organizations

In [51]:
#left political organizations
dem_organization_words = [
    'AAPI Democrats', 'AfricanAmerican Dems','College Democrats','High School Democrats of America',
    'LGBT Democrats','Los Demócratas', 'National Federation of Democratic Women',
    'National Jewish Democratic Council','Stonewall Democrats','Young Democrats of America','DNC Women', 
    'Center for American Progress', 'Blue Dog Coalition', 'Democracy for America','ActBlue',
    'America Votes','Democrats for Life of America','New Democrat Coalition','New Democrat Network',
    'Progressive Caucus', 'Progressive Change Campaign Committee','Progressive Democrats of America',
    'Progressive Policy Institute','Moveon.org','America Coming Together','Democratic Leadership Council','Democrats for Life of America',
    'Democratic Congressional Campaign Committee','Democratic Governors Association',
    'Democratic National Committee','Democratic Senatorial Campaign Committee','Democrats Abroad',
    'National Conference of Democratic Mayors','Democratic Legislative Campaign Committee',
    'Democratic Attorneys General Association','Democratic Association of Secretaries of State',
    'National Democratic County Officials','Democratic Municipal Officials', 
]
df['dem_organization_count'] = df['content'].apply(lambda x: word_count(dem_organization_words, x))

In [52]:
#right political organizations
rep_organization_words = [
    'Americans for a Republican Majority','California Congress of Republicans','California Republican Assembly',
    'Capitol Hill Club','Republican Majority for Choice','Republicans for Choice','College Republicans',
    'Republican Conference of the United States House of Representatives','Republican Conference of the United States Senate',
    'National Republican Congressional Committee','Congressional Hispanic Conference','Congressional Institute',
    'ConservAmerica','Courageous Conservatives PAC','Delegates Unbound','Freedom Caucus','Georgia Teen Republicans',
    'GOPAC','Republican Governors Association','Hollywood Congress of Republicans','Hoover League','Huck PAC',
    'Idaho Federation of Reagan Republicans','International Republican Institute','Republican Jewish Coalition',
    'Kansas Traditional Republican Majority','Republican Leadership Council','Liberty Caucus','Republican National Coalition for Life',
    'Lincoln–Roosevelt League','Log Cabin Republicans','Republican Main Street Partnership','Mainstream Republicans of Washington','National Black Republican Association',
    'Republican National Committee','National Council for a New America','National Federation of Republican Assemblies',
    'National Federation of Republican Women','Republican Liberty Caucus','Republican National Hispanic Assembly',
    'Republican National Lawyers Association','Republican State Leadership Committee','Republicans Abroad',
    'Republicans Abroad Norway','Republicans for Immigration Reform','Republicans Overseas','RightChange.com',
    'RightNOW Women','Ripon Society','SarahPAC','National Republican Senatorial Committee','Republican Study Committee',
    'Tea Party Caucus','Teen Age Republicans','Texans for a Republican Majority','The Tuesday Group','Republican Unity Coalition',
    'The Wish List','Young Republicans',
]
df['rep_organization_count'] = df['content'].apply(lambda x: word_count(rep_organization_words, x))

In [53]:
df.head(1)

Unnamed: 0,title,publication,author,date,year,month,content,dataset,leaning,is_political,...,VB,CD,"JJ, NN","NN, VB",adj_proportion,noun_proportion,verb_proportion,c_digit_proportion,dem_organization_count,rep_organization_count
94841,Milwaukee ‘in turmoil’ as protesters riot afte...,New York Post,Sophia Rosenbaum,2016-08-14,2016.0,8.0,Milwaukee was burning Saturday following an ...,training,0,True,...,[get],"[100, one, two, one, five, 24]","[ignited, violent, numerous, brick, deadly, su...","[milwaukee, wa, saturday, protest, fire, build...",0.132597,0.552486,0.005525,0.033149,0,0


In [54]:
df.groupby(['dem_organization_count']).mean()
df.groupby(['rep_organization_count']).mean()

Unnamed: 0_level_0,year,month,leaning,is_political,sentiment,adj_proportion,noun_proportion,verb_proportion,c_digit_proportion,dem_organization_count
rep_organization_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2016.328496,5.542604,0.499048,True,0.077854,0.191267,0.464741,0.02054,0.033415,0.023181
1,2016.250774,6.055728,0.525076,True,0.085585,0.200045,0.472505,0.02218,0.028512,0.102584
2,2016.430622,5.473684,0.614679,True,0.095734,0.199892,0.473245,0.025996,0.02425,0.082569
3,2016.550562,5.078652,0.445652,True,0.084243,0.190999,0.486952,0.026056,0.024515,0.141304
4,2016.622222,5.155556,0.555556,True,0.089851,0.187613,0.495333,0.026813,0.026284,0.022222
5,2016.47619,5.714286,0.571429,True,0.098957,0.194482,0.467591,0.025469,0.024348,0.0
6,2016.809524,3.904762,0.583333,True,0.081694,0.186973,0.479589,0.031378,0.020686,0.0
7,2016.7,3.9,0.636364,True,0.085841,0.179717,0.499744,0.033647,0.024735,0.090909
8,2016.916667,3.75,0.75,True,0.078634,0.191342,0.488665,0.027975,0.018397,0.0
9,2016.833333,3.666667,0.833333,True,0.053449,0.188148,0.493743,0.027643,0.020542,0.0


## Three Datasets

In [55]:
df.groupby(['dataset']).size()

dataset
testing        3962
training      46752
validation    23528
dtype: int64

In [56]:
df_validation = df[df.dataset == 'validation']
df_training = df[df.dataset == 'training']

## Count Vectorizer Vocab

In [57]:
import itertools
#done in function, bottom are notes
to_be_put_in_cv_JJ = list(itertools.chain.from_iterable(df['JJ']))
long_word_JJ = [x for x in to_be_put_in_cv_JJ if len(x)>=5]
a = '''
to_be_put_in_cv_NN = list(itertools.chain.from_iterable(df['NN']))
long_word_NN = [x for x in to_be_put_in_cv_NN if len(x)>=5]
to_be_put_in_cv_VB = list(itertools.chain.from_iterable(df['VB']))
long_word_VB = [x for x in to_be_put_in_cv_VB if len(x)>=5]'''

## Count Vectorizer

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

### JJ Words

In [59]:
#JJ Words is for reference
#Rest will be completed using functions

In [60]:
vectorizer = CountVectorizer(vocabulary=set(long_word_JJ), token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(df_training['content_clean'].values)
JJ_array_training = X.toarray()
JJ_array_training

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [61]:
JJ_array_dataframe_training = pd.DataFrame(JJ_array_training, columns=vectorizer.get_feature_names())

In [62]:
JJ_array_dataframe_training

Unnamed: 0,000th,040th,080th,085total,100th,100yearsstrong,105th,110th,114th,115th,...,الجمارك,الكونغرس,الوليد,رمضان,عبدالعزيز,واصاب,وماجزاءالإحسان,ᴇᴀsᴛᴇʀsᴜɴᴅᴀʏ,まっすぐ地球からp,ｓｅｃｏｎｄ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46749,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46750,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
word_frequency_JJ_agg_training = pd.DataFrame(JJ_array_dataframe_training.sum()).reset_index()
word_frequency_JJ_agg_training

Unnamed: 0,index,0
0,000th,12
1,040th,0
2,080th,2
3,085total,0
4,100th,115
...,...,...
78112,واصاب,0
78113,وماجزاءالإحسان,1
78114,ᴇᴀsᴛᴇʀsᴜɴᴅᴀʏ,1
78115,まっすぐ地球からp,1


In [64]:
word_frequency_JJ_agg_training.columns = ['word', 'frequency']
word_frequency_JJ_agg_training = word_frequency_JJ_agg_training.sort_values('frequency')
top_50_frequent_JJ_words_training = word_frequency_JJ_agg_training[word_frequency_JJ_agg_training['word'].apply(lambda x: len(x) >= 5)].tail(50)['word']
top_50_frequent_JJ_words_training

7043           better
65874         student
37097           later
38580          little
24495           found
27487           great
68884          though
13563    conservative
4381            asked
7480            black
51977           place
16512      democratic
22841         federal
70963         twitter
11197          change
9892           called
16511        democrat
11648           child
54639        question
10064       candidate
53305    presidential
33692           issue
4727           attack
75772      washington
4089           around
75326           voter
66631         support
41795          medium
30004         hillary
2524          america
52308           point
24375          former
56566          report
54091          public
52381          police
48219        official
72729          united
45136        national
52417       political
76408           white
18339          donald
50990         percent
47886           obama
68790           think
57290           right
56630     

In [65]:
vectorizer = CountVectorizer(vocabulary=top_50_frequent_JJ_words_training, min_df=0, stop_words=frozenset(), token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(df_training['content_clean'].values)
JJ_array_training = X.toarray()
JJ_array_training

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 1, 18,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  5,  6,  7],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  1, ...,  0,  0, 30],
       [ 0,  0,  0, ...,  1,  0,  2]])

In [66]:
JJ_array_training.shape

(46752, 50)

### NN Words

In [67]:
#Functions for reference!
#def list_of_vocab(df, pos, word_length = 0):
#def top_n_words_array(list_of_vocab, df, column_name = 'content_clean', n = 50):
#def final_cv_array(top_n_words_array, df, column_name = 'content_clean'):

In [1]:
#building vocab, only works on df_training, (entire df has nan values, so this function doesn't work there)
top_50_frequent_NN_words_training = top_n_words_array(list_of_vocab(df, 'NN', 5), df_training, 'content_clean', 50)

NameError: name 'top_n_words_array' is not defined

### VB Words

In [None]:
#building vocab, only works on df_training
top_50_frequent_VB_words_training = top_n_words_array(list_of_vocab(df, 'VB', 5), df_training, 'content_clean', 50)

### CD Words

In [None]:
#building vocab, only works on df_training
top_50_frequent_CD_words_training = top_n_words_array(list_of_vocab(df, 'CD'), df_training, 'content_clean', 50)

### NN and VB Words

In [None]:
df.head(1)

In [None]:
#building vocab, only works on df_training
top_50_frequent_NN_VB_words_training = top_n_words_array(list_of_vocab(df, 'NN, VB'), df_training, 'content_clean', 50)

### JJ and NN Words

In [None]:
#building vocab, only works on df_training
top_50_frequent_JJ_NN_words_training = top_n_words_array(list_of_vocab(df, 'JJ, NN'), df_training, 'content_clean', 50)

## ML Models (Logistic Regression)

In [None]:
df.groupby(['dataset']).count()

### df_training, JJ

In [None]:
X_train_features = df_training[['sentiment', 'adj_proportion', 'noun_proportion', 'verb_proportion', 'c_digit_proportion']].to_numpy()
y_train = df_training['leaning'].apply(lambda x: int(x))
X_vectorizer_training = JJ_array_training
X_train = np.concatenate((X_train_features, X_vectorizer_training), axis = 1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(X_train)
confusion_matrix_training_JJ = confusion_matrix(y_train, y_pred)
confusion_matrix_training_JJ

In [None]:
accuracy(confusion_matrix_training_JJ)

### df_validation, JJ

In [None]:
#for reference only, but can be done using function
vectorizer = CountVectorizer(vocabulary=top_50_frequent_JJ_words_training, min_df=0, stop_words=frozenset(), token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(df_validation['content_clean'].values)
JJ_array_validation = X.toarray()
JJ_array_validation

In [None]:
X_val_features = df_validation[['sentiment', 'adj_proportion', 'noun_proportion', 'verb_proportion', 'c_digit_proportion']].to_numpy()
y_val = df_validation['leaning'].apply(lambda x: int(x))
X_vectorizer_validation = JJ_array_validation
X_val = np.concatenate((X_val_features, X_vectorizer_validation), axis = 1)

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(X_val)
confusion_matrix_val_JJ = confusion_matrix(y_val, y_pred)
confusion_matrix_val_JJ

In [None]:
accuracy(confusion_matrix_val_JJ)

### df_training, NN

In [None]:
#function for reference
#def final_cv_array(top_n_words_array, df, column_name = 'content_clean'):
NN_array_training = final_cv_array(top_50_frequent_NN_words_training, df_training)

In [None]:
#X_train_features and y_train are same as previous 'JJ'
X_train_NN = np.concatenate((X_train_features, NN_array_training), axis = 1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train_NN, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(X_train_NN)
confusion_matrix_training_NN = confusion_matrix(y_train, y_pred)
confusion_matrix_training_NN

In [None]:
accuracy(confusion_matrix_training_NN)

### df_validation, NN

In [None]:
NN_array_validation = final_cv_array(top_50_frequent_NN_words_training, df_validation)

In [None]:
#X_val_features and y_val are same as previous 'JJ'
X_val_NN = np.concatenate((X_val_features, NN_array_validation), axis = 1)

In [None]:
y_pred = logreg.predict(X_val_NN)
confusion_matrix_val_NN = confusion_matrix(y_val, y_pred)
confusion_matrix_val_NN

In [None]:
accuracy(confusion_matrix_val_NN)

### df_training, VB

In [None]:
VB_array_training = final_cv_array(top_50_frequent_VB_words_training, df_training)

#X_train_features and y_train are same as previous 'JJ'
X_train_VB = np.concatenate((X_train_features, VB_array_training), axis = 1)


logreg = LogisticRegression()
logreg.fit(X_train_VB, y_train)

from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(X_train_VB)
confusion_matrix_training_VB = confusion_matrix(y_train, y_pred)
confusion_matrix_training_VB

In [None]:
accuracy(confusion_matrix_training_VB)

### df_validation, VB

In [None]:
VB_array_validation = final_cv_array(top_50_frequent_VB_words_training, df_validation)

#X_val_features and y_val are same as previous 'JJ'
X_val_VB = np.concatenate((X_val_features, VB_array_validation), axis = 1)

y_pred = logreg.predict(X_val_VB)
confusion_matrix_val_VB = confusion_matrix(y_val, y_pred)
confusion_matrix_val_VB

In [None]:
accuracy(confusion_matrix_val_VB)

### df_training, CD

In [None]:
CD_array_training = final_cv_array(top_50_frequent_CD_words_training, df_training)

#X_train_features and y_train are same as previous 'JJ'
X_train_CD = np.concatenate((X_train_features, CD_array_training), axis = 1)


logreg = LogisticRegression()
logreg.fit(X_train_CD, y_train)

y_pred = logreg.predict(X_train_CD)
confusion_matrix_training_CD = confusion_matrix(y_train, y_pred)
confusion_matrix_training_CD

In [None]:
accuracy(confusion_matrix_training_CD)

### df_validation, CD

In [None]:
CD_array_validation = final_cv_array(top_50_frequent_CD_words_training, df_validation)

#X_val_features and y_val are same as previous 'JJ'
X_val_CD = np.concatenate((X_val_features, CD_array_validation), axis = 1)

y_pred = logreg.predict(X_val_CD)
confusion_matrix_val_CD = confusion_matrix(y_val, y_pred)
confusion_matrix_val_CD

In [None]:
accuracy(confusion_matrix_val_CD)

### NN and VB

In [None]:
NN_VB_array_training = final_cv_array(top_50_frequent_NN_VB_words_training, df_training)

#X_train_features and y_train are same as previous 'JJ'
X_train_NN_VB = np.concatenate((X_train_features, NN_VB_array_training), axis = 1)


logreg = LogisticRegression()
logreg.fit(X_train_NN_VB, y_train)

from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(X_train_NN_VB)
confusion_matrix_training_NN_VB = confusion_matrix(y_train, y_pred)
confusion_matrix_training_NN_VB

In [None]:
accuracy(confusion_matrix_training_NN_VB)

In [None]:
NN_VB_array_validation = final_cv_array(top_50_frequent_NN_VB_words_training, df_validation)

#X_val_features and y_val are same as previous 'JJ'
X_val_NN_VB = np.concatenate((X_val_features, NN_VB_array_validation), axis = 1)

y_pred = logreg.predict(X_val_NN_VB)
confusion_matrix_val_NN_VB = confusion_matrix(y_val, y_pred)
confusion_matrix_val_NN_VB

In [None]:
accuracy(confusion_matrix_val_NN_VB)

## Cross Validation Accuracy

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
logreg = LogisticRegression(max_iter=300)
print("JJ Train Score " + str(cross_val_score(logreg, X_train, y_train, cv=10).mean()))
print("NN Train Score " + str(cross_val_score(logreg, X_train_NN, y_train, cv=10).mean()))
print("VB Train Score " + str(cross_val_score(logreg, X_train_VB, y_train, cv=10).mean()))
print("CD Train Score " + str(cross_val_score(logreg, X_train_CD, y_train, cv=10).mean()))
print("NN and VB Train Score " + str(cross_val_score(logreg, X_train_NN_VB, y_train, cv=10).mean()))
#remember to record in spreadsheet

## Random Forest Classifier

### JJ

In [None]:
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_val)

In [None]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_val, y_pred))

In [None]:
clf.fit(X_train_NN,y_train)
y_pred_NN=clf.predict(X_val_NN)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_val, y_pred_NN))

In [None]:
clf.fit(X_train_VB,y_train)
y_pred_VB=clf.predict(X_val_VB)
print("Accuracy:", metrics.accuracy_score(y_val, y_pred_VB))

In [None]:
clf.fit(X_train_CD,y_train)
y_pred_CD=clf.predict(X_val_CD)
print("Accuracy:", metrics.accuracy_score(y_val, y_pred_CD))

In [None]:
#Pseudocode
#inputs: dataframe, n word length, POS
#create a dataframe with column names "num_chars", "POS", "top num_words, "validation accuracy"
#for num_chars in range(3,10):
#   for POS in POS_list:
#      for num_words in top_n_words:
#         Count Vectorizer stuff
#         Run model, get accuracy
#output: list of accuracies based on each POS and word length

In [None]:
#inputs: dataframe, n word length, POS
#chagne functions to include verbs like "get_word"
model_accuracy = pd.DataFrame(columns=['num_chars','POS','top_num_words', 'validation_accuracy']) #df with columns
POS_list = ['JJ', 'NN', 'VB', 'CD'] #list of POS

#PUT COMMENTS IN HEREEEE!!!
for num_chars in range(1,8): #filters out words less than num_chars 
    for POS in POS_list:
        for num_words in [10, 50, 100, 150, 200, 1000]:
            my_vocab = list_of_vocab(df_training, POS, num_chars) #creates list of vocab to be put in CV based on df_training dataset
            n_words_array = top_n_words_array(my_vocab, df_training, 'content_clean', num_words)
            cv_array = final_cv_array(n_words_array, df_training,)
            X_train_features = df_training[['sentiment', 'adj_proportion', 'noun_proportion', 'verb_proportion', 'c_digit_proportion']].to_numpy()
            y_train = df_training['leaning'].apply(lambda x: int(x))
            X_train = np.concatenate((X_train_features, cv_array), axis = 1)
            logreg = LogisticRegression()
            logreg.fit(X_train, y_train) #fits model (with df_training data)
            X_val_features = df_validation[['sentiment', 'adj_proportion', 'noun_proportion', 'verb_proportion', 'c_digit_proportion']].to_numpy()
            y_train_val = df_validation['leaning'].apply(lambda x: int(x))
            cv_array_val = final_cv_array(n_words_array, df_validation,)
            X_val = np.concatenate((X_val_features, cv_array_val), axis = 1)
            y_pred_val = logreg.predict(X_val) #uses logreg on df_validation this time
            confusion_matrix_val = confusion_matrix(y_val, y_pred_val)
            my_overall_accuracy = overall_accuracy(confusion_matrix_val)
            model_accuracy = model_accuracy.append({'num_chars': num_chars,
                                  'POS': POS,
                                  'top_num_words': num_words,                 
                                  'validation_accuracy': my_overall_accuracy}, ignore_index = True)  #adds row to model_accuracy each time   

In [None]:
model_accuracy

In [None]:
model_accuracy.groupby(['num_chars']).mean().reset_index()

In [None]:
model_accuracy.groupby(['POS']).mean().reset_index()
#cardinal digits had a significantly lower total accuracy in comparison to the other

In [None]:
model_accuracy.groupby(['top_num_words']).mean().reset_index()
#the more words used, the more accurate it seems to be until around 150 words

In [None]:
model_accuracy.sort_values(['validation_accuracy'])

In [None]:
for each_POS in ['JJ', 'NN', 'VB', 'CD']:
    plt.plot('num_chars', 'validation_accuracy', data=model_accuracy[model_accuracy.POS == each_POS].groupby(['num_chars']).mean().reset_index(), label=each_POS)
plt.legend()
plt.show()

In [None]:
for each_num_words in [10,50,100,150,1000]:
    plt.plot('num_chars', 'validation_accuracy', data=model_accuracy[model_accuracy.top_num_words == each_num_words].groupby(['num_chars']).mean().reset_index(), label=each_num_words)
plt.legend()
plt.show()