In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import pairwise


os.chdir('C:/Users/Venia/Desktop/political-manifestos')
text=pd.read_csv("data/english_text.csv")
core = pd.read_csv("data/english_core.csv")
metadata = pd.read_csv("data/english_metadata.csv")

metadata['language'] = metadata['language'].fillna("english")
french_keys = metadata[metadata['language']=='french']['key'].values

words_to_remove = ['canada','canadians','america','americans','britain']
country_list = ['Canada','United Kingdom','United States']

df = pd.merge(text,core, on = 'key')
df = df[~df['key'].isin(french_keys)]

In [27]:
from string import punctuation, digits
import re

def clean_text(x):
    x = x.lower()
    x = x.replace("\n", " ")
    x = x.replace('\t', ' ')
    x = x.replace("\'", '')
    x=x.strip()
    remove_digits = str.maketrans('', '', digits)
    regex = re.compile('[%s]' % re.escape(punctuation))
    x = regex.sub('', x)
    x = x.translate(remove_digits)
    return x 
    

In [28]:
df['text_clean'] = df['text'].apply(lambda x: clean_text(x))

In [124]:

country_names = pd.read_csv("country_names.csv")
country_names = country_names[['countryLabel','capitalLabel','demonym']]
countries = country_names[['countryLabel','demonym']].drop_duplicates().reset_index()
demonyms = countries.groupby('countryLabel')['demonym'].apply(list).reset_index()
demonyms['list'] = demonyms.apply(lambda x: [x['countryLabel']] + x['demonym'], axis =1)
demonyms['list'] = demonyms['list'].apply(lambda x: [k.lower() for k in x])

import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
pat = r'\b(?:{})\b'.format('|'.join(stop))
df['text_clean_no_stop'] = df['text_clean'].str.replace(pat,'')
df['text_clean_no_stop'] = df['text_clean_no_stop'].str.replace(' +', ' ')
df['tokenized']=df['text_clean_no_stop'].apply(lambda x: x.split(' '))
countries = demonyms['list'].apply(lambda x: x[0]).values


In [125]:
for row in demonyms['list']:
    df[row[0]] = 0 
for row in demonyms['list']:
    country = row[0]
    for val in row:
        df[country] += df['tokenized'].apply(lambda x: x.count(val))

In [371]:
df['year'] = df['date'].apply(lambda x: str(x)[0:4])
country= df[df['countryname'] == 'United States']

In [372]:
def context_windows(country_df, country_name, n=4):
    list_of_instances = []
    metadata_list = []
    for i,j in country[['tokenized','countryname','date','partyname']].iterrows():
        metadata = j[['countryname','date','partyname']].values
        indices = [k for k, x in enumerate(j['tokenized']) if x == country_name]
        for i in indices:
            list_of_instances.append([k for k in j['tokenized'][i-n:i+n] if k != country_name])
            metadata_list.append(list(metadata))
    list_exploded = [k for j in list_of_instances for k in j]
    return metadata_list, list_of_instances, list_exploded

In [387]:
metadata, list_of_contexts, list_exploded = context_windows(country,'iraq')

In [388]:
def top_words(list_exploded):
    word_count  = {}
    for val in list_exploded:
        if val in word_count.keys():
            word_count[val] += 1
        else:
            word_count.update({val: 1})
    sorted_dict = dict(sorted(word_count.items(), key = lambda item:item[1], reverse=True))
    return sorted_dict

In [389]:
country_words = top_words(list_exploded)

In [390]:
country_contexts = pd.DataFrame(metadata,columns = ['country','date','party'])
country_contexts['contexts'] = list_of_contexts

In [391]:
election_words = country_contexts.groupby(['date', 'party'])['contexts'].apply(sum)

In [392]:
from collections import Counter
election_words = election_words.apply(lambda x: Counter(x))

In [393]:
election_words = election_words.apply(lambda x: dict(sorted(x.items(), key = lambda item:item[1], reverse=True)))

In [394]:
from itertools import islice
top_election_words = election_words.apply(lambda x: [k[0] for k in list(islice(x.items(), 4))])

In [395]:
top_election_words

date    party           
199211  Republican Party           [president, bush, quagmire, indefinite]
199611  Democratic Party                [iran, delivery, nuclear, weapons]
        Republican Party                       [iran, syria, libya, north]
200011  Democratic Party            [mass, destruction, delivery, systems]
        Republican Party                  [friends, community, long, also]
200411  Democratic Party    [administration, war, afghanistan, challenges]
        Republican Party           [afghanistan, nations, america, forces]
200811  Democratic Party                   [war, ending, end, responsibly]
        Republican Party         [offer, special, circumstances, conflict]
201211  Democratic Party              [war, responsibly, forces, charting]
        Republican Party           [afghanistan, country, provide, nature]
201611  Democratic Party                 [syria, partners, destroy, isis’]
        Republican Party              [afghanistan, support, force, guard]


In [382]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
sentiment = top_election_words.explode().apply(sid.polarity_scores).apply(lambda x: x['compound'])

In [383]:
sentiment = sentiment.reset_index().groupby(['date','party'])['contexts'].mean()

In [384]:
sentiment

date    party           
196411  Democratic Party    0.191225
        Republican Party    0.095850
196811  Democratic Party    0.000000
        Republican Party   -0.014275
197211  Democratic Party    0.000000
        Republican Party   -0.149850
197611  Democratic Party   -0.259950
        Republican Party   -0.149850
198011  Democratic Party   -0.149850
        Republican Party    0.000000
198411  Democratic Party   -0.149850
        Republican Party    0.000000
198811  Democratic Party    0.111750
        Republican Party   -0.100475
199211  Democratic Party   -0.120050
        Republican Party    0.000000
199611  Democratic Party    0.175000
        Republican Party    0.114700
200011  Democratic Party   -0.223950
        Republican Party   -0.085000
200411  Republican Party   -0.079550
200811  Republican Party    0.119175
201211  Democratic Party    0.000000
        Republican Party    0.079550
201611  Republican Party    0.119175
Name: contexts, dtype: float64

In [396]:
df = pd.read_csv("for_sentiment_analysis.csv")

In [406]:
df

Unnamed: 0.1,Unnamed: 0,key,countryname,edate,tokenized,key2,sentiment
0,0,61320_200811,United States,04/11/2008,preamble,61320_200811_0,0.0000
1,0,61320_200811,United States,04/11/2008,come,61320_200811_0,0.0000
2,0,61320_200811,United States,04/11/2008,together,61320_200811_0,0.0000
3,0,61320_200811,United States,04/11/2008,defining,61320_200811_0,0.0000
4,0,61320_200811,United States,04/11/2008,moment,61320_200811_0,0.0000
...,...,...,...,...,...,...,...
27825,27,61620_200811,United States,04/11/2008,care,61620_200811_27,0.4939
27826,27,61620_200811,United States,04/11/2008,respect,61620_200811_27,0.4767
27827,27,61620_200811,United States,04/11/2008,earned,61620_200811_27,0.0000
27828,27,61620_200811,United States,04/11/2008,service,61620_200811_27,0.0000


In [401]:
sentiment = df['tokenized'].apply(sid.polarity_scores).apply(lambda x: x['compound'])

In [402]:
df['sentiment'] = sentiment

In [405]:
df.groupby(['key2'])['sentiment'].mean()

key2
61320_200811_0     0.025764
61320_200811_1     0.031206
61320_200811_10    0.016954
61320_200811_11    0.037287
61320_200811_12    0.027365
61320_200811_13    0.016122
61320_200811_2     0.038371
61320_200811_3     0.028138
61320_200811_4     0.040035
61320_200811_5     0.047219
61320_200811_6     0.035080
61320_200811_7     0.003643
61320_200811_8     0.005892
61320_200811_9     0.042842
61620_200811_14    0.019091
61620_200811_15    0.036707
61620_200811_16    0.033357
61620_200811_17    0.034850
61620_200811_18    0.014607
61620_200811_19    0.022966
61620_200811_20    0.027242
61620_200811_21    0.025334
61620_200811_22    0.038055
61620_200811_23    0.043667
61620_200811_24    0.039806
61620_200811_25   -0.005547
61620_200811_26    0.010474
61620_200811_27    0.022557
Name: sentiment, dtype: float64