In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import mode
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline
from datetime import datetime
import seaborn
import glob



The purpose of the following steps is to calculate sentiment scores for the NY Times articles we previously scraped.

In the first step, after reading in the CSVs of articles, we use a function called "tuple_generator". This function extracts the month from each article's date column and pairs that month with the article's headline and snippet. This is useful because we calculate sentiment scores on a monthly basis.

In [555]:
# Load every yearly CSV of scraped articles into a list of DataFrames.
# glob.glob returns paths in arbitrary order, but later cells index this list
# as cache[year - 1982], so the paths are sorted to make the order deterministic.
# NOTE(review): this assumes the file names sort chronologically (e.g. they
# contain the year) -- confirm against the data directory.
# NOTE(review): the absolute path is machine-specific; adjust it for your setup.
cache = [pd.read_csv(filename) for filename in sorted(glob.glob("/Users/Ted/Desktop/CS109/nytimes_data/*.csv"))]

In [559]:
def tuple_generator(df):
    """Pair every article's headline and snippet with the month of its date.

    Results are plain lists in the same order as the rows of ``df`` (no
    dictionaries are used); downstream code relies on that order to group
    consecutive articles by month.

    Arguments:
        df: DataFrame with string columns "date", "headline" and "snippet".
            Dates may use "-" or "/" as separator. If the first component is
            greater than 12 it cannot be a month (presumably a year-first
            date), so the second component is taken as the month; otherwise
            the first component is used.

    Returns:
        (headline_date, snippet_date): two lists of (text, month) tuples.

    Side effects:
        Adds the columns "month", "tuple_headline" and "tuple_snippet" to
        ``df`` in place.

    Raises:
        ValueError: if a date component cannot be parsed as an integer
            (for example a missing date).
    """
    normalized_dates = df["date"].str.replace("-", "/")

    # Iterate over the values rather than looking rows up by label, so the
    # function also works when df does not have a default 0..n-1 index.
    months = []
    for raw_date in normalized_dates:
        parts = str(raw_date).split("/")
        leading = int(parts[0])
        months.append(int(parts[1]) if leading > 12 else leading)

    headline_date = list(zip(df["headline"], months))
    snippet_date = list(zip(df["snippet"], months))

    # Assign plain lists (positional) instead of a new Series, which would be
    # aligned on the index and yield NaN for a non-default index.
    df["month"] = months
    df['tuple_headline'] = headline_date
    df['tuple_snippet'] = snippet_date

    return headline_date, snippet_date

Next, our monthly_sentiment function calls an external API that calculates the sentiment scores for us. The API's sentiment endpoint is queried with an HTTP POST request carrying form-encoded data. Its classifiers were built with the publicly available nltk-trainer and trained on publicly available text datasets of movie reviews. The text classification uses a Naive Bayes classifier with high-information feature selection. The model first estimates whether the text is polar or neutral (returned as two probabilities that sum to 1), and then, within the polar case, the probability that the text is positive or negative (again returned as two probabilities that sum to 1).


Documentation and links for the resources referenced above are below:

text-processing (the API we used): http://text-processing.com/
movie review data sets: http://www.cs.cornell.edu/people/pabo/movie-review-data/
nltk-trainer: https://github.com/japerk/nltk-trainer, http://nltk-trainer.readthedocs.io/en/latest/train_classifier.html
text classification using naive bayes: http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
text classification by eliminating low information features: http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/

In [66]:
#import simplejson because its error messages are more descriptive and helpful
import simplejson

In [409]:
def _fetch_sentiment(text, api_url):
    """POST ``text`` to the sentiment API and return its (pos, neutral) probabilities."""
    # The notebook uses requests without importing it in any visible cell;
    # importing here keeps the function runnable on a fresh kernel.
    import requests

    response = requests.post(api_url, data={"text": text})
    # Fail with the HTTP status (e.g. when the API throttles us) instead of a
    # confusing JSON decoding error on a non-JSON error body.
    response.raise_for_status()
    probability = simplejson.loads(response.text)['probability']
    return probability['pos'], probability['neutral']


def monthly_sentiment(dt, api_url="http://text-processing.com/api/sentiment/"):
    """Sum the positive and neutral sentiment scores of each month's texts.

    Takes the result of the tuple generator and scores every text with one
    API request. Scores are accumulated over consecutive entries that share
    the same month, so ``dt`` must already be ordered by month; a month that
    appears in two separate runs yields two separate sums.

    Arguments:
        dt: sequence of (text, month) tuples.
        api_url: sentiment endpoint to POST to.

    Returns:
        (pos_list, neu_list): the summed 'pos' and 'neutral' probabilities,
        one entry per run of equal months, in order of appearance. Both
        lists are empty when ``dt`` is empty.

    Raises:
        requests.exceptions.HTTPError: if the API answers with an error status.
    """
    pos_list = []
    neu_list = []
    if len(dt) == 0:
        return pos_list, neu_list

    sum_pos = 0
    sum_neu = 0
    for i in range(len(dt)):
        pos, neu = _fetch_sentiment(dt[i][0], api_url)
        if i > 0 and dt[i][1] == dt[i - 1][1]:
            # Same month as the previous text: keep accumulating.
            sum_pos += pos
            sum_neu += neu
        else:
            # A new month starts: store the finished month (if any) and
            # start a fresh running total.
            if i > 0:
                pos_list.append(sum_pos)
                neu_list.append(sum_neu)
            sum_pos = pos
            sum_neu = neu
    # Store the totals of the final month.
    pos_list.append(sum_pos)
    neu_list.append(sum_neu)

    return pos_list, neu_list

Now that the functions are ready, we run the tuple generator and the monthly sentiment analysis for every year. This process took a very long time because we were frequently rate-limited/throttled by the API and had to wait 24 hours each time before trying again.

In [None]:
# Per-year results, in chronological order starting at 1982:
#   pos_lst[k] / neu_lst[k] -- monthly positive / neutral score sums
#   headline_lst[k]         -- the (headline, month) tuples that were scored
pos_lst = []
neu_lst = []
headline_lst = []
#runs tuple generator for every year, then runs monthly sentiment analysis
#WARNING: TAKES HOURS AND WILL TIME OUT AS THE API BLOCKS YOU
#ONCE TIMED OUT, MUST WAIT ~24 HOURS TO TRY AGAIN

# range (not the Python 2-only xrange) so the cell runs on Python 2 and 3.
for year in range(1982, 2017):
    # cache is indexed by offset from the first year (1982 -> cache[0]).
    h, s = tuple_generator(cache[year - 1982])
    headline_lst.append(h)
    pos, neu = monthly_sentiment(h)
    pos_lst.append(pos)
    neu_lst.append(neu)

In [480]:
# Two independent year -> monthly-score lookups (positive and neutral),
# created empty here and filled in afterwards.
sent_pos, sent_neut = {}, {}

In [None]:
# Map each year to its monthly score lists; the lists are stored by offset
# from 1982. range (not the Python 2-only xrange) works on Python 2 and 3.
for year in range(1982, 2017):
    sent_pos[year] = pos_lst[year - 1982]
    sent_neut[year] = neu_lst[year - 1982]

In [391]:
def standard(dt):
    """Tally the entries of ``dt`` by month.

    Used to standardize scores by the number of articles when needed.

    Arguments:
        dt: sequence of tuples whose second element is the month
            (anything ``int()`` accepts).

    Returns:
        dict mapping each month (as int) to the number of entries with
        that month.
    """
    tally = {}
    for entry in dt:
        month = int(entry[1])
        tally[month] = tally.get(month, 0) + 1
    return tally

In [588]:
def _articles_per_run(headlines):
    """Count the articles in each run of consecutive entries sharing a month."""
    run_lengths = []
    previous_month = None
    for position, entry in enumerate(headlines):
        month = entry[1]
        if position > 0 and month == previous_month:
            run_lengths[-1] += 1
        else:
            run_lengths.append(1)
        previous_month = month
    return run_lengths


def generate_df(headlines, pos, neu, year):
    """Build, save and return the table of monthly sentiment scores for a year.

    Arguments:
        headlines: all (headline, month) tuples of the year, in the order in
            which they were scored.
        pos: summed positive scores, one entry per month.
        neu: summed neutral scores, one entry per month.
        year: the year the scores belong to; used in the CSV file name.

    Returns:
        DataFrame with the columns "positive", "neutral",
        "positive_standard" and "neutral_standard", where the standardized
        columns are the sums divided by the number of articles in that month.

    Side effects:
        Writes the table to "sentiment_scores/SA_scores_<year>.csv", creating
        the directory if it does not exist.

    Raises:
        ValueError: if ``pos``, ``neu`` and the month groups found in
            ``headlines`` do not all have the same length.
    """
    import os

    # Count articles per consecutive month run -- the same grouping that
    # monthly_sentiment uses to build pos/neu -- so each sum is divided by
    # the number of articles it actually contains, even when months do not
    # start at 1 or are not contiguous.
    counts = _articles_per_run(headlines)
    if not (len(counts) == len(pos) == len(neu)):
        raise ValueError(
            "expected one score per month group: %d groups, %d pos, %d neu"
            % (len(counts), len(pos), len(neu))
        )

    # float() guards against integer division on Python 2.
    stand_pos = [pos[i] / float(counts[i]) for i in range(len(pos))]
    stand_neu = [neu[i] / float(counts[i]) for i in range(len(neu))]

    columns = ["positive", "neutral", "positive_standard", "neutral_standard"]
    df = pd.DataFrame(
        {
            "positive": pos,
            "neutral": neu,
            "positive_standard": stand_pos,
            "neutral_standard": stand_neu,
        },
        columns=columns,
    )

    #write to csv
    if not os.path.isdir("sentiment_scores"):
        os.makedirs("sentiment_scores")
    df.to_csv("sentiment_scores/SA_scores_%d.csv" % year)
    #return df for visual inspection
    return df

In [621]:
#example for 1982 (index 0 of each per-year list)
#the neutral scores live in neu_lst; no neg_lst is ever defined
generate_df(headline_lst[0], pos_lst[0], neu_lst[0], 1982)

Unnamed: 0,positive,neutral,positive_standard,neutral_standard
0,46.725509,61.355917,0.486724,0.639124
1,44.735867,54.228944,0.50265,0.609314
2,42.504184,52.014345,0.500049,0.611933
3,33.444549,44.547312,0.499172,0.664885
4,38.650605,48.745805,0.501956,0.633062
5,53.417927,68.43446,0.518621,0.664412
6,54.28053,71.679104,0.507295,0.669898
7,50.077008,66.189158,0.505828,0.668577
8,41.3271,58.086724,0.491989,0.691509
9,60.521036,78.901987,0.50858,0.663042


In [None]:
#loop through and generate CSV's for each year
# range (not the Python 2-only xrange) so the cell runs on Python 2 and 3.
for year in range(1982, 2017):
    # The per-year lists are stored by offset from 1982.
    curr = year - 1982
    generate_df(headline_lst[curr], pos_lst[curr], neu_lst[curr], year)