# Sentiment Analysis using the New York Times API

In [3]:
# Import statements
import csv
import os
import re
import time

import nltk
import numpy as np
import pandas as pd

%matplotlib inline

In [4]:
# Using API tool with key
from nytimesarticle import articleAPI

# The key is read from the environment so the credential never lives in the
# notebook or its version history. Export NYT_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hardcoded here should be treated as
# exposed and rotated.
nyt_api_key = os.environ.get('NYT_API_KEY')
if not nyt_api_key:
    raise RuntimeError(
        "Set the NYT_API_KEY environment variable to your New York Times API key")
api = articleAPI(nyt_api_key)

In [5]:
# Read in the dates to use for sentiment analysis.
# Dashes are stripped from each date string (presumably YYYY-MM-DD -> YYYYMMDD)
# because the cleaned values are later passed to the API as begin_date/end_date.
dframe = pd.read_csv('data/IYZ.csv')
date_list = list(dframe['date'])
cleaned_dates = [entry.replace('-', '') for entry in date_list]

In [6]:
# This function takes in a response to the NYT api and parses the articles into a list of dictionaries
def parse_articles(articles, datestamp):
    """Flatten an NYT Article Search response into a list of records.

    Args:
        articles: decoded API response; documents are read from
            articles['response']['docs'].
        datestamp: value stored under 'date' on every returned record.

    Returns:
        A list of dicts with keys 'date' and 'text'. 'text' is the UTF-8
        encoded headline, followed by a space and the snippet when the
        document has one.
    """
    news = []
    for doc in articles['response']['docs']:
        text = doc['headline']['main']
        # .get(): a document without a 'snippet' key is treated like one whose
        # snippet is null instead of raising KeyError.
        snippet = doc.get('snippet')
        if snippet is not None:
            text = text + " " + snippet
        # Encode once, after joining. Encoding each part first and then adding
        # the " " literal mixes bytes and str, which fails on Python 3; the
        # resulting bytes are the same on Python 2.
        news.append({'date': datestamp, 'text': text.encode("utf8")})
    return news

In [7]:
# This function accepts a list of dates and returns a list of parsed articles for those dates
def get_articles(dates,query):
    """Search the NYT Financial news desk once per date and pool the results.

    Args:
        dates: iterable of date values; each is used as both begin_date and
            end_date of one search.
        query: search term passed to the API as `q`.

    Returns:
        One flat list holding the records parse_articles produced for every
        date, in the order the dates were given.
    """
    collected = []
    for day in dates:
        response = api.search(
            q=query,
            fq={'news_desk': ['Financial']},
            begin_date=day,
            end_date=day,
            sort='oldest',
        )
        parsed = parse_articles(response, day)
        if parsed:
            collected.extend(parsed)
        # Short pause after every request (presumably to respect the API rate limit).
        time.sleep(0.1)
    return collected

In [8]:
# Verizon articles: one search per cleaned date, then tabulate the parsed
# records (each record carries a 'date' and a 'text' entry).
stock_articles = get_articles(dates=cleaned_dates, query='Verizon')
stock_df = pd.DataFrame.from_dict(stock_articles)

In [9]:
# Persist the article table as CSV so it can be analysed later without
# querying the API again.
stock_df.to_csv(path_or_buf='data/stock.csv')

In [13]:
# Read the CSV after manual classification of text as positive or negative.
# Later cells read the 'text' and 'sentiment' columns of this frame.
# NOTE(review): presumably data/verizon.csv is a hand-labelled copy of
# data/stock.csv -- confirm where it comes from.
verizon_df = pd.read_csv('data/verizon.csv')

[]

In [7]:
# Convert this dataframe back into a dictionary for faster processing.
# With no arguments, to_dict() produces {column: {row_label: value}}; later
# cells rely on that shape when they read clean_dict['text'] and
# clean_dict['sentiment'][i].
clean_dict = verizon_df.to_dict()

In [8]:
# Tokenize and clean the text
def make_sentence(word_arr):
    """Concatenate the given words, appending one space after each word.

    The result therefore ends with a trailing space, and an empty input
    gives an empty string.
    """
    return "".join(token + " " for token in word_arr)

# Take the texts in ascending row-label order. The order of a plain dict's
# .values() is not something to rely on, yet later cells pair clean_arr[i]
# with clean_dict['sentiment'][i], so position i must correspond to row i.
text_by_row = clean_dict['text']
text_arr = [text_by_row[row] for row in sorted(text_by_row)]
regex = re.compile('[^a-zA-Z]')
clean_arr = []

for sentence in text_arr:
    # Words shorter than four characters (measured before any stripping) are
    # replaced by an empty string; all other words lose every non-letter
    # character. Empty entries only add extra spaces, which later .split()
    # calls ignore.
    clean_sentence = [
        regex.sub('', word) if len(word) >= 4 else ''
        for word in sentence.split()
    ]
    clean_arr.append(make_sentence(clean_sentence))

In [31]:
# Separate data into training, validation, and test sets and make NLTK input tuples

# Split points, chosen by hand for now: the loops below send rows
# [0, index_training) to training and [index_training, index_validation)
# to validation.
index_training, index_validation = 520, 721

# Containers for the (token list, label) tuples built below.
training_tups, validation_tups = [], []

def get_sentiment(list_index):
    """Return the label for one row of the hand-classified data.

    Args:
        list_index: row label to look up in clean_dict['sentiment'].

    Returns:
        "positive" when the stored sentiment equals 1, otherwise "negative".
    """
    # Index with the parameter. Reading the module-level loop variable `i`
    # only gave the right answer when the caller happened to pass that same `i`.
    if clean_dict['sentiment'][list_index] == 1:
        return "positive"
    return "negative"
    
# Fill both lists with (lower-cased token list, label) tuples: rows
# [0, index_training) go to training_tups, rows
# [index_training, index_validation) go to validation_tups.
for destination, (first_row, stop_row) in (
        (training_tups, (0, index_training)),
        (validation_tups, (index_training, index_validation))):
    for i in range(first_row, stop_row):
        tokens = [token.strip().lower() for token in clean_arr[i].split()]
        destination.append((tokens, get_sentiment(i)))

In [75]:
# Creating the Naive Bayes Classifier

def get_words_in_text(text):
    """Flatten (words, sentiment) pairs into one list of words.

    Words keep their original order and duplicates are preserved; the
    sentiment half of each pair is ignored.
    """
    return [word for words, _sentiment in text for word in words]

def get_word_features(wordlist):
    """Return the keys of an nltk.FreqDist built from `wordlist`.

    The keys are the distinct words in the list, so this is the vocabulary
    used for feature extraction.
    """
    return nltk.FreqDist(wordlist).keys()

# Vocabulary for the classifiers: every distinct word in the training tuples.
training_words = get_words_in_text(training_tups)
word_features = get_word_features(training_words)

def extract_features(document):
    """Build the feature dict for one document.

    Args:
        document: iterable of the document's words.

    Returns:
        A dict with one 'contains(<word>)' key per entry of the module-level
        word_features, mapped to True when that word occurs in the document.
    """
    present = set(document)
    return {
        'contains(%s)' % vocab_word: (vocab_word in present)
        for vocab_word in word_features
    }

# Wrap both tuple lists so extract_features is applied to the token half of
# every (tokens, label) pair, then fit Naive Bayes on the training portion.
training_set, validation_set = [
    nltk.classify.apply_features(extract_features, labelled)
    for labelled in (training_tups, validation_tups)
]
nb_classifier = nltk.NaiveBayesClassifier.train(training_set)

In [87]:
# View the most informative features of NB Classifier and compute accuracy

# show_most_informative_features prints its own table and returns None, so it
# is called directly; wrapping it in print only adds a stray "None" line.
nb_classifier.show_most_informative_features(10)
# Score the classifier trained above. The name `classifier` is never defined
# in this notebook, so the previous call depended on leftover kernel state.
print("Accuracy of NB Classifier: %f" % nltk.classify.accuracy(nb_classifier, validation_set))

Most Informative Features
         contains(shows) = True           negati : positi =      6.0 : 1.0
         contains(costs) = True           negati : positi =      6.0 : 1.0
         contains(users) = True           positi : negati =      4.9 : 1.0
       contains(goldman) = True           negati : positi =      4.7 : 1.0
     contains(questions) = True           negati : positi =      4.7 : 1.0
     contains(consumers) = True           negati : positi =      4.7 : 1.0
       contains(through) = True           positi : negati =      4.6 : 1.0
      contains(motorola) = True           positi : negati =      4.4 : 1.0
         contains(sachs) = True           negati : positi =      4.1 : 1.0
        contains(strike) = True           negati : positi =      4.1 : 1.0
None
Accuracy of NB Classifier: 0.447761


In [103]:
# Train with Maximum Entropy Classifier, on the same training set as the
# Naive Bayes model and with the iteration count capped.
# NOTE(review): the recorded run reports a log likelihood of nan after the
# first iteration and the weights printed further down are all nan -- the
# fitted model looks degenerate; confirm before relying on its accuracy.
maxent_iterations = 100
me_classifier = nltk.MaxentClassifier.train(training_set, max_iter=maxent_iterations)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.487
         Final               nan        0.487


In [89]:
# View the most informative features of ME Classifier and compute accuracy

# show_most_informative_features prints its own listing and returns None, so
# it is called directly; wrapping it in print only adds a stray "None" line.
me_classifier.show_most_informative_features(10)
print("Accuracy of ME Classifier: %f" % nltk.classify.accuracy(me_classifier, validation_set))

     nan contains(corporate)==False and label is 'positive'
     nan contains(pressed)==False and label is 'positive'
     nan contains(reiterated)==False and label is 'positive'
     nan contains(works)==False and label is 'positive'
     nan contains(components)==False and label is 'positive'
     nan contains(musicrelated)==False and label is 'positive'
     nan contains(rapidly)==False and label is 'positive'
     nan contains(evolve)==False and label is 'positive'
     nan contains(repurchase)==False and label is 'positive'
     nan contains(hewlettpackards)==False and label is 'positive'
None
Accuracy of ME Classifier: 0.582090


In [126]:
# Calculate sentiment scores for the entire dataset and store in CSV

def _signed_confidence(tokens):
    """Score one tokenised headline with the Naive Bayes classifier.

    The magnitude is the probability of the most likely label times 100; the
    sign is positive when that label is 'positive' and negative otherwise.
    """
    distribution = nb_classifier.prob_classify(extract_features(tokens))
    top_label = distribution.max()
    confidence = distribution.prob(top_label)
    if top_label == 'positive':
        return confidence * 100
    return -100. * confidence


sentiment_scores = [
    _signed_confidence(headline.lower().split()) for headline in clean_arr
]

# NOTE(review): scores are paired with cleaned_dates purely by position, and
# the dict keeps a single score per date. cleaned_dates comes from
# data/IYZ.csv while the scores follow the rows of data/verizon.csv --
# confirm the two sequences really line up one-to-one.
sentiment_dict = dict(zip(cleaned_dates, sentiment_scores))
sentiment_df = pd.DataFrame.from_dict(sentiment_dict.items())
sentiment_df.to_csv('data/sentiment-scores.csv')
sentiment_df.head(5)

Unnamed: 0,0,1
0,20100108,99.97196
1,20120405,-57.750428
2,20120404,-99.516695
3,20120403,-99.978413
4,20120402,60.633337
