# Sentiment Analysis using the New York Times API

In [49]:
# Import statements
import csv
import os
import time

import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Using API tool with key
from nytimesarticle import articleAPI
# The key is read from the environment so that it is never stored in the notebook.
# Export NYT_API_KEY before starting the kernel; if it is unset this line raises KeyError.
# Any key that was previously committed in this cell should be treated as exposed and rotated.
api = articleAPI(os.environ['NYT_API_KEY'])

In [66]:
# Read in the dates to use for sentiment analysis
dframe = pd.read_csv('data/IYZ.csv')
date_list = list(dframe['date'])

# Strip the dashes from every date string.
cleaned_dates = [raw_date.replace('-', '') for raw_date in date_list]

# Every date starts at 0.5; later cells overwrite the days that have a computed score.
dates_dict = dict.fromkeys(cleaned_dates, 0.5)
print(len(dates_dict))

1483


In [4]:
# This function takes in a response to the NYT api and parses the articles into a list of dictionaries
def parse_articles(articles, datestamp):
    """Flatten one NYT Article Search response into a list of row dicts.

    Args:
        articles: Decoded API response. It must contain
            ``articles['response']['docs']``, and every doc must provide
            ``doc['headline']['main']``.
        datestamp: Value stored under ``'date'`` in every returned dict.

    Returns:
        A list with one dict per doc. Each dict has the keys ``'date'`` and
        ``'text'``; ``'text'`` is the UTF-8 encoded headline, followed by a
        space and the snippet when the doc carries a non-None ``'snippet'``.
    """
    news = []
    for doc in articles['response']['docs']:
        text = doc['headline']['main']
        # .get() covers docs that omit the field entirely as well as explicit nulls.
        snippet = doc.get('snippet')
        if snippet is not None:
            text = text + u" " + snippet
        # Encode once, after joining: mixing already-encoded bytes with a text
        # literal raises TypeError on Python 3.
        news.append({'date': datestamp, 'text': text.encode("utf8")})
    return news

In [5]:
# This function accepts a list of dates and returns a flat list of parsed articles for those dates
def get_articles(dates,query):
    """Search the NYT API once per date and collect the parsed results.

    Uses the module-level ``api`` client and ``parse_articles``.

    Args:
        dates: Iterable of date values; each one is passed as both
            ``begin_date`` and ``end_date`` so a request covers a single day.
        query: Search term forwarded as ``q``.

    Returns:
        A single list holding the dicts produced by ``parse_articles`` for
        every date, in the order the dates were given.
    """
    all_articles = []
    for date in dates:
        response = api.search(q = query,
                fq = {'news_desk':['Business','Financial', 'Jobs','Retail','Outlook','Personal Investing','Technology','Wealth']},
                begin_date = date,
                end_date = date,
                sort='oldest')
        # extend() grows the accumulator in place; the old `list + list` form
        # copied everything collected so far on each date. An empty result is
        # a no-op, so no length check is needed.
        all_articles.extend(parse_articles(response, date))
        # Short pause between requests (presumably to respect the API rate limit).
        time.sleep(0.1)
    return all_articles

In [7]:
# Telecommunications_articles
# Fetch every article matching 'Telecommunications' for the dates of interest,
# then turn the list of {'date', 'text'} dicts into a table (one row per article).
tele_articles = get_articles(cleaned_dates, 'Telecommunications')
tele_df = pd.DataFrame(tele_articles)

In [8]:
# Store dataframe in a CSV for future analysis
# Persist the fetched articles. The row index is written as the first,
# unnamed CSV column because to_csv is called with its defaults.
raw_articles_csv = 'data/tele.csv'
tele_df.to_csv(raw_articles_csv)

In [26]:
# Read the CSV after manual classification of text as positive or negative
# Reload the article table from disk, replacing the in-memory frame.
labelled_csv_path = 'data/tele.csv'
tele_df = pd.read_csv(labelled_csv_path)

In [27]:
# Show the reloaded frame as this cell's output.
# NOTE(review): a bare frame asks the notebook to render the whole table; tele_df.head() may be enough here.
tele_df

Unnamed: 0.1,Unnamed: 0,date,text
0,0,20100104,Firms Selling Apps for Simple Phones Software ...
1,1,20100104,"Nuance Communications Buys SpinVox, Which Conv..."
2,2,20100107,AT&T to Sell Smartphones Using the Android Sys...
3,3,20100108,"Chinese Looking in America, but Not Buying As ..."
4,4,20100111,"Under Low-Key Chief, Canal Plus Prospers The F..."
5,5,20100113,Google Would Abandon a Lucrative Market Twenty...
6,6,20100114,Tough Confirmation for E.U.’s Ex-Antitrust Chi...
7,7,20100114,Billionaire Aims to Unite 3 Telecom Firms Amér...
8,8,20100122,Indictments Against 7 in Galleon Insider Case ...
9,9,20100125,Bankers Face a Harsh Spotlight in Davos With a...


In [79]:
#From: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
# AFINN is a list of English words rated for valence with an integer
# between minus five (negative) and plus five (positive). The words have
# been manually labeled by Finn Årup Nielsen in 2009-2011.
# Build the word -> valence lookup used by the sentiment scorer.
scores = {}
# The with-block closes the file handle even if a line fails to parse.
with open("AFINN/AFINN-111.txt") as afinnfile:
    for line in afinnfile:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise break the two-value unpack below.
        if not line.strip():
            continue
        # Each record is "<term>\t<integer score>".
        term, score = line.split("\t")
        scores[term] = int(score)
scores

{'limited': -1,
 'suicidal': -2,
 'pardon': 2,
 'desirable': 2,
 'protest': -2,
 'lurking': -1,
 'controversial': -2,
 'hating': -3,
 'ridiculous': -3,
 'hate': -3,
 'aggression': -2,
 'increase': 1,
 'regretted': -2,
 'violate': -2,
 'granting': 1,
 'attracted': 1,
 'poorest': -2,
 'scold': -2,
 'bailout': -2,
 'sorry': -1,
 'regrets': -2,
 'struck': -1,
 'misreporting': -2,
 'vociferous': -1,
 'lurk': -1,
 'misunderstanding': -2,
 'distort': -2,
 'stolen': -2,
 'gratification': 2,
 'uncertain': -1,
 'stabbed': -2,
 'screaming': -2,
 'courageous': 2,
 'disturb': -2,
 'exaggerate': -2,
 'harried': -2,
 'solution': 1,
 'nigger': -5,
 'pardons': 2,
 'quaking': -2,
 'monopolized': -2,
 'censors': -2,
 'triumph': 4,
 'enjoy': 2,
 'shithead': -4,
 'tired': -2,
 'warns': -2,
 'landmark': 2,
 'elegant': 2,
 'fabulous': 4,
 'rigorous': 3,
 'emptiness': -1,
 'loathing': -3,
 'errors': -2,
 'hide': -1,
 'wreck': -2,
 'desirous': 2,
 'integrity': 2,
 'beaten': -2,
 'jocular': 2,
 'poison': -2,
 '

In [45]:
def calc_sent(text, lexicon=None):
    """Score a piece of text with a word-valence lexicon, rescaled to [0, 1].

    The text is split on whitespace and every token is looked up verbatim,
    so matching is case-sensitive and a token with attached punctuation
    (e.g. ``"recession."``) does not match ``"recession"``.

    Args:
        text: String to score.
        lexicon: Optional mapping of word -> integer valence. Defaults to the
            module-level ``scores`` dict.

    Returns:
        ``sum_of_valences / 10 + 0.5`` clamped to the closed interval
        [0.0, 1.0]; 0.5 means no net sentiment was found.
    """
    if lexicon is None:
        lexicon = scores
    # split() with no argument also separates on tabs/newlines and drops
    # the empty tokens produced by repeated spaces.
    total = sum(lexicon.get(word, 0) for word in text.split())
    # Rescale so that a net valence of -5..+5 maps onto 0..1.
    rescaled = (total / 10.0) + 0.5
    # Texts with several strong words can exceed +/-5; clamp so the value
    # really stays inside the documented 0-1 range.
    return min(1.0, max(0.0, rescaled))

In [46]:
# Spot-check the scorer on the first five fetched articles (headline + snippet).
sample_texts = [
    "Firms Selling Apps for Simple Phones Software companies want to sell functions similar to those found on the iPhone to users of much simpler phones.",
    "Nuance Communications Buys SpinVox, Which Converts Voice Mail to Text SpinVox was sold to Nuance Communications for $102.5 million, far less than the company was thought to be worth in recent years.",
    "AT&T to Sell Smartphones Using the Android System The wireless carrier will offer five new devices by makers including Dell, Motorola and HTC.",
    "Chinese Looking in America, but Not Buying As the United States housing market continues to struggle, potential buyers from China are checking out the market in increasing numbers.",
    "Under Low-Key Chief, Canal Plus Prospers The French cable television group has defied skeptics, reversed the damage from Vivendi's failed expansion and insulated itself against the advertising recession.",
]
for sample in sample_texts:
    print(calc_sent(sample))

0.6
0.7
0.5
0.5
0.0


In [47]:
# Reuse the leftover 'Unnamed: 0' column as the 'sentiment' column, then fill it
# with the lexicon score of each article's text.
tele_df = tele_df.rename(columns={'Unnamed: 0': 'sentiment'})
article_scores = tele_df['text'].apply(calc_sent)
tele_df['sentiment'] = article_scores
tele_df

Unnamed: 0,sentiment,date,text
0,0.6,20100104,Firms Selling Apps for Simple Phones Software ...
1,0.7,20100104,"Nuance Communications Buys SpinVox, Which Conv..."
2,0.5,20100107,AT&T to Sell Smartphones Using the Android Sys...
3,0.5,20100108,"Chinese Looking in America, but Not Buying As ..."
4,0.0,20100111,"Under Low-Key Chief, Canal Plus Prospers The F..."
5,0.4,20100113,Google Would Abandon a Lucrative Market Twenty...
6,0.5,20100114,Tough Confirmation for E.U.’s Ex-Antitrust Chi...
7,0.5,20100114,Billionaire Aims to Unite 3 Telecom Firms Amér...
8,0.3,20100122,Indictments Against 7 in Galleon Insider Case ...
9,0.4,20100125,Bankers Face a Harsh Spotlight in Davos With a...


In [70]:
# Average the per-article scores for each day.
bydate = tele_df.groupby('date')
# Select 'sentiment' explicitly: averaging the whole group would also try to
# aggregate the free-text column, which pandas no longer drops silently.
temp_df = bydate['sentiment'].mean().reset_index()
temp_df

Unnamed: 0,date,sentiment
0,20100104,0.650000
1,20100107,0.500000
2,20100108,0.500000
3,20100111,0.000000
4,20100113,0.400000
5,20100114,0.500000
6,20100122,0.300000
7,20100125,0.400000
8,20100126,0.475000
9,20100127,0.600000


In [71]:
# Replace the 0.5 default with the daily mean for every date that has one.
# The int() cast strips any float formatting from the date before it becomes a string key.
dates_dict.update(
    (str(int(daily['date'])), daily['sentiment'])
    for _, daily in temp_df.iterrows()
)
print(len(dates_dict))

1483


In [72]:
# Inspect the date -> sentiment mapping after the daily means were written into it.
# NOTE(review): this echoes every entry of the dict; consider showing only a few items.
dates_dict

{'20120405': 0.20000000000000001,
 '20120404': 0.5,
 '20120403': 0.5,
 '20120402': 0.5,
 '20120409': 0.46666666666666662,
 '20150630': 0.5,
 '20130828': 0.5,
 '20120531': 0.5,
 '20120530': 0.5,
 '20130829': 0.5,
 '20110321': 0.56666666666666676,
 '20150522': 0.5,
 '20110323': 0.45000000000000001,
 '20110322': 0.56666666666666665,
 '20110325': 0.55000000000000004,
 '20110324': 0.5,
 '20110329': 0.34999999999999998,
 '20110328': 0.5,
 '20150529': 0.5,
 '20150528': 0.40000000000000002,
 '20121203': 0.53333333333333333,
 '20121206': 0.5,
 '20121207': 0.40000000000000002,
 '20121204': 0.5,
 '20121205': 0.5,
 '20120330': 0.5,
 '20110929': 0.5,
 '20110928': 0.5,
 '20110923': 0.5,
 '20110922': 0.5,
 '20110921': 0.5,
 '20110920': 0.5,
 '20110927': 0.5,
 '20110926': 0.5,
 '20121119': 0.5,
 '20121116': 0.5,
 '20121114': 0.099999999999999978,
 '20121115': 0.5,
 '20121112': 0.5,
 '20121113': 0.5,
 '20131104': 0.5,
 '20131105': 0.5,
 '20131106': 0.69999999999999996,
 '20131107': 0.5,
 '20131101': 0.

In [78]:
# Turn the date -> sentiment mapping into a two-column frame ordered by date.
# list(...) materialises the items so the constructor receives a plain list of pairs
# (dict.items() is a view on Python 3), and sort_values replaces DataFrame.sort,
# which was removed from pandas.
sent_df = pd.DataFrame(list(dates_dict.items()), columns=['date', 'sentiment']).sort_values('date')
print(sent_df.shape)
sent_df

(1483, 2)


Unnamed: 0,date,sentiment
761,20100104,0.650000
762,20100105,0.500000
763,20100106,0.500000
764,20100107,0.500000
754,20100108,0.500000
857,20100111,0.000000
856,20100112,0.500000
855,20100113,0.400000
859,20100114,0.500000
858,20100115,0.500000


In [80]:
# Store dataframe in a CSV for prediction
# Write the per-date sentiment table to disk; to_csv is called with its defaults,
# so the frame's index is written as the first column.
sentiment_csv_path = 'data/tele_sent.csv'
sent_df.to_csv(sentiment_csv_path)