# Sentiment Analysis using the New York Times API

In [1]:
# Import statements
import csv
import pandas as pd
import numpy as np
import time
%matplotlib inline
import nltk

In [2]:
# Using API tool with key
from nytimesarticle import articleAPI
api = articleAPI('6030f83cb2a273755c7c7145f6b91168:18:73659797')

In [3]:
# Read in the dates to use for sentiment analysis
dframe=pd.read_csv('RFE/data/IYZ_from_2010-01-04_2015-12-02.csv')
date_list = list(dframe['date'])
cleaned_dates = []
dates_dict = {}
for entry in date_list:
    cleaned_dates.append(entry.replace('-', ''))
    dates_dict[entry.replace('-', '')] = 0.0
print len(dates_dict)

1490


In [4]:
# This function takes in a response to the NYT api and parses the articles into a list of dictionaries
def parse_articles(articles, datestamp):  
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['date'] = datestamp
        dic['text'] = i['headline']['main'].encode("utf8")
        if i['snippet'] is not None:
            dic['text'] = dic['text'] + " " + i['snippet'].encode("utf8")     
        news.append(dic)
    return(news) 

In [5]:
# This function accepts a list of dates and returns a dictionary of parsed articles for those dates 
def get_articles(dates,query):
    all_articles = []
    for date in dates:
        articles = api.search(q = query,
                fq = {'news_desk':['Business','Financial','Outlook','Personal Investing','Wealth']},
                begin_date = date,
                end_date = date,
                sort='oldest')
        articles = parse_articles(articles,date)
        if len(articles) != 0:
            all_articles  = all_articles + articles
        time.sleep(0.1)
    return(all_articles)

In [None]:
# Telecommunications_articles
tele_articles = get_articles(cleaned_dates,'economy')
tele_df = pd.DataFrame.from_dict(tele_articles)

In [None]:
# Store dataframe in a CSV for future analysis
tele_df.to_csv('sentiment_data/economy.csv')

In [6]:
# Read the CSV after manual classification of text as positive or negative
tele_df = pd.read_csv('sentiment_data/economy.csv')

In [7]:
tele_df

Unnamed: 0.1,Unnamed: 0,date,text
0,0,20100104,"Lax Oversight Caused Crisis, Bernanke Says Ben..."
1,1,20100104,Fault Lines Remain After Climate Talks Opinion...
2,2,20100104,A Second Stab at Convergence If media companie...
3,3,20100104,News Sites Dabble With a Web Tool for Nudging ...
4,4,20100104,"The Smartphone Makes and Breaks In a decade, c..."
5,5,20100105,Manufacturing Data Helps Invigorate Wall Stree...
6,6,20100105,Cold and Signs of Stronger Economy Drive Oil A...
7,7,20100105,Divergent Views on Signs of Life in the Econom...
8,8,20100105,Credit Suisse Is Accused of Defrauding Investo...
9,9,20100105,Google Moves to Keep Its Lead as Web Goes Mobi...


In [8]:
#From: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
# AFINN is a list of English words rated for valence with an integer
# between minus five (negative) and plus five (positive). The words have
# been manually labeled by Finn Årup Nielsen in 2009-2011.
#afinnfile = open("AFINN/AFINN-111.txt")
#afscores = {}
#for line in afinnfile:
#    term, score = line.split("\t")
#    afscores[term] = int(score)
#afscores

In [9]:
#From: http://www3.nd.edu/~mcdonald/Word_Lists.html
#This page contains some tools that are useful for textual analysis in financial applications and data
#LoughranMcDonald_MasterDictionary_2014
lmmd2014df = pd.read_csv('sentiment_data/LoughranMcDonald_MasterDictionary_2014.csv')
#afscores = {}
#for line in afinnfile:
#    term, score = line.split("\t")
#    afscores[term] = int(score)
#afscores
lmmd2014df = lmmd2014df[((lmmd2014df.Positive!=0) | (lmmd2014df.Negative!=0))][['Word','Positive','Negative']]
lmmd2014df.Word = lmmd2014df.Word.str.lower()
lmmd2014df['Sentiment'] = (lmmd2014df.Positive-lmmd2014df.Negative)/np.abs(lmmd2014df.Positive-lmmd2014df.Negative)

lmscores = {}
for index,row in lmmd2014df.iterrows():
    lmscores[row['Word']] = int(row['Sentiment'])
lmscores

{'restates': -1,
 'interference': -1,
 'dissolution': -1,
 'desirable': 1,
 'obstruction': -1,
 'protest': -1,
 'controversial': -1,
 'shrinkages': -1,
 'integrity': 1,
 'defraud': -1,
 'poorly': -1,
 'demotes': -1,
 'revoking': -1,
 'violate': -1,
 'obstructions': -1,
 'crises': -1,
 'nullifications': -1,
 'condemns': -1,
 'demoted': -1,
 'immature': -1,
 'distorts': -1,
 'collaborate': 1,
 'unsold': -1,
 'misunderstanding': -1,
 'distort': -1,
 'redact': -1,
 'insolvent': -1,
 'malfunctioned': -1,
 'disturb': -1,
 'exaggerate': -1,
 'persisted': -1,
 'delightfully': 1,
 'abolishing': -1,
 'monopolized': -1,
 'enjoy': 1,
 'disclose': -1,
 'picketed': -1,
 'force': -1,
 'diverting': -1,
 'warns': -1,
 'infringes': -1,
 'complicates': -1,
 'resignations': -1,
 'incompatible': -1,
 'errors': -1,
 'restatements': -1,
 'deliberate': -1,
 'worsen': -1,
 'revolutionize': 1,
 'profitability': 1,
 'overages': -1,
 'uneconomically': -1,
 'improperly': -1,
 'misjudged': -1,
 'invalidation': -1,


In [10]:
def calc_sent(text):
    score = 0
    for word in text.split(" "):
        if word in lmscores:
            score += lmscores.get(word,0)
    #rescale score to 0-1 range
    return (score)

In [None]:
print calc_sent("Firms Selling Apps for Simple Phones Software companies want to sell functions similar to those found on the iPhone to users of much simpler phones.")
print calc_sent("Nuance Communications Buys SpinVox, Which Converts Voice Mail to Text SpinVox was sold to Nuance Communications for $102.5 million, far less than the company was thought to be worth in recent years.")
print calc_sent("AT&T to Sell Smartphones Using the Android System The wireless carrier will offer five new devices by makers including Dell, Motorola and HTC.")
print calc_sent("Chinese Looking in America, but Not Buying As the United States housing market continues to struggle, potential buyers from China are checking out the market in increasing numbers.")
print calc_sent("Under Low-Key Chief, Canal Plus Prospers The French cable television group has defied skeptics, reversed the damage from Vivendi's failed expansion and insulated itself against the advertising recession.")

In [11]:
tele_df.rename(columns={'Unnamed: 0':'sentiment'}, inplace=True)
tele_df['sentiment'] = tele_df['text'].apply(calc_sent)
tele_df

Unnamed: 0,sentiment,date,text
0,1,20100104,"Lax Oversight Caused Crisis, Bernanke Says Ben..."
1,-1,20100104,Fault Lines Remain After Climate Talks Opinion...
2,0,20100104,A Second Stab at Convergence If media companie...
3,-1,20100104,News Sites Dabble With a Web Tool for Nudging ...
4,-2,20100104,"The Smartphone Makes and Breaks In a decade, c..."
5,0,20100105,Manufacturing Data Helps Invigorate Wall Stree...
6,0,20100105,Cold and Signs of Stronger Economy Drive Oil A...
7,0,20100105,Divergent Views on Signs of Life in the Econom...
8,-2,20100105,Credit Suisse Is Accused of Defrauding Investo...
9,1,20100105,Google Moves to Keep Its Lead as Web Goes Mobi...


In [12]:
if tele_df['sentiment'].max() > -tele_df['sentiment'].min():
    maxrange = tele_df['sentiment'].max()
else:
    maxrange = -tele_df['sentiment'].min()
print maxrange
tele_df['sentiment'] = 100*tele_df['sentiment']/maxrange
tele_df

6


Unnamed: 0,sentiment,date,text
0,16.666667,20100104,"Lax Oversight Caused Crisis, Bernanke Says Ben..."
1,-16.666667,20100104,Fault Lines Remain After Climate Talks Opinion...
2,0.000000,20100104,A Second Stab at Convergence If media companie...
3,-16.666667,20100104,News Sites Dabble With a Web Tool for Nudging ...
4,-33.333333,20100104,"The Smartphone Makes and Breaks In a decade, c..."
5,0.000000,20100105,Manufacturing Data Helps Invigorate Wall Stree...
6,0.000000,20100105,Cold and Signs of Stronger Economy Drive Oil A...
7,0.000000,20100105,Divergent Views on Signs of Life in the Econom...
8,-33.333333,20100105,Credit Suisse Is Accused of Defrauding Investo...
9,16.666667,20100105,Google Moves to Keep Its Lead as Web Goes Mobi...


In [13]:
bydate = tele_df.groupby('date')
temp_df = bydate.aggregate(np.mean).reset_index()
temp_df

Unnamed: 0,date,sentiment
0,20100104,-10.000000
1,20100105,-3.333333
2,20100106,-7.142857
3,20100107,-3.703704
4,20100108,-1.666667
5,20100111,-4.166667
6,20100112,-11.111111
7,20100113,-1.666667
8,20100114,0.000000
9,20100115,-2.083333


In [14]:
for index, row in temp_df.iterrows():
    dates_dict[str(int(row['date']))] = row['sentiment']
print len(dates_dict)

1490


In [15]:
dates_dict

{'20120405': -9.5238095238095237,
 '20120404': 0.0,
 '20120403': 1.8518518518518521,
 '20120402': 0.0,
 '20120409': -4.166666666666667,
 '20150630': -22.916666666666668,
 '20130828': 11.111111111111112,
 '20120531': -9.5238095238095237,
 '20120530': -2.7777777777777781,
 '20130829': -13.333333333333334,
 '20110321': 0.0,
 '20150522': 0.0,
 '20110323': -22.222222222222225,
 '20110322': 0.0,
 '20110325': -14.814814814814817,
 '20110324': -14.285714285714286,
 '20110329': -5.5555555555555562,
 '20110328': 0.0,
 '20150529': 0.0,
 '20150528': -10.0,
 '20121203': 0.0,
 '20121206': -8.3333333333333339,
 '20121207': -4.7619047619047619,
 '20121204': -13.333333333333334,
 '20121205': -11.111111111111111,
 '20120330': -13.888888888888891,
 '20110929': -6.666666666666667,
 '20110928': -12.5,
 '20110923': -9.2592592592592595,
 '20110922': -8.3333333333333339,
 '20110921': 1.6666666666666667,
 '20110920': -13.333333333333334,
 '20110927': 0.0,
 '20110926': 2.3809523809523809,
 '20121119': -4.166666

In [16]:
sent_df = pd.DataFrame(dates_dict.items(), columns=['date', 'sentiment']).sort('date')
print sent_df.shape
sent_df

(1490, 2)


Unnamed: 0,date,sentiment
764,20100104,-10.000000
765,20100105,-3.333333
766,20100106,-7.142857
767,20100107,-3.703704
757,20100108,-1.666667
860,20100111,-4.166667
859,20100112,-11.111111
858,20100113,-1.666667
862,20100114,0.000000
861,20100115,-2.083333


In [17]:
# Store dataframe in a CSV for prediction
sent_df.to_csv('sentiment_data/economy-scores.csv')