# Sentiment Analysis using the New York Times API

In [40]:
# Import statements
import csv
import pandas as pd
import time
import re
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
%matplotlib inline

In [2]:
# Create the NYT Article Search client used by the cells below.
# The key is read from the NYT_API_KEY environment variable when it is set, so
# a personal or rotated key can be supplied without editing this file; the
# original literal key remains the fallback so existing runs behave the same.
# NOTE(review): a credential committed to source should be rotated.
import os
from nytimesarticle import articleAPI
api = articleAPI(os.environ.get('NYT_API_KEY', '51ae5c44eb962681341060ede81808b8:11:73610715'))

In [3]:
# Load the dates to analyse from the IYZ CSV and strip the dashes from each
# one; the dash-free strings are what get passed to the article search later.
dframe = pd.read_csv('data/IYZ.csv')
date_list = list(dframe['date'])
cleaned_dates = [entry.replace('-', '') for entry in date_list]

In [4]:
# Parse a response from the NYT API into a list of dictionaries.
def parse_articles(articles, datestamp):
    """Flatten an article-search response into ``{'date', 'text'}`` records.

    Args:
        articles: Decoded API response; the documents are read from
            ``articles['response']['docs']``.
        datestamp: Value stored unchanged under ``'date'`` in every record.

    Returns:
        A list with one dict per document. ``'text'`` is the main headline,
        followed by a space and the snippet when a snippet is present.

    Raises:
        KeyError: If the response or a document lacks the
            ``response``/``docs``/``headline``/``main`` keys.
    """
    news = []
    for doc in articles['response']['docs']:
        # Keep the text as str instead of encoding to UTF-8 bytes: on
        # Python 3, bytes cannot be concatenated with the " " separator, so
        # the previous code raised TypeError for every document that had a
        # snippet and returned bytes for the ones that did not.
        text = doc['headline']['main']
        # A missing 'snippet' key is treated the same as an explicit None.
        snippet = doc.get('snippet')
        if snippet is not None:
            text = text + " " + snippet
        news.append({'date': datestamp, 'text': text})
    return news

In [39]:
# Search the NYT API once per date and gather the parsed articles into one list.
def get_articles(dates,query):
    """Return the parsed articles matching *query* for each date in *dates*.

    Each date is used as both the begin and end date of a single search,
    restricted to a fixed set of business-related news desks and sorted
    oldest first. The response is handed to ``parse_articles`` together with
    the date, and the resulting records are collected in request order.

    Args:
        dates: Iterable of date values passed to the API as-is.
        query: Search term passed as the ``q`` parameter.

    Returns:
        A flat list of the records produced by ``parse_articles``.
    """
    collected = []
    for day in dates:
        # The filter dict is built anew for every request so that each call
        # receives its own object.
        response = api.search(
            q=query,
            fq={'news_desk': ['Business', 'Financial', 'Jobs', 'Retail', 'Outlook',
                              'Personal Investing', 'Technology', 'Wealth']},
            begin_date=day,
            end_date=day,
            sort='oldest',
        )
        collected.extend(parse_articles(response, day))
        # Short pause between consecutive requests.
        time.sleep(0.1)
    return collected

In [13]:
# Fetch the articles mentioning Verizon for every cleaned date and load the
# resulting records into a dataframe.
verizon_articles = get_articles(cleaned_dates, 'Verizon')
verizon_df = pd.DataFrame(verizon_articles)

In [7]:
# Store the dataframe in a CSV for future analysis.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column - confirm that is intended.
verizon_df.to_csv('data/verizon.csv')

In [6]:
# Read the CSV back after the text has been manually classified as positive
# or negative. This loads the same path the dataframe was written to, so it
# replaces the in-memory verizon_df with whatever the file now contains.
verizon_df = pd.read_csv('data/verizon.csv')

In [19]:
# Convert this dataframe back into a dictionary for faster processing.
# With no arguments, to_dict() returns {column name: {row index: value}},
# so the text column is reached through clean_dict['text'].values().
clean_dict = verizon_df.to_dict()

In [36]:
# Tokenize and clean the text
def make_sentence(word_arr):
    """Concatenate the words in *word_arr*, appending one space after each.

    Because every word is followed by a space, a non-empty result ends with
    a trailing space; an empty *word_arr* yields the empty string.
    """
    return "".join(token + " " for token in word_arr)

# Build clean_arr: one cleaned sentence per entry of the text column.
text_arr = clean_dict['text'].values()
# Matches every character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')
clean_arr = []

for sentence in text_arr:
    words = sentence.split()
    clean_sentence = []
    for word in words:
        # Drop tokens shorter than four characters. The previous code called
        # words.remove(word) while iterating over words, which skipped the
        # token following each short word and still appended the short word
        # itself; skipping with continue filters exactly the short tokens.
        if len(word) < 4:
            continue
        clean_word = regex.sub('', word)
        # Tokens made only of digits/punctuation become empty after cleaning;
        # leaving them out avoids runs of spaces in the rebuilt sentence.
        if clean_word:
            clean_sentence.append(clean_word)
    clean_arr.append(make_sentence(clean_sentence))

In [38]:
len(clean_arr)

839