In [2]:
# Import
import json
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# from fastai.structured import add_datepart

# Linear Regression
from sklearn.linear_model import LogisticRegression

# NLTK
from nltk.sentiment import SentimentAnalyzer
import unicodedata
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Data Collecting

In [3]:
# Config
symbl = 'AAPL'
apiKey = 'MSXSR0YHC991CZN6'

# Fetch Historical Data
df = pd.read_csv('https://www.alphavantage.co/query?datatype=csv&function=TIME_SERIES_DAILY&symbol=' + symbl + '&outputsize=full&apikey=' + apiKey)

# Setting index as date
df['timestamp'] = pd.to_datetime(df.timestamp, format='%Y-%m-%d')
df.index = df['timestamp']

In [4]:
# Write dataframe to access locally
df.to_csv("daily_AAPL.csv", index=False)

# Data Overview

In [41]:
# Get general statistics
df.describe()

Unnamed: 0,open,high,low,close,volume
count,5031.0,5031.0,5031.0,5031.0,5031.0
mean,170.53149,172.387633,168.4982,170.481335,115885700.0
std,160.601243,161.878975,159.062451,160.489462,98504730.0
min,12.99,13.19,12.72,13.12,9835000.0
25%,55.365,56.924,54.515,55.6375,48840800.0
50%,117.95,119.4,116.68,118.31,88709600.0
75%,202.48,204.55,200.8175,202.74,152383700.0
max,702.41,705.07,699.57,702.1,1855410000.0


In [42]:
# Get data types
df.dtypes

timestamp    datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume                int64
dtype: object

In [43]:
# Check top 10 rows
df.head(10)

Unnamed: 0_level_0,timestamp,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-10-15,2019-10-15,236.39,237.64,234.88,235.32,19012889
2019-10-14,2019-10-14,234.9,238.13,234.67,235.87,24106900
2019-10-11,2019-10-11,232.95,237.64,232.31,236.21,41698900
2019-10-10,2019-10-10,227.93,230.44,227.3,230.09,28253400
2019-10-09,2019-10-09,227.03,227.79,225.64,227.03,18692600
2019-10-08,2019-10-08,225.82,228.06,224.33,224.4,27955000
2019-10-07,2019-10-07,226.27,229.93,225.84,227.06,30576500
2019-10-04,2019-10-04,225.64,227.49,223.89,227.01,34619700
2019-10-03,2019-10-03,218.43,220.96,215.13,220.82,28606500
2019-10-02,2019-10-02,223.06,223.58,217.93,218.96,34612300


In [44]:
# Check last 10 rows
df.tail(10)

Unnamed: 0_level_0,timestamp,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1999-10-29,1999-10-29,78.813,81.063,78.813,80.125,130762800
1999-10-28,1999-10-28,77.063,79.0,76.063,77.875,126022400
1999-10-27,1999-10-27,74.375,76.625,73.438,76.375,110768000
1999-10-26,1999-10-26,74.938,75.4992,73.313,75.063,90358800
1999-10-25,1999-10-25,74.25,76.125,73.7485,74.5,81648000
1999-10-22,1999-10-22,77.125,77.25,73.375,73.938,104876800
1999-10-21,1999-10-21,72.5627,77.063,72.375,76.125,198363200
1999-10-20,1999-10-20,70.0,75.25,70.0,75.125,270351200
1999-10-19,1999-10-19,71.625,75.0,68.438,68.5,255645600
1999-10-18,1999-10-18,73.875,74.25,71.125,73.25,194101600


# Convert closing price to trading action

In [5]:
# Expected Return
# expectedReturn = 0 # Long if the stock price stays the same or increases
expectedReturn = 0.025 # Long if the stock price increases by 2.5%

df['prev_close'] = df['close'].shift(-1)

df['action'] = np.nan 

for i, row in df.iterrows():
    realReturn = (df.loc[i, 'close'] / df.loc[i, 'prev_close']) - 1
    df.loc[i, 'action'] = 1 if (realReturn >= expectedReturn) else 0 # 1 = Long / 0 = Short


df.head(10)

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,prev_close,action
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-10-16,2019-10-16,233.37,235.2326,233.2,234.37,16507516,235.32,0.0
2019-10-15,2019-10-15,236.39,237.65,234.88,235.32,21840000,235.87,0.0
2019-10-14,2019-10-14,234.9,238.13,234.67,235.87,24106900,236.21,0.0
2019-10-11,2019-10-11,232.95,237.64,232.31,236.21,41698900,230.09,1.0
2019-10-10,2019-10-10,227.93,230.44,227.3,230.09,28253400,227.03,0.0
2019-10-09,2019-10-09,227.03,227.79,225.64,227.03,18692600,224.4,0.0
2019-10-08,2019-10-08,225.82,228.06,224.33,224.4,27955000,227.06,0.0
2019-10-07,2019-10-07,226.27,229.93,225.84,227.06,30576500,227.01,0.0
2019-10-04,2019-10-04,225.64,227.49,223.89,227.01,34619700,220.82,1.0
2019-10-03,2019-10-03,218.43,220.96,215.13,220.82,28606500,218.96,0.0


# Preparing data to merge with news data

In [6]:
df.drop(df.index[:10], inplace=True)
len(df)
df

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,prev_close,action
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-10-02,2019-10-02,223.0600,223.5800,217.9300,218.960,34612300,224.590,0.0
2019-10-01,2019-10-01,225.0700,228.2200,224.2000,224.590,34805800,223.970,0.0
2019-09-30,2019-09-30,220.9000,224.5800,220.7900,223.970,25977400,218.820,0.0
2019-09-27,2019-09-27,220.5400,220.9600,217.2800,218.820,25352000,219.890,0.0
2019-09-26,2019-09-26,220.0000,220.9400,218.8300,219.890,18833500,221.030,0.0
2019-09-25,2019-09-25,218.5500,221.5000,217.1400,221.030,21903400,217.680,0.0
2019-09-24,2019-09-24,221.0300,222.4900,217.1900,217.680,31190800,218.720,0.0
2019-09-23,2019-09-23,218.9500,219.8400,217.6500,218.720,19165500,217.730,0.0
2019-09-20,2019-09-20,221.3800,222.5600,217.4700,217.730,55413100,220.960,0.0
2019-09-19,2019-09-19,222.0100,223.7600,220.3700,220.960,22060600,222.770,0.0


In [7]:
truncate_df = df[(df['timestamp'].dt.year >= 2000)]
truncate_df.tail(10)

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,prev_close,action
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-14,2000-01-14,100.0,102.25,99.375,100.438,97594000,96.75,1.0
2000-01-13,2000-01-13,94.484,98.75,92.5,96.75,258171200,87.188,1.0
2000-01-12,2000-01-12,95.0,95.5012,86.5,87.188,244017200,92.75,0.0
2000-01-11,2000-01-11,95.938,99.375,90.5,92.75,110387200,97.75,0.0
2000-01-10,2000-01-10,102.0,102.25,94.75,97.75,126266000,99.5,0.0
2000-01-07,2000-01-07,96.5,101.0,95.5,99.5,115183600,95.0,1.0
2000-01-06,2000-01-06,106.1183,107.0,95.0,95.0,191993200,104.0,0.0
2000-01-05,2000-01-05,103.75,110.563,103.0,104.0,194580400,102.5,0.0
2000-01-04,2000-01-04,108.25,110.625,101.188,102.5,128094400,111.938,0.0
2000-01-03,2000-01-03,104.875,112.5,101.688,111.938,133949200,102.813,1.0


# Import News data 

In [8]:
def open_json(year, month):
    "Save as pandas dataframe"
    file_str = 'data/jsons/' + str(year) + '-' + '{:02}'.format(month) + '.json'
    with open(file_str) as data_file:    
        NYTimes_data = json.load(data_file)
    
    date_list = []
    df = pd.DataFrame()  
    df['News'] = None
    

    for i in range(len(NYTimes_data["response"]["docs"][:])):
        if NYTimes_data["response"]["docs"][i]["pub_date"][:10] not in df.index:
            df.loc[NYTimes_data["response"]["docs"][i]["pub_date"][:10]] = NYTimes_data["response"]["docs"][:][i]['headline']['main']
        else:
            df.loc[NYTimes_data["response"]["docs"][i]["pub_date"][:10]] = df.loc[NYTimes_data["response"]["docs"][i]["pub_date"][:10]].values + NYTimes_data["response"]["docs"][:][i]['headline']['main']
    
    df.index = pd.to_datetime(df.index, format='%Y-%m-%d')
    df.sort_index(inplace=True)
    
    return df

In [9]:
open_json(2009,1)

Unnamed: 0,News
2009-01-01,‘Spring Awakening’ Producers Join ‘Hair’ Reviv...
2009-01-02,"Picasso, Matisse Works Stolen in BerlinWilliam..."
2009-01-03,A Troubled Life and a Mysterious CaseYour comm...
2009-01-04,"Timeline: Israel, the Gaza Strip and HamasPavi..."
2009-01-05,Laura Bush Signs Deal for MemoirFringe Festiva...
2009-01-06,Barenboim Cancels Middle East ConcertsEnter th...
2009-01-07,Disputed Memoir May Be Published as FictionCha...
2009-01-08,Self-Publishing Company Acquires RivalEvery In...
2009-01-09,"Zantzinger, Who Inspired Bob Dylan Ballad, Die..."
2009-01-10,Your comments on the Poipet brothel columnA fe...


# Merge technical stock data with news

In [10]:
def merge_news_price(df):
    all_news = pd.DataFrame()
    for i in range(2000, 2020):
        for j in range(1, 13):
            try:
                temp = open_json(i, j)
                all_news = all_news.append(temp)
                print("Completed {}-{}".format(i, j))
            except:
                pass   
    all_news = pd.merge(df,all_news, how='left', left_index=True, right_index=True)
    
    return all_news

In [11]:
data = merge_news_price(truncate_df)

Completed 2000-1
Completed 2000-2
Completed 2000-3
Completed 2000-4
Completed 2000-5
Completed 2000-6
Completed 2000-7
Completed 2000-8
Completed 2000-9
Completed 2000-10
Completed 2000-11
Completed 2000-12
Completed 2001-1
Completed 2001-2
Completed 2001-3
Completed 2001-4
Completed 2001-5
Completed 2001-6
Completed 2001-7
Completed 2001-8
Completed 2001-9
Completed 2001-10
Completed 2001-11
Completed 2001-12
Completed 2002-1
Completed 2002-2
Completed 2002-3
Completed 2002-4
Completed 2002-5
Completed 2002-6
Completed 2002-7
Completed 2002-8
Completed 2002-9
Completed 2002-10
Completed 2002-11
Completed 2002-12
Completed 2003-1
Completed 2003-2
Completed 2003-3
Completed 2003-4
Completed 2003-5
Completed 2003-6
Completed 2003-7
Completed 2003-8
Completed 2003-9
Completed 2003-10
Completed 2003-11
Completed 2003-12
Completed 2004-1
Completed 2004-2
Completed 2004-3
Completed 2004-4
Completed 2004-5
Completed 2004-6
Completed 2004-7
Completed 2004-8
Completed 2004-9
Completed 2004-10
C

In [12]:
data.isnull().any()

timestamp     False
open          False
high          False
low           False
close         False
volume        False
prev_close    False
action        False
News           True
dtype: bool

In [13]:
data=data.dropna()

# Using NLTK sentiment analyzer to generate a polarity score

In [14]:
import nltk
nltk.download('vader_lexicon')

def sentimentanalysis(df):
    sid = SentimentIntensityAnalyzer()
    for date in df.index:
        try:
            sentence = unicodedata.normalize('NFKD', df.loc[date, 'News'])
            ss = sid.polarity_scores(sentence)
            df.at[date, 'neg'] = ss['neg']
            df.at[date, 'neu'] = ss['neu']
            df.at[date, 'pos'] = ss['pos']
        except TypeError:
            print (df.loc[date, 'News'])
            print (date)
    return df

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/thupham/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
sentimental_data = sentimentanalysis(data)

2019-04-01    Las principales noticias del lunesReview: ‘Do ...
2019-04-01    For the Performing Arts Students in This Novel...
Name: News, dtype: object
2019-04-01 00:00:00
2019-04-01    Las principales noticias del lunesReview: ‘Do ...
2019-04-01    For the Performing Arts Students in This Novel...
Name: News, dtype: object
2019-04-01 00:00:00
2019-05-01    Las principales noticias del miércolesJudge Ov...
2019-05-01    The Designer Reimagining Traditional West Afri...
Name: News, dtype: object
2019-05-01 00:00:00
2019-05-01    Las principales noticias del miércolesJudge Ov...
2019-05-01    The Designer Reimagining Traditional West Afri...
Name: News, dtype: object
2019-05-01 00:00:00
2019-07-01    Corrections: July 1, 2019A Global Gay Pride We...
2019-07-01    The Best Movies and TV Shows New to Netflix, A...
Name: News, dtype: object
2019-07-01 00:00:00
2019-07-01    Corrections: July 1, 2019A Global Gay Pride We...
2019-07-01    The Best Movies and TV Shows New to Netflix, A...
Na

In [17]:
sentimental_data.drop(['News'], axis=1, inplace=True)

In [18]:
null_data = sentimental_data[sentimental_data.isnull().any(axis=1)]

In [19]:
null_data

Unnamed: 0,timestamp,open,high,low,close,volume,prev_close,action,neg,neu,pos
2019-04-01,2019-04-01,191.64,191.68,188.38,191.24,27862000,189.95,0.0,,,
2019-04-01,2019-04-01,191.64,191.68,188.38,191.24,27862000,189.95,0.0,,,
2019-05-01,2019-05-01,209.88,215.31,209.23,210.52,64827300,200.67,1.0,,,
2019-05-01,2019-05-01,209.88,215.31,209.23,210.52,64827300,200.67,1.0,,,
2019-07-01,2019-07-01,203.17,204.49,200.65,201.55,27253000,197.92,0.0,,,
2019-07-01,2019-07-01,203.17,204.49,200.65,201.55,27253000,197.92,0.0,,,
2019-08-01,2019-08-01,213.9,218.03,206.74,208.43,54017900,213.04,0.0,,,
2019-08-01,2019-08-01,213.9,218.03,206.74,208.43,54017900,213.04,0.0,,,


In [20]:
sentimental_data.dropna()

Unnamed: 0,timestamp,open,high,low,close,volume,prev_close,action,neg,neu,pos
2000-01-03,2000-01-03,104.8750,112.5000,101.6880,111.938,133949200,102.813,1.0,0.051,0.871,0.078
2000-01-04,2000-01-04,108.2500,110.6250,101.1880,102.500,128094400,111.938,0.0,0.056,0.904,0.039
2000-01-05,2000-01-05,103.7500,110.5630,103.0000,104.000,194580400,102.500,0.0,0.093,0.828,0.079
2000-01-06,2000-01-06,106.1183,107.0000,95.0000,95.000,191993200,104.000,0.0,0.079,0.835,0.086
2000-01-07,2000-01-07,96.5000,101.0000,95.5000,99.500,115183600,95.000,1.0,0.072,0.838,0.090
2000-01-10,2000-01-10,102.0000,102.2500,94.7500,97.750,126266000,99.500,0.0,0.081,0.850,0.068
2000-01-11,2000-01-11,95.9380,99.3750,90.5000,92.750,110387200,97.750,0.0,0.086,0.846,0.069
2000-01-12,2000-01-12,95.0000,95.5012,86.5000,87.188,244017200,92.750,0.0,0.115,0.789,0.096
2000-01-13,2000-01-13,94.4840,98.7500,92.5000,96.750,258171200,87.188,1.0,0.097,0.818,0.085
2000-01-14,2000-01-14,100.0000,102.2500,99.3750,100.438,97594000,96.750,1.0,0.097,0.832,0.071


In [21]:
# Write dataframe to access locally
sentimental_data.to_csv("daily_sentimental.csv", index=False)