In [91]:
import pandas as pd
import quandl as ql
import numpy as np
import math, datetime
import matplotlib.pyplot as plt
import pickle
from matplotlib import style
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression

style.use('ggplot')

In [61]:
df = ql.get("WIKI/GOOGL")
df = df[['Adj. Open',  'Adj. High',  'Adj. Low',  'Adj. Close', 'Adj. Volume']]

df.head()

Unnamed: 0_level_0,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,52.542193,54.167209,52.10083,53.164113,9188600.0


In [62]:
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

df

Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,50.322842,3.712563,0.324968,44659000.0
2004-08-20,54.322689,0.710922,7.227007,22834300.0
2004-08-23,54.869377,3.729433,-1.227880,18256100.0
2004-08-24,52.597363,6.417469,-5.726357,15247300.0
2004-08-25,53.164113,1.886792,1.183658,9188600.0
...,...,...,...,...
2018-03-21,1094.000000,1.343693,0.130884,1990515.0
2018-03-22,1053.150000,2.921711,-2.487014,3418154.0
2018-03-23,1026.550000,3.918952,-2.360729,2413517.0
2018-03-26,1054.090000,0.491419,0.332191,3272409.0


In [63]:
forecast_col = 'Adj. Close'
df.fillna(-99999, inplace=True)

forecast_out = int(math.ceil(0.01*len(df)))

df['label'] = df[forecast_col].shift(-forecast_out)
print(df.shape)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(df)

(3424, 5)


Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.322842,3.712563,0.324968,44659000.0,69.078238
2004-08-20,54.322689,0.710922,7.227007,22834300.0,67.839414
2004-08-23,54.869377,3.729433,-1.22788,18256100.0,68.912727
2004-08-24,52.597363,6.417469,-5.726357,15247300.0,70.668146
2004-08-25,53.164113,1.886792,1.183658,9188600.0,71.219849
2004-08-26,54.12207,0.037068,2.820391,7094800.0,72.278116
2004-08-27,53.239345,2.326896,-1.803885,6211700.0,74.810934
2004-08-30,51.162935,3.41143,-3.106003,5196700.0,74.199045
2004-08-31,51.343492,1.308977,0.048866,4917800.0,70.462511
2004-09-01,50.28021,2.713217,-2.385589,9138200.0,74.921275


In [64]:
X = np.array(df.drop(['label'], 1))
X = preprocessing.scale(X)
X = X[:-forecast_out]
X_lately = X[-forecast_out:]

df.dropna(inplace=True)
y = np.array(df['label'])
print(len(X), len(y))

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df.drop(['label'], 1))

3389 3389


In [65]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

clf = LinearRegression()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print(accuracy)

0.9787140306565648


In [70]:
forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)

[1076.61545035 1091.43684312 1105.38711179 1099.59338041 1092.97831977
 1090.44329598 1088.95289839 1085.85925528 1079.58975176 1075.23369057
 1073.05216799 1093.38397624 1111.67691517 1115.47461931 1130.63939897
 1134.52489881 1132.85693146 1130.66294297 1132.32195989 1151.45710366
 1149.64096693 1160.00849735 1156.39475053 1164.45588308 1185.27546709
 1197.12840756 1190.83963153 1203.21700831 1208.71534188 1206.78945105
 1197.58754859 1203.07397151 1201.97814261 1137.18116073 1083.41794647] 0.9787140306565648 35


In [92]:
df['Forecast'] = np.nan
last_date = df.iloc[3388].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day
print(last_date, datetime.datetime.fromtimestamp(next_unix))

for i in forecast_set: 
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]

display(df)    
%matplotlib notebook
df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

2018-02-05 00:00:00 2018-02-06 07:00:00


Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label,Forecast
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19 00:00:00,50.322842,3.712563,0.324968,44659000.0,69.078238,
2004-08-20 00:00:00,54.322689,0.710922,7.227007,22834300.0,67.839414,
2004-08-23 00:00:00,54.869377,3.729433,-1.227880,18256100.0,68.912727,
2004-08-24 00:00:00,52.597363,6.417469,-5.726357,15247300.0,70.668146,
2004-08-25 00:00:00,53.164113,1.886792,1.183658,9188600.0,71.219849,
...,...,...,...,...,...,...
2018-03-08 07:00:00,,,,,,1197.587549
2018-03-09 07:00:00,,,,,,1203.073972
2018-03-10 07:00:00,,,,,,1201.978143
2018-03-11 07:00:00,,,,,,1137.181161


<IPython.core.display.Javascript object>