In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing

import datetime
import math

In [2]:
# Load the AAPL daily price history from the local CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/AAPL.csv')
df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2014-09-11,100.410004,101.440002,99.620003,101.43,93.071877,62353100
1,2014-09-12,101.209999,102.190002,101.080002,101.660004,93.282913,62626100
2,2014-09-15,102.809998,103.050003,101.440002,101.629997,93.255379,61316500
3,2014-09-16,99.800003,101.260002,98.889999,100.860001,92.548836,66908100
4,2014-09-17,101.269997,101.800003,100.589996,101.580002,93.209496,60926500


In [3]:
# Inspect the column types pandas inferred on load; note that 'Date' is still
# an object column at this point and is converted to timestamps further down.
df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

# Reference

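Source tutorial: [In 12 minutes: Stocks Analysis with Pandas and Scikit-Learn](https://towardsdatascience.com/in-12-minutes-stocks-analysis-with-pandas-and-scikit-learn-a8d8a7b50ee7) (Towards Data Science).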

In [4]:
# Turn the 'YYYY-MM-DD' strings in 'Date' into timestamps and use them as the
# index (named 'DateFormat'); the original string column is then removed.
#
# pd.to_datetime parses the whole column at once instead of calling
# datetime.strptime row by row. The guard makes the cell safe to re-run:
# once 'Date' has been consumed there is nothing left to convert, whereas an
# unguarded second run would raise KeyError on the missing column.
if 'Date' in df.columns:
    df = (
        df.assign(DateFormat=pd.to_datetime(df['Date'], format='%Y-%m-%d'))
          .set_index('DateFormat')
          .drop(columns='Date')
    )

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
DateFormat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-09-11,100.410004,101.440002,99.620003,101.43,93.071877,62353100
2014-09-12,101.209999,102.190002,101.080002,101.660004,93.282913,62626100
2014-09-15,102.809998,103.050003,101.440002,101.629997,93.255379,61316500
2014-09-16,99.800003,101.260002,98.889999,100.860001,92.548836,66908100
2014-09-17,101.269997,101.800003,100.589996,101.580002,93.209496,60926500


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,145.550159,146.84566,144.271916,145.591638,140.453026,37541920.0
std,37.418659,37.786531,37.091876,37.435744,39.240803,18500870.0
min,90.0,91.669998,89.470001,90.339996,85.651482,11362000.0
25%,112.302502,113.404999,111.3925,112.490001,105.639296,24660580.0
50%,139.334999,140.064995,138.805,139.650002,134.383187,32678600.0
75%,174.754998,175.600003,173.585003,174.937504,171.546524,45752420.0
max,230.779999,233.470001,229.779999,232.070007,228.523819,162206300.0


In [6]:
# Two derived features, both expressed in percent:
#   HL_PCT     - the day's high-low range relative to the close
#   PCT_change - the open-to-close move relative to the open
intraday_range = df['High'] - df['Low']
open_to_close = df['Close'] - df['Open']

df['HL_PCT'] = intraday_range / df['Close'] * 100.0
df['PCT_change'] = open_to_close / df['Open'] * 100.0

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,HL_PCT,PCT_change
DateFormat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-09-11,100.410004,101.440002,99.620003,101.43,93.071877,62353100,1.79434,1.015831
2014-09-12,101.209999,102.190002,101.080002,101.660004,93.282913,62626100,1.091875,0.444625
2014-09-15,102.809998,103.050003,101.440002,101.629997,93.255379,61316500,1.584179,-1.147749
2014-09-16,99.800003,101.260002,98.889999,100.860001,92.548836,66908100,2.349795,1.062122
2014-09-17,101.269997,101.800003,100.589996,101.580002,93.209496,60926500,1.191186,0.306117


In [7]:
# Replace any missing values with a sentinel (rows are NOT dropped here).
# NOTE(review): -99999 becomes an extreme outlier once the features are
# standardised below; confirm the frame really has no NaNs, or drop/impute
# them instead of relying on the sentinel.
df = df.fillna(value=-99999)

# Forecast horizon: 1 percent of the rows, rounded up
forecast_out = int(math.ceil(0.01 * len(df)))

# The label is 'Adj Close' shifted forecast_out rows into the future, so the
# last forecast_out rows have no label (NaN).
forecast_col = 'Adj Close'
df['label'] = df[forecast_col].shift(-forecast_out)

# Features are every column except the label. `columns=` is used instead of
# the positional axis argument (`df.drop(['label'], 1)`), which newer pandas
# releases no longer accept.
X = np.array(df.drop(columns=['label']))

# Standardise each feature column (zero mean, unit variance) for the regressions
X = preprocessing.scale(X)

# The most recent forecast_out rows have features but no label: keep them
# aside for forecasting and use everything before them for fitting/evaluation.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Labels aligned with X: trim the trailing rows whose label is NaN
y = np.array(df['label'])
y = y[:-forecast_out]

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,HL_PCT,PCT_change,label
DateFormat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-09-11,100.410004,101.440002,99.620003,101.43,93.071877,62353100,1.79434,1.015831,92.447876
2014-09-12,101.209999,102.190002,101.080002,101.660004,93.282913,62626100,1.091875,0.444625,91.007271
2014-09-15,102.809998,103.050003,101.440002,101.629997,93.255379,61316500,1.584179,-1.147749,91.667946
2014-09-16,99.800003,101.260002,98.889999,100.860001,92.548836,66908100,2.349795,1.062122,91.411026
2014-09-17,101.269997,101.800003,100.589996,101.580002,93.209496,60926500,1.191186,0.306117,91.411026


In [8]:
from sklearn.linear_model import LinearRegression

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. The split is seeded so the
# scores computed from it are the same on every Restart & Run All.
# NOTE(review): train_test_split shuffles by default, so test rows are
# interleaved in time with training rows; for a time-ordered series consider
# shuffle=False or a time-series-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Plain linear regression
clfreg = LinearRegression(n_jobs=-1)
clfreg.fit(X_train, y_train)

# Degree-2 (quadratic) polynomial features feeding a ridge regression
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Degree-3 (cubic) polynomial features feeding a ridge regression
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('polynomialfeatures',
                 PolynomialFeatures(degree=3, include_bias=True,
                                    interaction_only=False, order='C')),
                ('ridge',
                 Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=None,
                       solver='auto', tol=0.001))],
         verbose=False)

In [9]:
# Score each fitted model on the held-out split (for scikit-learn regressors
# .score() is the coefficient of determination, R^2).
confidencereg, confidencepoly2, confidencepoly3 = (
    fitted.score(X_test, y_test) for fitted in (clfreg, clfpoly2, clfpoly3)
)
# NOTE(review): a KNN score was sketched here, but no `clfknn` is fitted in the
# cells visible in this notebook, so it stays disabled:
# confidenceknn = clfknn.score(X_test, y_test)

print(confidencereg, confidencepoly2, confidencepoly3, sep='   ')

0.9506432567763123   0.9489296774351196   0.9476825171457169
