## Regression

## Inspired by Sentdex's [Practical Machine Learning Tutorial with Python](https://www.youtube.com/watch?v=lN5jesocJjk&list=PLQVvvaa0QuDfKTOs3Keq_kaG2P55YRn5v&index=3)

In [27]:
# Imports
import pandas as pd
import numpy as np
import quandl

from sklearn.model_selection import cross_validate, train_test_split
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression

In [14]:
# Optional: set a Quandl API key before requesting data (left disabled here).
# quandl.ApiConfig.api_key = ''

# Load the 'WIKI/GOOGL' (Google stock) dataset from Quandl into `df`.
# As the preview below shows, the frame is indexed by Date and carries both
# raw and adjusted open/high/low/close/volume columns.
df = quandl.get('WIKI/GOOGL')

# Preview the first rows
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


In [16]:
# Spread between the adjusted high and the adjusted close, expressed as a
# percentage of the close.
# NOTE(review): the column is named HL_PCT ("high-low"), but the value is
# computed from high minus close, not high minus low — confirm this is intended.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100

# Open-to-close percentage change: (new - old) / old * 100
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100

# Columns kept for the model
columns = [
    'Adj. Close',
    'Adj. Volume',
    'HL_PCT',
    'PCT_change',
]

# Take an explicit copy: later cells fill values, add a column and drop rows
# in place, and doing that on a bare column selection makes pandas emit
# SettingWithCopyWarning. The copy gives them an independent frame.
df = df[columns].copy()

df.head()

Unnamed: 0_level_0,Adj. Close,Adj. Volume,HL_PCT,PCT_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,50.322842,44659000.0,3.712563,0.324968
2004-08-20,54.322689,22834300.0,0.710922,7.227007
2004-08-23,54.869377,18256100.0,3.729433,-1.22788
2004-08-24,52.597363,15247300.0,6.417469,-5.726357
2004-08-25,53.164113,9188600.0,1.886792,1.183658


In [47]:
import math

# Column whose future value the model will be trained to predict
forecast_col = 'Adj. Close'

# Replace missing values with a sentinel so that no rows are lost to NaNs.
# `df` is rebound to the frame returned by fillna instead of being mutated
# in place, because it was produced by a column selection in an earlier cell
# and in-place edits on such a frame can trigger SettingWithCopyWarning.
df = df.fillna(-99999)

# Forecast horizon: 1% of the number of rows, rounded up, i.e. how many
# rows (days) in advance the label is taken from
forecast_out = int(math.ceil(0.01 * len(df)))
forecast_out

34

In [48]:
# Build the target column: each row's label is the value of the forecast
# column `forecast_out` rows further down (a negative shift moves later
# values up to earlier rows).
future_values = df[forecast_col].shift(periods=-forecast_out)
df['label'] = future_values

# The shift leaves the trailing rows without a label (NaN); remove every
# row that still contains a missing value.
df.dropna(inplace=True)
df.head()

Unnamed: 0_level_0,Adj. Close,Adj. Volume,HL_PCT,PCT_change,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.322842,44659000.0,3.712563,0.324968,69.639972
2004-08-20,54.322689,22834300.0,0.710922,7.227007,69.078238
2004-08-23,54.869377,18256100.0,3.729433,-1.22788,67.839414
2004-08-24,52.597363,15247300.0,6.417469,-5.726357,68.912727
2004-08-25,53.164113,9188600.0,1.886792,1.183658,70.668146


In [None]:
## Before fitting the model, convert the features and labels to NumPy arrays

In [49]:
# Feature matrix: every column except the label.
# `axis=1` is passed by keyword because passing the axis as a second
# positional argument to DataFrame.drop was deprecated and is rejected by
# pandas 2.0 and later.
X = np.array(df.drop(['label'], axis=1))

# Standardise the features so that columns on very different scales
# (prices vs. volumes) are comparable
X = preprocessing.scale(X)

# Target vector
y = np.array(df['label'])

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
len(X_test)

671

In [53]:
# Linear regression model.
# fit() is the training step: it is given the features together with their
# labels and returns the fitted estimator, so construction and training are
# chained here.
clf = LinearRegression().fit(X_train, y_train)

# score() is the testing step: it evaluates the trained model against the
# held-out features and labels.
accuracy = clf.score(X_test, y_test)
accuracy

0.9791505819516639

### have a look at another model

In [63]:
# Switching algorithms only requires constructing a different estimator.
# Support vector regression: the kernel is configurable, and the polynomial
# kernel selected here reduces the score on this data compared with the
# linear model above.
clf = svm.SVR(kernel='poly')

# Train the model, then evaluate it on the held-out set
clf.fit(X_train, y_train)
clf.score(X_test, y_test)



0.6640926570765042

In [64]:
# For regression, the accuracy is based on squared error, so it is perhaps better read as how directionally accurate the model is.
