Develop a reasonably robust model that can

    1.	Predict next-day closing price for APPLE
    2.	Predict next-7-days closing price for APPLE

Note: You are free to do any analysis (visual or tabular) as a pre or post requirement for building the above model and related price prediction.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [4]:
all_stock = pd.read_csv("all_stocks_5yr.csv")
cons = pd.read_csv("constituents.csv")
all_data = all_stock.merge(cons, left_on=['Name'], right_on = ['Symbol'], suffixes=('_x', '_company'))
all_data.reset_index(inplace=True, drop=True)
all_data.head()

Unnamed: 0,date,open,high,low,close,volume,Name_x,Symbol,Name_company,Sector
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL,AAL,American Airlines Group,Industrials
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL,AAL,American Airlines Group,Industrials
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL,AAL,American Airlines Group,Industrials
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL,AAL,American Airlines Group,Industrials
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL,AAL,American Airlines Group,Industrials


In [None]:
all_comp = list(all_data["Name_company"].unique())
all_df = []
for each_comp in all_comp:
    tmp = all_data.loc[all_data.Name_company == each_comp]
    tmp.loc[:, "yoy_perc"] = tmp.loc[:, "close"].pct_change(periods=365)
    tmp.loc[:, "dod_perc"] = tmp.loc[:, "close"].pct_change(periods=1)
    tmp.loc[:, "avg_dod_perc"] = tmp.loc[:, "dod_perc"].rolling(2).mean()
    tmp.loc[:, "avg_yoy_perc"] = tmp.loc[:, "yoy_perc"].rolling(365).mean()
    tmp.loc[:, "mom_perc"] = tmp.loc[:, "close"].pct_change(periods=30)
    tmp.loc[:, "avg_mom_perc"] = tmp.loc[:, "mom_perc"].rolling(30).mean()
    all_df.append(tmp)
all_data_mod = pd.concat(all_df, axis=0)

splitted = all_data_mod['date'].str.split('-', expand=True)

all_data_mod.loc[:, 'day'] = splitted[2].astype('int')
all_data_mod.loc[:, 'month'] = splitted[1].astype('int')
all_data_mod.loc[:, 'year'] = splitted[0].astype('int')

d.	Develop a reasonably robust model that can

    1.	Predict next-day closing price for APPLE
    2.	Predict next-7-days closing price for APPLE

Note: You are free to do any analysis (visual or tabular) as a pre or post requirement for building the above model and related price prediction.  

## Data Processing

In [4]:
all_data_app = all_data_mod.loc[all_data_mod.Name_company == "Apple", :].sort_values("date", ascending=True)

splitted = all_data_app['date'].str.split('-', expand=True)
 
all_data_app['day'] = splitted[2].astype('int')
all_data_app['month'] = splitted[1].astype('int')
all_data_app['year'] = splitted[0].astype('int')

all_data_app['is_quarter_end'] = np.where(all_data_app['month']%3==0,1,0)
all_data_app['open-close']  = all_data_app['open'] - all_data_app['close']
all_data_app['low-high']  = all_data_app['low'] - all_data_app['high']

forecast_out = 1
#Create another column (the target ) shifted 'n' units up
all_data_app['pred1'] = all_data_app[['close']].shift(-1)
all_data_app['pred7'] = all_data_app[['close']].shift(-7)

weekly_trend = all_data_app.shift(1).rolling(7).sum()["pred1"]
all_data_app["weekly_trend1"] = weekly_trend

weekly_trend = all_data_app.shift(1).rolling(7).sum()["pred7"]
all_data_app["weekly_trend7"] = weekly_trend

weekly_mean = all_data_app.rolling(7).mean()["close"]
quarterly_mean = all_data_app.rolling(90).mean()["close"]
annual_mean = all_data_app.rolling(365).mean()["close"]

all_data_app["weekly_mean"] = weekly_mean / all_data_app["close"]
all_data_app["quarterly_mean"] = quarterly_mean / all_data_app["close"]
all_data_app["annual_mean"] = annual_mean / all_data_app["close"]

all_data_app["annual_weekly_mean"] = all_data_app["annual_mean"] / all_data_app["weekly_mean"]
all_data_app["annual_quarterly_mean"] = all_data_app["annual_mean"] / all_data_app["quarterly_mean"]

all_data_app["open_close_ratio"] = all_data_app["open"] / all_data_app["close"]
all_data_app["high_close_ratio"] = all_data_app["high"] / all_data_app["close"]
all_data_app["low_close_ratio"] = all_data_app["low"] / all_data_app["close"]

# Also calculate moving averages for features
all_data_app['ema50'] = all_data_app['close'] / all_data_app['close'].ewm(50).mean()
all_data_app['ema21'] = all_data_app['close'] / all_data_app['close'].ewm(21).mean()
all_data_app['ema15'] = all_data_app['close'] / all_data_app['close'].ewm(14).mean()
all_data_app['ema5'] = all_data_app['close'] / all_data_app['close'].ewm(5).mean()

# Instead of using the actual volume value (which changes over time), we normalize it with a moving volume average
all_data_app['normVol'] = all_data_app['volume'] / all_data_app['volume'].ewm(5).mean()
all_data_app = all_data_app.dropna()

all_data_app = all_data_app.drop(["Name_x", "Symbol", "Name_company", "Sector", "date"], axis=1)

features = ["yoy_perc", "dod_perc", "open", "high", "low", "close", "volume", "ema5", "ema15", "ema21", "ema50", "normVol", "weekly_mean", "quarterly_mean", "annual_mean", "open_close_ratio", "high_close_ratio", "low_close_ratio"]
features += ['low-high', 'open-close', "is_quarter_end", "day", "month", "year"]
all_data_app

Unnamed: 0,open,high,low,close,volume,yoy_perc,dod_perc,avg_dod_perc,avg_yoy_perc,mom_perc,...,annual_weekly_mean,annual_quarterly_mean,open_close_ratio,high_close_ratio,low_close_ratio,ema50,ema21,ema15,ema5,normVol
1988,107.010,107.030,104.82,105.26,40912316,0.111275,-0.019195,-0.016127,0.601531,-0.074149,...,1.073048,1.013548,1.016625,1.016816,0.995820,0.915056,0.933900,0.943469,0.971332,1.119811
1989,102.610,105.368,102.00,105.35,67649387,0.083959,0.000855,-0.009170,0.600577,-0.101799,...,1.075946,1.013585,0.973991,1.000171,0.968201,0.917352,0.937481,0.947797,0.976694,1.621481
1990,105.750,105.850,102.41,102.71,55790992,0.058539,-0.025059,-0.012102,0.599599,-0.135292,...,1.084622,1.014413,1.029598,1.030572,0.997079,0.896220,0.917576,0.928749,0.959862,1.266084
1991,100.560,102.370,99.87,100.70,68457388,0.031012,-0.019570,-0.022314,0.598421,-0.155909,...,1.095490,1.015699,0.998610,1.016584,0.991758,0.880777,0.903743,0.916035,0.950412,1.422312
1992,98.680,100.130,96.43,96.45,81094428,-0.025954,-0.042205,-0.030887,0.597023,-0.180892,...,1.111059,1.017314,1.023121,1.038154,0.999793,0.846199,0.870921,0.884605,0.924115,1.512252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2506,177.300,179.440,176.82,177.04,32689146,0.639259,0.000226,-0.003978,0.296472,0.045594,...,0.794249,0.845443,1.001469,1.013556,0.998757,1.058605,1.023036,1.014621,1.002567,1.146914
2507,177.250,177.300,173.20,174.22,51105090,0.614194,-0.015929,-0.007851,0.298609,0.028636,...,0.797110,0.845634,1.017392,1.017679,0.994145,1.040890,1.006432,0.998562,0.988806,1.583719
2508,174.505,174.950,170.53,171.11,41529004,0.581716,-0.017851,-0.016890,0.300637,-0.009035,...,0.801362,0.846034,1.019841,1.022442,0.996610,1.021863,0.988984,0.981998,0.975847,1.228220
2509,172.000,172.000,170.06,171.51,39143011,0.566588,0.002338,-0.007757,0.302556,-0.001107,...,0.807281,0.846328,1.002857,1.002857,0.991546,1.023765,0.991689,0.985325,0.981706,1.128015


In [5]:
sc_X = StandardScaler()
tmp =  pd.DataFrame(sc_X.fit_transform(all_data_app[features]), columns=features, index=all_data_app.index)

tmp = tmp.copy(deep=True)
all_data_app[features] = tmp[features]

In [6]:
# Train test split
train = all_data_app.iloc[:-100]
test = all_data_app.iloc[-100:]
X_train = train.drop(["pred1"], axis=1)
X_test = test.drop(["pred1"], axis=1)

y_train, y_test = train.pred1, test.pred1

In [7]:
X_train.head()

Unnamed: 0,open,high,low,close,volume,yoy_perc,dod_perc,avg_dod_perc,avg_yoy_perc,mom_perc,...,annual_weekly_mean,annual_quarterly_mean,open_close_ratio,high_close_ratio,low_close_ratio,ema50,ema21,ema15,ema5,normVol
1988,-0.824597,-0.857592,-0.876951,-0.89419,0.53418,-0.246066,-1.533732,-0.016127,0.601531,-0.074149,...,1.073048,1.013548,1.796639,1.515698,0.582281,-1.726286,-1.863708,-1.867751,-1.594971,0.442301
1989,-0.988606,-0.919439,-0.982606,-0.890831,2.274657,-0.330552,-0.006688,-0.00917,0.600577,-0.101799,...,1.075946,1.013585,-2.625105,-1.012761,-3.799247,-1.693436,-1.783352,-1.749256,-1.336383,2.169447
1990,-0.871564,-0.901503,-0.967245,-0.989357,1.502723,-0.409177,-1.980376,-0.012102,0.599599,-0.135292,...,1.084622,1.014413,3.142057,3.605349,0.782058,-1.995753,-2.229994,-2.270812,-2.148096,0.945889
1991,-1.065019,-1.031002,-1.06241,-1.06437,2.327255,-0.494314,-1.562272,-0.022314,0.598421,-0.155909,...,1.09549,1.015699,-0.071847,1.480517,-0.062159,-2.216692,-2.540386,-2.618933,-2.603876,1.48375
1992,-1.135096,-1.114357,-1.191294,-1.222981,3.149876,-0.670508,-3.286191,-0.030887,0.597023,-0.180892,...,1.111059,1.017314,2.470291,4.757265,1.212536,-2.71137,-3.276848,-3.479483,-3.872043,1.793393


In [8]:
y_train.head()

1988    105.35
1989    102.71
1990    100.70
1991     96.45
1992     96.96
Name: pred1, dtype: float64

## Regression models Next Day prediction

In [9]:
lr = LinearRegression()
# Train the model
lr.fit(X_train, y_train)

lr_confidence = lr.score(X_test, y_test)
y_pred = lr.predict(X_test)

print("lr confidence: ", lr_confidence)
print("accuracy: "+ str(lr.score(X_test,y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(y_test,y_pred)))
print("Mean squared error: {}".format(mean_squared_error(y_test,y_pred)))
test["prediction"] = y_pred

# import matplotlib.pyplot as plt
# importances = pd.DataFrame(data={
#     'Attribute': X_train.columns,
#     'Importance': lr.coef_
# })
# importances = importances.sort_values(by='Importance', ascending=False)
# print(importances)
# plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
# plt.title('Feature importances obtained from coefficients', size=20)
# plt.xticks(rotation='vertical')
# plt.show()

lr confidence:  0.964230708078302
accuracy: 96.4230708078302%
Mean absolute error: 1.288759158891124
Mean squared error: 2.4361190893959956


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["prediction"] = y_pred


In [10]:
rig_reg = linear_model.Ridge()
rig_reg.fit(X_train, y_train)
y_pred = rig_reg.predict(X_test)

print("accuracy: "+ str(rig_reg.score(X_test, y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(y_test,y_pred)))
print("Mean squared error: {}".format(mean_squared_error(y_test,y_pred)))
R2 = r2_score(y_test,y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]
p=X_test.shape[1] - 1

adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))

importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': rig_reg.coef_
})
importances = importances.sort_values(by='Importance', ascending=False)

plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()

accuracy: 97.39945513868524%
Mean absolute error: 1.1050983937322783
Mean squared error: 1.7711384931376053
R Squared: 0.9739945513868524
Adjusted R Squared: 0.9615740386163939


NameError: name 'plt' is not defined

# Regression model 7 day 

In [None]:
train = all_data_app.iloc[:-100]
test = all_data_app.iloc[-100:]
X_train = train.drop(["pred7"], axis=1)
X_test = test.drop(["pred7"], axis=1)

y_train, y_test = train.pred7, test.pred7

In [None]:
lr = LinearRegression()
# Train the model
lr.fit(X_train, y_train)

lr_confidence = lr.score(X_test, y_test)
y_pred = lr.predict(X_test)

print("lr confidence: ", lr_confidence)
print("accuracy: "+ str(lr.score(X_test,y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(y_test,y_pred)))
print("Mean squared error: {}".format(mean_squared_error(y_test,y_pred)))
test["prediction"] = y_pred

# import matplotlib.pyplot as plt
# importances = pd.DataFrame(data={
#     'Attribute': X_train.columns,
#     'Importance': lr.coef_
# })
# importances = importances.sort_values(by='Importance', ascending=False)

# plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
# plt.title('Feature importances obtained from coefficients', size=20)
# plt.xticks(rotation='vertical')
# plt.show()

In [None]:
rig_reg = linear_model.Ridge()
rig_reg.fit(X_train, y_train)
y_pred = rig_reg.predict(X_test)

print("accuracy: "+ str(rig_reg.score(X_test, y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(y_test,y_pred)))
print("Mean squared error: {}".format(mean_squared_error(y_test,y_pred)))
R2 = r2_score(y_test,y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]
p=X_test.shape[1] - 1

adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))

importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': rig_reg.coef_
})
importances = importances.sort_values(by='Importance', ascending=False)

plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()

END