# Time Series Machine Learning Part 1 Assignment

In [111]:
import pickle
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

### Import the Netflix stock price data set (NFLX_data.csv).

In [79]:
netflix_df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%205/NFLX_data.csv')

In [80]:
netflix_df['date'] = pd.to_datetime(netflix_df['date'])

In [81]:
netflix_df = netflix_df[['date', 'close']]

### Transform the data by shifting the series and creating features that will allow us to forecast the price 30 days into the future from 90 days of daily history.

In [82]:
history = 90
future = 30

shifts = [x + future for x in list(range(1, history + 1))]

for shift in shifts:
  netflix_df[f't-{shift}'] = netflix_df['close'].shift(shift)

netflix_df.dropna(inplace=True)

### Split the data into a training set and a testing set. Make the test set size 20%.

In [83]:
X = netflix_df.drop(['date', 'close'],1)
y = netflix_df['close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [84]:
X_test.shape[0]/netflix_df.shape[0]

0.20017559262510976

### Instantiate an AdaBoost model and fit it to the training set.

In [85]:
adareg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300)

In [86]:
np.arange(0.1, 1, 0.2)

array([0.1, 0.3, 0.5, 0.7, 0.9])

In [87]:
adareg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300)
params = {'learning_rate': np.arange(0.1, 1, 0.2),  
          'loss': ['linear', 'square', 'exponential']
          }

adareg_grid = GridSearchCV(adareg, param_grid=params, scoring='neg_mean_absolute_error', cv=3)

adareg_grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=AdaBoostRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                                              criterion='mse',
                                                                              max_depth=4,
                                                                              max_features=None,
                                                                              max_leaf_nodes=None,
                                                                              min_impurity_decrease=0.0,
                                                                              min_impurity_split=None,
                                                                              min_samples_leaf=1,
                                                                              min_samples_split=2,
                                                                              min_we

In [88]:
params = adareg_grid.best_params_

### Generate predictions for the test set.

In [89]:
y_pred = adareg_grid.predict(X_test)

In [90]:
X_test.shape

(228, 90)

### Evaluate the results using R-Squared, Mean Absolute Error, and Root Mean Squared Error metrics.

In [91]:
def get_scores(y_test, y_pred):
  print(f'R-Squared: {r2_score(y_test, y_pred)}')
  print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
  print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

In [92]:
get_scores(y_test, y_pred)

R-Squared: -5.028396231502538
MAE: 64.75218332400902
RMSE: 71.56040595642791


### Visually examine the results by creating a scatter plot where the x axis represents the observed results and the y axis represents the predictions.

In [93]:
fig = px.scatter(x=y_test, y=y_pred, template='none')

fig.update_traces(marker_line_color='black',
                  marker_line_width=1)

fig.show()

In [94]:
history = 90
future = 30

past_shifts = [x for x in list(range(1, history+1))]
future_shifts = [x for x in list(range(1, future))]

shifted = netflix_df.copy()[['date', 'close']]
shifted.columns = ['date', 't+0']

for shift in past_shifts:
  shifted[f't-{shift}'] = shifted['t+0'].shift(shift)

for shift in future_shifts:
  shifted[f't+{shift}'] = shifted['t+0'].shift(-shift)

shifted.dropna(inplace=True)

In [131]:
X = shifted[[f't-{shift}' for shift in past_shifts]]
future_shifts = [0] + future_shifts

models = []

for future_shift in future_shifts:
  y = shifted[f't+{future_shift}']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

  model = AdaBoostRegressor()
  model.set_params(**params)
  model.fit(X_train, y_train)
  
  predictions = model.predict(X_test)
  get_scores(y_test, predictions)

  pickled = pickle.dump(model, open(f'model_{future_shift}.pkl', 'wb')))
  models.append(model)

R-Squared: -2.1558857511790612
MAE: 11.876320497128294
RMSE: 14.369103541774654
R-Squared: -2.0754315841301723
MAE: 11.65261100418134
RMSE: 14.184762643713828
R-Squared: -1.9594526471960743
MAE: 11.386894032740727
RMSE: 13.994055451989233
R-Squared: -1.9998311757701495
MAE: 11.525697705802964
RMSE: 14.116237763879852
R-Squared: -2.0998237180605304
MAE: 11.872050654363989
RMSE: 14.424568185201714
R-Squared: -2.269817312799886
MAE: 12.384038901601839
RMSE: 14.86375711301429


#Lecture Notes

In [67]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%204/MSFT_data.csv')

In [68]:
data['date'] = pd.to_datetime(data['date'])

In [69]:
data = data[['date', 'close']]

In [70]:
history = 7
future = 7

shifts = [x + future for x in list(range(1, history+1))]

for shift in shifts:
  data[f't-{shift}'] = data['close'].shift(shift)

In [71]:
data.dropna(inplace=True)

In [72]:
X = data.drop(['date', 'close'], 1)
y = data['close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [73]:
from sklearn.linear_model import LinearRegression

In [74]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [75]:
predictions = model.predict(X_test)

In [76]:
get_scores(y_test, predictions)

R-Squared: 0.9283261442423832
MAE: 1.6442799946472546
RMSE: 2.2055656717470096


In [77]:
fig = px.scatter(x=y_test, y=predictions, template='none')

fig.update_traces(marker_line_color='black',
                  marker_line_width=1)

fig.show()

In [64]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
get_scores(y_test, predictions)

R-Squared: -2.009230989625027
MAE: 11.683654618473899
RMSE: 14.291156405470572


In [65]:
sorted(zip(model.feature_importances_, X_train.columns), reverse=True)

[(0.9418776859264845, 't-8'),
 (0.026707698546937903, 't-11'),
 (0.01512508287887898, 't-14'),
 (0.005108074549272794, 't-10'),
 (0.004245312104199292, 't-9'),
 (0.0038183889185095265, 't-12'),
 (0.0031177570757170226, 't-13')]

In [96]:
data.head()

Unnamed: 0,date,close,t-8,t-9,t-10,t-11,t-12,t-13,t-14
14,2013-03-01,27.95,28.045,28.01,28.04,28.03,27.88,27.86,27.55
15,2013-03-04,28.15,27.87,28.045,28.01,28.04,28.03,27.88,27.86
16,2013-03-05,28.35,27.49,27.87,28.045,28.01,28.04,28.03,27.88
17,2013-03-06,28.09,27.76,27.49,27.87,28.045,28.01,28.04,28.03
18,2013-03-07,28.14,27.37,27.76,27.49,27.87,28.045,28.01,28.04


In [100]:
history = 7
future = 5

shifted = data.copy()[['close']]
shifted.columns = ['t+0']

In [101]:
shifted

Unnamed: 0,t+0
14,27.95
15,28.15
16,28.35
17,28.09
18,28.14
...,...
1254,94.26
1255,91.78
1256,88.00
1257,91.33


In [103]:
past_shifts = [x for x in range(1, history+1)]
future_shifts = [x for x in range(1, future)]

for shift in past_shifts:
  shifted[f't-{shift}'] = shifted['t+0'].shift(shift)

for shift in future_shifts:
  shifted[f't+{shift}'] = shifted['t+0'].shift(-shift)

In [104]:
shifted.head()

Unnamed: 0,t+0,t-1,t-2,t-3,t-4,t-5,t-6,t-7,t+1,t+2,t+3,t+4
14,27.95,,,,,,,,28.15,28.35,28.09,28.14
15,28.15,27.95,,,,,,,28.35,28.09,28.14,28.0
16,28.35,28.15,27.95,,,,,,28.09,28.14,28.0,27.87
17,28.09,28.35,28.15,27.95,,,,,28.14,28.0,27.87,27.91
18,28.14,28.09,28.35,28.15,27.95,,,,28.0,27.87,27.91,27.915


In [105]:
shifted.dropna(inplace=True)
shifted.head()

Unnamed: 0,t+0,t-1,t-2,t-3,t-4,t-5,t-6,t-7,t+1,t+2,t+3,t+4
21,27.91,27.87,28.0,28.14,28.09,28.35,28.15,27.95,27.915,28.135,28.035,28.1
22,27.915,27.91,27.87,28.0,28.14,28.09,28.35,28.15,28.135,28.035,28.1,28.18
23,28.135,27.915,27.91,27.87,28.0,28.14,28.09,28.35,28.035,28.1,28.18,28.315
24,28.035,28.135,27.915,27.91,27.87,28.0,28.14,28.09,28.1,28.18,28.315,28.11
25,28.1,28.035,28.135,27.915,27.91,27.87,28.0,28.14,28.18,28.315,28.11,28.25


In [106]:
X = shifted[[f't-{shift}' for shift in past_shifts]]
X

Unnamed: 0,t-1,t-2,t-3,t-4,t-5,t-6,t-7
21,27.870,28.000,28.140,28.09,28.35,28.15,27.95
22,27.910,27.870,28.000,28.14,28.09,28.35,28.15
23,27.915,27.910,27.870,28.00,28.14,28.09,28.35
24,28.135,27.915,27.910,27.87,28.00,28.14,28.09
25,28.035,28.135,27.915,27.91,27.87,28.00,28.14
...,...,...,...,...,...,...,...
1250,92.330,91.820,91.900,91.61,90.00,90.10,90.14
1251,94.060,92.330,91.820,91.90,91.61,90.00,90.10
1252,93.920,94.060,92.330,91.82,91.90,91.61,90.00
1253,92.740,93.920,94.060,92.33,91.82,91.90,91.61


In [107]:
future_shifts = [0] + future_shifts

In [125]:
models = []

for future_shift in future_shifts:
  y = shifted[f't+{future_shift}']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

  model = RandomForestRegressor()
  model.fit(X_train, y_train)

  predictions = model.predict(X_test)

  get_scores(y_test, predictions)

  pickled_2 = pickle.dump(model, open(f'model_{future_shift}.pkl', 'wb'))
  models.append(pickled_2)

R-Squared: -1.6218431492319576
MAE: 10.324088663967643
RMSE: 13.097017001699058
R-Squared: -1.8265891495247497
MAE: 11.012155060728707
RMSE: 13.676319452390882
R-Squared: -1.820288444416911
MAE: 11.00742550607279
RMSE: 13.687285354085672
R-Squared: -1.7806201631838956
MAE: 10.951906072874431
RMSE: 13.661712959683795
R-Squared: -1.6930808042802141
MAE: 10.73177085020241
RMSE: 13.489366289584346


In [127]:
models[0].predict(X_test)

AttributeError: ignored

In [130]:
model_0 = pickle.load(open('model_0.pkl', 'rb'))
result = model_0.score(X_test, y_test)
result

-1.7007467271482586