In [128]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingRegressor # faster than GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [129]:
data = pd.read_csv('../data/data_2010_2021.csv', parse_dates=['Date']).set_index('Date')
data = data[data.index>='2020-03-01']
data['date'] = data.index
data = data[~data.stock_closing_usd.isna()]
print(data.shape)
print(data.isnull().sum().sort_values(0, ascending=False))
data.head(2)

(244, 286)
JODI_demand_DOMINICAN REPUBLIC                            244
JODI_demand_LIBYA                                         244
JODI_demand_NICARAGUA                                     244
JODI_demand_MYANMAR                                       244
JODI_demand_MOROCCO                                       244
JODI_demand_GUATEMALA                                     244
JODI_demand_CUBA                                          244
JODI_demand_MALAYSIA                                      244
JODI_demand_MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF    244
JODI_demand_KUWAIT                                        244
JODI_demand_HAITI                                         244
JODI_demand_GUYANA                                        244
JODI_demand_KAZAKHSTAN                                    244
JODI_demand_BOLIVIA, PLURINATIONAL STATE OF               244
JODI_demand_IRAN, ISLAMIC REPUBLIC OF                     244
JODI_demand_BELIZE                                        2

Unnamed: 0_level_0,stock_closing_usd,sentiment_global_index,sentiment_finance_index,DOW JONES COMPOSITE AVERAGE,DOW JONES INDUSTRIAL AVERAGE,DOW JONES TRANSPORTATION AVERAGE,DOW JONES UTILITY AVERAGE,S&P 500,ICE BRENT CRUDE OIL FUTURES,NYMEX CRUDE OIL FUTURES,...,WALKING_UNITED STATES_WEST VIRGINIA,WALKING_UNITED STATES_WISCONSIN,WALKING_UNITED STATES_WYOMING,GROCERY AND PHARMACY PERCENT CHANGE FROM BASELINE,PARKS PERCENT CHANGE FROM BASELINE,RESIDENTIAL PERCENT CHANGE FROM BASELINE,RETAIL AND RECREATION PERECENT CHANGE FROM BASELINE,TRANSIT STATIONS PERCENT CHANGE FROM BASELINE,WORKPLACES PERCENT CHANGE FROM BASELINE,date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-01,51.44,,,,,,,,,,...,64.19,141.77,107.8,10.852459,15.666667,-1.592593,19.145161,6.357143,5.37931,2020-03-01
2020-03-02,51.44,,,8746.05,26703.32,9475.68,886.52,3090.23,51.9,46.75,...,73.6,119.16,132.92,8.387097,20.6,-0.903226,10.032258,3.678571,4.66129,2020-03-02


In [218]:
# feature = data.isnull().sum().sort_values(0, ascending=False).tail(141).index.to_list()
basic_feature = ['date','stock_closing_usd']
mobility_feature = ['TRANSIT STATIONS PERCENT CHANGE FROM BASELINE','RESIDENTIAL PERCENT CHANGE FROM BASELINE','PARKS PERCENT CHANGE FROM BASELINE','GROCERY AND PHARMACY PERCENT CHANGE FROM BASELINE','WORKPLACES PERCENT CHANGE FROM BASELINE','RETAIL AND RECREATION PERECENT CHANGE FROM BASELINE']
outlook_feature = ['CRUDE OIL AND LIQUID FUELS SUPPLY_MILLION BARRELS PER DAY','CRUDE OIL INVENTORY (EXCLUDING SPR)_MILLION BARRELS, END-OF-PERIOD','IMPORTED  CRUDE OIL REAL PRICE_REAL DOLLARS PER BARREL','OPEC TOTAL CRUDE OIL PRODUCTION CAPACITY_MILLION BARRELS PER DAY','U.S. CRUDE OIL PRODUCTION_MILLION BARRELS PER DAY']

feature = basic_feature + mobility_feature
print(feature)

data_subset = data[feature].sort_values('date')
print(data_subset.shape)
data_subset.head(2)

['date', 'stock_closing_usd', 'TRANSIT STATIONS PERCENT CHANGE FROM BASELINE', 'RESIDENTIAL PERCENT CHANGE FROM BASELINE', 'PARKS PERCENT CHANGE FROM BASELINE', 'GROCERY AND PHARMACY PERCENT CHANGE FROM BASELINE', 'WORKPLACES PERCENT CHANGE FROM BASELINE', 'RETAIL AND RECREATION PERECENT CHANGE FROM BASELINE']
(244, 8)


Unnamed: 0_level_0,date,stock_closing_usd,TRANSIT STATIONS PERCENT CHANGE FROM BASELINE,RESIDENTIAL PERCENT CHANGE FROM BASELINE,PARKS PERCENT CHANGE FROM BASELINE,GROCERY AND PHARMACY PERCENT CHANGE FROM BASELINE,WORKPLACES PERCENT CHANGE FROM BASELINE,RETAIL AND RECREATION PERECENT CHANGE FROM BASELINE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-01,2020-03-01,51.44,6.357143,-1.592593,15.666667,10.852459,5.37931,19.145161
2020-03-02,2020-03-02,51.44,3.678571,-0.903226,20.6,8.387097,4.66129,10.032258


In [219]:
if len(set(data_subset.columns) & set(outlook_feature))>0:
    for feature in set(data_subset.columns) & set(outlook_feature):
        print('Filling backward for ', feature)
        data_subset[feature] = data_subset[feature].bfill()

In [220]:
data_subset_feature = data_subset[mobility_feature].shift(7)
data_subset = pd.merge(data_subset['stock_closing_usd'],data_subset_feature,left_index=True, right_index=True)

# data_subset_7 = data_subset.shift(7)
# data_subset_30 = data_subset.shift(30)
# data_subset = pd.merge(data_subset_7, data_subset_30, suffixes=['_7','_30'], left_index=True, right_index=True)

# feature engineering
data_subset['workplaces_percent_changes'] = data_subset['WORKPLACES PERCENT CHANGE FROM BASELINE'].diff()
# data_subset['parks_percent_changes'] = data_subset['PARKS PERCENT CHANGE FROM BASELINE'].diff()

data_subset.head(10)

Unnamed: 0_level_0,stock_closing_usd,TRANSIT STATIONS PERCENT CHANGE FROM BASELINE,RESIDENTIAL PERCENT CHANGE FROM BASELINE,PARKS PERCENT CHANGE FROM BASELINE,GROCERY AND PHARMACY PERCENT CHANGE FROM BASELINE,WORKPLACES PERCENT CHANGE FROM BASELINE,RETAIL AND RECREATION PERECENT CHANGE FROM BASELINE,workplaces_percent_changes
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-01,51.44,,,,,,,
2020-03-02,51.44,,,,,,,
2020-03-03,53.88,,,,,,,
2020-03-04,51.3,,,,,,,
2020-03-05,52.42,,,,,,,
2020-03-06,50.11,,,,,,,
2020-03-07,47.69,,,,,,,
2020-03-08,47.69,6.357143,-1.592593,15.666667,10.852459,5.37931,19.145161,
2020-03-09,40.88,3.678571,-0.903226,20.6,8.387097,4.66129,10.032258,-0.71802
2020-03-11,43.41,4.193548,-1.032258,13.045455,8.967742,3.645161,10.934426,-1.016129


In [221]:
# # forward fill the na data
# data_subset = data_subset.ffill()
# data_subset.isnull().sum()
# data_subset[data_subset.Date>'2020-02-27'].head(15)

# # drop na
data_subset = data_subset.dropna()

In [222]:
data_subset.describe()

Unnamed: 0,stock_closing_usd,TRANSIT STATIONS PERCENT CHANGE FROM BASELINE,RESIDENTIAL PERCENT CHANGE FROM BASELINE,PARKS PERCENT CHANGE FROM BASELINE,GROCERY AND PHARMACY PERCENT CHANGE FROM BASELINE,WORKPLACES PERCENT CHANGE FROM BASELINE,RETAIL AND RECREATION PERECENT CHANGE FROM BASELINE,workplaces_percent_changes
count,236.0,236.0,236.0,236.0,236.0,236.0,236.0,236.0
mean,42.025466,-30.884384,10.844128,39.857129,-0.944195,-32.10087,-18.304375,-0.162351
std,5.490589,12.158621,4.779025,47.050734,10.704428,10.035654,14.083215,5.243835
min,31.47,-58.794118,-1.678571,-48.730769,-36.574074,-56.629032,-53.419355,-20.467742
25%,37.915,-37.978571,7.870536,-2.269565,-6.820833,-35.44506,-25.824561,-0.979839
50%,42.645,-31.828571,9.881795,35.86039,-1.846467,-32.725806,-15.965517,-0.150957
75%,45.0725,-22.953125,13.071822,82.857143,6.894143,-27.354839,-9.013158,0.921564
max,56.7,12.413793,24.653846,140.166667,37.322581,5.017544,19.241935,28.0


In [223]:
data_subset.columns

Index(['stock_closing_usd', 'TRANSIT STATIONS PERCENT CHANGE FROM BASELINE',
       'RESIDENTIAL PERCENT CHANGE FROM BASELINE',
       'PARKS PERCENT CHANGE FROM BASELINE',
       'GROCERY AND PHARMACY PERCENT CHANGE FROM BASELINE',
       'WORKPLACES PERCENT CHANGE FROM BASELINE',
       'RETAIL AND RECREATION PERECENT CHANGE FROM BASELINE',
       'workplaces_percent_changes'],
      dtype='object')

In [224]:
# normalize variables
# X = preprocessing.normalize(data_subset[mobility_feature])
# X

In [225]:
X = data_subset.drop(columns=['stock_closing_usd'])
# X = data_subset.drop(columns=['date_7','date_30','stock_closing_usd_7','stock_closing_usd_30'])
np.array(X)

array([[  3.67857143,  -0.90322581,  20.6       , ...,   4.66129032,
         10.03225806,  -0.71802002],
       [  4.19354839,  -1.03225806,  13.04545455, ...,   3.64516129,
         10.93442623,  -1.01612903],
       [  2.66666667,  -0.61904762,  13.52      , ...,   3.06451613,
          8.48387097,  -0.58064516],
       ...,
       [-36.25714286,  12.55      , -22.59375   , ..., -28.67741935,
        -24.95      ,  -1.29032258],
       [-42.87878788,  15.79661017, -21.56      , ..., -40.53225806,
        -35.70175439, -11.85483871],
       [-32.6969697 ,  11.01639344, -15.04166667, ..., -32.93548387,
        -16.31034483,   7.59677419]])

In [226]:
y = data_subset['stock_closing_usd']
y

Date
2020-03-09    40.88
2020-03-11    43.41
2020-03-16    38.12
2020-03-17    34.49
2020-03-19    33.37
2020-03-20    32.83
2020-03-21    32.74
2020-03-22    32.74
2020-03-23    31.47
2020-03-29    36.95
2020-03-30    36.95
2020-03-31    38.30
2020-04-01    36.49
2020-04-02    38.63
2020-04-03    42.07
2020-04-04    39.21
2020-04-05    39.21
2020-04-16    39.15
2020-04-17    43.22
2020-04-20    41.18
2020-04-21    40.96
2020-04-22    42.13
2020-04-23    43.45
2020-04-24    43.73
2020-04-27    43.94
2020-04-28    44.97
2020-04-29    47.46
2020-04-30    46.47
2020-05-01    43.14
2020-05-04    44.88
              ...  
2021-01-14    50.31
2021-01-15    47.89
2021-01-19    48.84
2021-01-20    49.53
2021-01-21    48.11
2021-01-22    47.43
2021-01-25    46.90
2021-01-26    45.87
2021-01-27    45.35
2021-01-28    46.06
2021-01-29    44.84
2021-02-01    44.92
2021-02-02    45.63
2021-02-03    47.42
2021-02-04    48.33
2021-02-05    49.95
2021-02-08    52.10
2021-02-09    50.63
2021-02-10    5

In [227]:
data_subset.shape

(236, 8)

In [228]:
# sample_data = pd.read_csv('..\sample_dataset\sample_data.csv')
# data = sample_data.copy()
# X = data[['Daily News Sentiment','Value'
# y = data[['Value']

In [229]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=8675309)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(188, 7) (48, 7) (188,) (48,)


In [230]:
model_dict = {
    'LR': LinearRegression(),
    'DT': DecisionTreeRegressor(),
    'RF': RandomForestRegressor(),
    'GB': GradientBoostingRegressor(),
    'HGB': HistGradientBoostingRegressor(),
    'XGB': XGBRegressor(),
    'LGBM': LGBMRegressor(),
    'CB': CatBoostRegressor(verbose=0),
    'SVR': SVR(C=1.0, epsilon=0.2),
    'LSVR': LinearSVR(random_state=0, tol=1e-5),
    'SGD': SGDRegressor(max_iter=1000, tol=1e-3)}

In [231]:
model_performance = pd.DataFrame(columns=['model','r2','mse'])

for model_name in model_dict.values():

    model = model_name.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = model.score(X, y)
    mse = mean_squared_error(y_test, y_pred)

    print('***************************************************\n',model_name)
    print('R2: ',r2)
    print('MSE: ',mse)

    performance = pd.DataFrame(np.array([[model_name, r2, mse]]), columns=['model','r2','mse'])
    model_performance = model_performance.append(performance)

model_performance = model_performance.reset_index(drop=True)

***************************************************
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
R2:  0.2937265007391531
MSE:  23.692389572240938
***************************************************
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
R2:  0.8940547930735583
MSE:  15.636722916666672
***************************************************
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_f

In [232]:
model_performance

Unnamed: 0,model,r2,mse
0,"LinearRegression(copy_X=True, fit_intercept=Tr...",0.293727,23.6924
1,"DecisionTreeRegressor(criterion='mse', max_dep...",0.894055,15.6367
2,"(DecisionTreeRegressor(criterion='mse', max_de...",0.872353,12.4942
3,([DecisionTreeRegressor(criterion='friedman_ms...,0.89164,13.701
4,HistGradientBoostingRegressor(l2_regularizatio...,0.854258,12.2947
5,"XGBRegressor(base_score=0.5, booster='gbtree',...",0.940759,8.74347
6,"LGBMRegressor(boosting_type='gbdt', class_weig...",0.842635,13.432
7,<catboost.core.CatBoostRegressor object at 0x0...,0.932418,9.58392
8,"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3...",0.220705,32.0621
9,"LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_i...",-0.905427,66.5672


## Graveyard

In [80]:
# LR = LinearRegression().fit(X_train, y_train)
# DT = DecisionTreeRegressor().fit(X_train, y_train)
# RF = RandomForestRegressor().fit(X_train, y_train)
# GB = GradientBoostingRegressor().fit(X_train, y_train)
# HGB = HistGradientBoostingRegressor().fit(X_train, y_train)
# XGB = XGBRegressor().fit(X_train, y_train)
# LGBM = LGBMRegressor().fit(X_train, y_train)
# CB = CatBoostRegressor().fit(X_train, y_train)

8895050	total: 682ms	remaining: 423ms
617:	learn: 0.8884334	total: 683ms	remaining: 422ms
618:	learn: 0.8873286	total: 684ms	remaining: 421ms
619:	learn: 0.8870899	total: 685ms	remaining: 420ms
620:	learn: 0.8859570	total: 686ms	remaining: 418ms
621:	learn: 0.8847718	total: 687ms	remaining: 417ms
622:	learn: 0.8840313	total: 688ms	remaining: 416ms
623:	learn: 0.8835356	total: 689ms	remaining: 415ms
624:	learn: 0.8829541	total: 690ms	remaining: 414ms
625:	learn: 0.8819185	total: 691ms	remaining: 413ms
626:	learn: 0.8813329	total: 692ms	remaining: 412ms
627:	learn: 0.8804734	total: 694ms	remaining: 411ms
628:	learn: 0.8794041	total: 695ms	remaining: 410ms
629:	learn: 0.8787543	total: 696ms	remaining: 409ms
630:	learn: 0.8774188	total: 697ms	remaining: 407ms
631:	learn: 0.8762291	total: 698ms	remaining: 406ms
632:	learn: 0.8745759	total: 699ms	remaining: 405ms
633:	learn: 0.8735642	total: 700ms	remaining: 404ms
634:	learn: 0.8730136	total: 701ms	remaining: 403ms
635:	learn: 0.8706550	tota

In [81]:
# # return R2
# LR.score(X, y)
# DT.score(X, y)
# RF.score(X, y)
# GB.score(X,y)
# HGB.score(X,y)
# XGB.score(X,y)
# LGBM.score(X,y)
# CB.score(X,y)

0.9964278055629735

In [12]:
# y_pred = LR.predict(X_test)
# mean_squared_error(y_test, y_pred)

# y_pred = DT.predict(X_test)
# mean_squared_error(y_test, y_pred)

# y_pred = RF.predict(X_test)
# mean_squared_error(y_test, y_pred)

# y_pred = GB.predict(X_test)
# mean_squared_error(y_test, y_pred)

# y_pred = HGB.predict(X_test)
# mean_squared_error(y_test, y_pred)

# y_pred = XGB.predict(X_test)
# mean_squared_error(y_test, y_pred)

# y_pred = LGBM.predict(X_test)
# mean_squared_error(y_test, y_pred)

# y_pred = CB.predict(X_test)
# mean_squared_error(y_test, y_pred)

19.627296658602503

In [91]:
# import numpy as np
# pd.DataFrame(np.array([y_test.transpose(), y_pred.transpose()]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,311,312,313,314,315,316,317,318,319,320
0,76.36,81.36,42.0,49.24,83.01,80.6,68.5,68.91,67.19,33.74,...,36.49,82.19,82.42,83.83,80.86,82.9,55.05,75.74,86.84,75.37
1,76.710763,81.485448,42.831874,45.434723,82.040642,80.74999,69.266482,69.271216,67.852414,34.26499,...,39.324425,82.147277,82.150494,86.736474,80.389033,83.069709,53.814115,76.405996,86.288367,75.812257
