In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingRegressor # faster than GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [2]:
data = pd.read_csv('../data/data_2010_2021.csv', parse_dates=True)
data = data[data.Date>='2016-01-01']
data = data[~data.stock_closing_usd.isna()]
print(data.shape)
print(data.isnull().sum().sort_values(0, ascending=False))
data.head(2)

(1291, 597)
JODI_demand_HAITI              1291
JODI_demand_GRENADA            1291
JODI_demand_SURINAME           1291
JODI_demand_BANGLADESH         1291
JODI_demand_PARAGUAY           1291
                               ... 
S&P 500                           9
NYMEX CRUDE OIL FUTURES           9
NYMEX RBOB GASOLINE FUTURES       9
stock_closing_usd                 0
Date                              0
Length: 597, dtype: int64


Unnamed: 0,Date,stock_closing_usd,sentiment_global_index,sentiment_finance_index,DOW JONES COMPOSITE AVERAGE,DOW JONES INDUSTRIAL AVERAGE,DOW JONES TRANSPORTATION AVERAGE,DOW JONES UTILITY AVERAGE,S&P 500,ICE BRENT CRUDE OIL FUTURES,...,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_SULLIVAN COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_TIOGA COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_TOMPKINS COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_ULSTER COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_WARREN COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_WASHINGTON COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_WAYNE COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_WESTCHESTER COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_WYOMING COUNTY,WORKPLACES PERCENT CHANGE FROM BASELINE_NEW YORK_YATES COUNTY
2194,2016-01-04,77.46,,,5890.29,17148.94,7352.59,577.48,2012.66,37.22,...,,,,,,,,,,
2195,2016-01-05,78.12,,,5900.51,17158.66,7363.95,580.97,2016.71,36.42,...,,,,,,,,,,


In [3]:
feature = data.isnull().sum().sort_values(0, ascending=False).tail(10).index.to_list()
print(feature)

data_subset = data[feature]
print(data_subset.shape)
data_subset.head(2)

['DOW JONES TRANSPORTATION AVERAGE', 'DOW JONES COMPOSITE AVERAGE', 'DOW JONES INDUSTRIAL AVERAGE', 'ICE BRENT CRUDE OIL FUTURES', 'DOW JONES UTILITY AVERAGE', 'S&P 500', 'NYMEX CRUDE OIL FUTURES', 'NYMEX RBOB GASOLINE FUTURES', 'stock_closing_usd', 'Date']
(1291, 10)


Unnamed: 0,DOW JONES TRANSPORTATION AVERAGE,DOW JONES COMPOSITE AVERAGE,DOW JONES INDUSTRIAL AVERAGE,ICE BRENT CRUDE OIL FUTURES,DOW JONES UTILITY AVERAGE,S&P 500,NYMEX CRUDE OIL FUTURES,NYMEX RBOB GASOLINE FUTURES,stock_closing_usd,Date
2194,7352.59,5890.29,17148.94,37.22,577.48,2012.66,36.76,1.29,77.46,2016-01-04
2195,7363.95,5900.51,17158.66,36.42,580.97,2016.71,35.97,1.26,78.12,2016-01-05


In [4]:
# # forward fill the na data
# data_subset = data_subset.ffill()
# data_subset.isnull().sum()
# data_subset[data_subset.Date>'2020-02-27'].head(15)

# # drop na
data_subset = data_subset.dropna()

In [5]:
data_subset.describe()

Unnamed: 0,DOW JONES TRANSPORTATION AVERAGE,DOW JONES COMPOSITE AVERAGE,DOW JONES INDUSTRIAL AVERAGE,ICE BRENT CRUDE OIL FUTURES,DOW JONES UTILITY AVERAGE,S&P 500,NYMEX CRUDE OIL FUTURES,NYMEX RBOB GASOLINE FUTURES,stock_closing_usd
count,1282.0,1282.0,1282.0,1282.0,1282.0,1282.0,1282.0,1282.0,1282.0
mean,9851.123261,8017.955499,23835.983549,56.144727,747.272917,2719.345164,51.554938,1.582956,72.901217
std,1389.554011,1138.236432,3842.769203,12.53174,79.165564,464.221124,11.143953,0.312019,15.951972
min,6625.53,5466.87,15660.18,19.33,577.48,1829.08,-37.63,0.41,31.47
25%,9086.255,7187.5625,20825.6625,47.32,687.1675,2368.1425,45.33,1.4,69.15
50%,10064.395,8243.175,24742.87,55.905,731.115,2724.225,51.9,1.59,79.5
75%,10776.5025,8774.6375,26500.3675,64.9375,810.8075,2977.4,58.58,1.77,82.89
max,13630.55,10547.4,31961.86,86.29,960.89,3934.83,76.41,2.27,95.12


In [6]:
data_subset.columns

Index(['DOW JONES TRANSPORTATION AVERAGE', 'DOW JONES COMPOSITE AVERAGE',
       'DOW JONES INDUSTRIAL AVERAGE', 'ICE BRENT CRUDE OIL FUTURES',
       'DOW JONES UTILITY AVERAGE', 'S&P 500', 'NYMEX CRUDE OIL FUTURES',
       'NYMEX RBOB GASOLINE FUTURES', 'stock_closing_usd', 'Date'],
      dtype='object')

In [7]:
# normalize variables
X = preprocessing.normalize(data_subset[['DOW JONES TRANSPORTATION AVERAGE', 'DOW JONES COMPOSITE AVERAGE',
       'DOW JONES INDUSTRIAL AVERAGE', 'ICE BRENT CRUDE OIL FUTURES',
       'DOW JONES UTILITY AVERAGE', 'S&P 500', 'NYMEX CRUDE OIL FUTURES',
       'NYMEX RBOB GASOLINE FUTURES']])
X

array([[3.73642497e-01, 2.99331618e-01, 8.71471517e-01, ...,
        1.02278967e-01, 1.86806257e-03, 6.55549706e-05],
       [3.73910247e-01, 2.99602951e-01, 8.71244211e-01, ...,
        1.02400007e-01, 1.82640452e-03, 6.39774729e-05],
       [3.72153418e-01, 3.00056090e-01, 8.71798793e-01, ...,
        1.02629476e-01, 1.75169240e-03, 5.98164021e-05],
       ...,
       [3.73096930e-01, 2.88704605e-01, 8.74863585e-01, ...,
        1.07447306e-01, 1.73046487e-03, 5.20070112e-05],
       [3.72654184e-01, 2.88817819e-01, 8.75101921e-01, ...,
        1.06714914e-01, 1.77043524e-03, 5.26699606e-05],
       [3.76405909e-01, 2.88883355e-01, 8.73369669e-01, ...,
        1.07607106e-01, 1.73644097e-03, 5.30814476e-05]])

In [8]:
y = data_subset['stock_closing_usd']
y

2194    77.46
2195    78.12
2196    77.47
2197    76.23
2198    74.69
        ...  
4070    54.30
4071    55.05
4072    56.70
4073    55.76
4074    54.37
Name: stock_closing_usd, Length: 1282, dtype: float64

In [9]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=8675309)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(961, 8) (321, 8) (961,) (321,)


In [10]:
LR = LinearRegression().fit(X_train, y_train)
DT = DecisionTreeRegressor().fit(X_train, y_train)
RF = RandomForestRegressor().fit(X_train, y_train)
GB = GradientBoostingRegressor().fit(X_train, y_train)
HGB = HistGradientBoostingRegressor().fit(X_train, y_train)
XGB = XGBRegressor().fit(X_train, y_train)
LGBM = LGBMRegressor().fit(X_train, y_train)
CB = CatBoostRegressor().fit(X_train, y_train)

ning: 570ms
616:	learn: 0.8895050	total: 916ms	remaining: 569ms
617:	learn: 0.8884334	total: 917ms	remaining: 567ms
618:	learn: 0.8873286	total: 918ms	remaining: 565ms
619:	learn: 0.8870899	total: 920ms	remaining: 564ms
620:	learn: 0.8859570	total: 921ms	remaining: 562ms
621:	learn: 0.8847718	total: 922ms	remaining: 560ms
622:	learn: 0.8840313	total: 923ms	remaining: 559ms
623:	learn: 0.8835356	total: 924ms	remaining: 557ms
624:	learn: 0.8829541	total: 925ms	remaining: 555ms
625:	learn: 0.8819185	total: 927ms	remaining: 554ms
626:	learn: 0.8813329	total: 928ms	remaining: 552ms
627:	learn: 0.8804734	total: 929ms	remaining: 551ms
628:	learn: 0.8794041	total: 931ms	remaining: 549ms
629:	learn: 0.8787543	total: 932ms	remaining: 547ms
630:	learn: 0.8774188	total: 933ms	remaining: 546ms
631:	learn: 0.8762291	total: 935ms	remaining: 544ms
632:	learn: 0.8745759	total: 936ms	remaining: 543ms
633:	learn: 0.8735642	total: 937ms	remaining: 541ms
634:	learn: 0.8730136	total: 938ms	remaining: 539ms


In [11]:
# return R2
LR.score(X, y)
DT.score(X, y)
RF.score(X, y)
GB.score(X,y)
HGB.score(X,y)
XGB.score(X,y)
LGBM.score(X,y)
CB.score(X,y)

0.9964278055629735

In [12]:
y_pred = LR.predict(X_test)
mean_squared_error(y_test, y_pred)

19.627296658602503

In [13]:
y_pred = DT.predict(X_test)
mean_squared_error(y_test, y_pred)

5.223798753894081

In [14]:
y_pred = RF.predict(X_test)
mean_squared_error(y_test, y_pred)

2.8317493208722744

In [15]:
y_pred = GB.predict(X_test)
mean_squared_error(y_test, y_pred)

4.255753058143

In [16]:
y_pred = HGB.predict(X_test)
mean_squared_error(y_test, y_pred)

3.3041965458656235

In [17]:
y_pred = XGB.predict(X_test)
mean_squared_error(y_test, y_pred)

3.5388052694426446

In [18]:
y_pred = LGBM.predict(X_test)
mean_squared_error(y_test, y_pred)

3.3528027925203165

In [19]:
y_pred = CB.predict(X_test)
mean_squared_error(y_test, y_pred)

2.3778247131616936

In [20]:
import numpy as np
pd.DataFrame(np.array([y_test, y_pred]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,311,312,313,314,315,316,317,318,319,320
0,76.36,81.36,42.0,49.24,83.01,80.6,68.5,68.91,67.19,33.74,...,36.49,82.19,82.42,83.83,80.86,82.9,55.05,75.74,86.84,75.37
1,76.710763,81.485448,42.831874,45.434723,82.040642,80.74999,69.266482,69.271216,67.852414,34.26499,...,39.324425,82.147277,82.150494,86.736474,80.389033,83.069709,53.814115,76.405996,86.288367,75.812257
