In [None]:
# !pip install -q hvplot
# !pip install --upgrade pandas
# !pip install --upgrade pandas-datareader
# !pip install -q yfinance
# !pip install catboost
# !pip install xgboost
# !pip install ThymeBoost

In [None]:
# !cat /proc/meminfo

In [None]:
# !cat /proc/cpuinfo

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
%matplotlib inline

# For reading stock data from yahoo
from pandas_datareader.data import DataReader

# Pre-processing
from ThymeBoost import ThymeBoost as tb
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from scipy.stats.mstats import winsorize

# Forecasting method
from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import AdaBoostRegressor

# Folding
from sklearn.model_selection import TimeSeriesSplit

# Miscellaneous
from scipy.stats import kurtosis, skew
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [None]:
# The tech stocks we'll use for this analysis
tech_list = ['AAPL', 'GOOG', 'MSFT', 'AMZN']

# Date range of the download, handed to yf.download as start/end
end = '2022-01-01'
start = '1992-01-01'

# Bind each ticker's price history to a module-level variable named after the
# ticker (AAPL, GOOG, MSFT, AMZN); later cells refer to these names directly.
for stock in tech_list:
    globals()[stock] = yf.download(stock, start=start, end=end)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [None]:
company_list = [AAPL, GOOG, MSFT, AMZN]
company_name = ["APPLE", "GOOGLE", "MICROSOFT", "AMAZON"]

# Tag every row with its company so rows stay identifiable once stacked.
# The column is added to the downloaded frames themselves (AAPL, GOOG, ...).
for company, com_name in zip(company_list, company_name):
    company["company_name"] = com_name

# Stack the four frames row-wise. The result is named `all_stocks` instead of
# `all` so the builtin all() is not shadowed for the rest of the notebook.
all_stocks = pd.concat(company_list, axis=0)

In [None]:
# Keyword arguments unpacked into lgb.LGBMRegressor(**lgbm_params) by
# Prediction.lgbm.
lgbm_params = dict(
    nthread=20,
    max_depth=1,
    bagging_freq=10,
    bagging_fraction=0.95,
    n_estimators=1000,
    learning_rate=0.01,
    boosting_type='rf',
    objective='regression_l1',
)

In [None]:
# Keyword arguments unpacked into xgb.XGBRegressor(**xgb_params) by
# Prediction.xgboost.
xgb_params = dict(
    max_depth=1,
    min_child_weight=1,
    eta=0.3,
    subsample=1,
    colsample_bytree=1,
    n_estimators=1000,
)

In [None]:
class Prediction:
  """Holds one train/validation/test split and fits boosted regressors on it.

  Each model method fits on (train_x, train_y), passes (val_x, val_y) as the
  evaluation set with early stopping after 50 rounds, and returns the model's
  predictions for test_x.
  """

  def __init__(self, train_x, test_x, train_y, test_y, val_x, val_y):
    # Feature parts of the split
    self.train_x, self.val_x, self.test_x = train_x, val_x, test_x
    # Matching targets
    self.train_y, self.val_y, self.test_y = train_y, val_y, test_y
    # Not written to by any method of this class
    self.prediction = []

  def lgbm(self):
    """Fit a LightGBM regressor built from lgbm_params; return test predictions."""
    regressor = lgb.LGBMRegressor(**lgbm_params)
    regressor.fit(
        self.train_x,
        self.train_y,
        eval_set=[(self.val_x, self.val_y)],
        early_stopping_rounds=50,
    )
    return regressor.predict(self.test_x)

  def catboost(self):
    """Fit a 1000-estimator CatBoost regressor; return test predictions."""
    regressor = CatBoostRegressor(n_estimators=1000)
    regressor.fit(
        self.train_x,
        self.train_y,
        eval_set=(self.val_x, self.val_y),
        early_stopping_rounds=50,
    )
    return regressor.predict(self.test_x)

  def xgboost(self):
    """Fit an XGBoost regressor built from xgb_params; return test predictions."""
    regressor = xgb.XGBRegressor(**xgb_params)
    regressor.fit(
        self.train_x,
        self.train_y,
        eval_set=[(self.val_x, self.val_y)],
        early_stopping_rounds=50,
    )
    return regressor.predict(self.test_x)


In [None]:
class GetRidofOutlier:
  """Outlier detection, scaling and imputation helpers around one column.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to work on. Only outliers_to_nan() modifies it; every other
      method leaves it untouched.
  col : str
      Column inspected by the outlier routines (tukeys_method, fences,
      winsor, outliers_to_nan, thyme_outlier).
  """

  def __init__(self, df, col):
    self.df = df
    self.col = col

  def _feature_frame(self):
    """Return self.df with 'Date' moved into the index when it is a column.

    set_index() is used without inplace, so self.df is never modified and the
    scalers/imputer can be called any number of times, in any order.
    """
    if 'Date' in self.df.columns:
      return self.df.set_index('Date')
    return self.df

  def _scaled(self, scaler):
    """Fit `scaler` on the feature frame; return the result as a DataFrame.

    The returned frame keeps the column names but gets a fresh default index.
    """
    frame = self._feature_frame()
    return pd.DataFrame(scaler.fit_transform(frame.values), columns=frame.columns)

  def _positions_outside(self, lower, upper):
    """Positional indices (as Python ints) of values <= lower or >= upper."""
    values = self.df[self.col]
    outside = (values <= lower) | (values >= upper)
    return np.flatnonzero(outside.to_numpy()).tolist()

  def tukeys_method(self):
    """Flag outliers in self.col with Tukey's fences.

    Returns
    -------
    (outliers_prob, outliers_poss) : tuple of list of int
        Row positions at or beyond the outer fences (3 * IQR from the
        quartiles) and at or beyond the inner fences (1.5 * IQR), in that
        order. Every probable outlier is also listed as a possible one.
    """
    q1 = self.df[self.col].quantile(0.25)
    q3 = self.df[self.col].quantile(0.75)
    inner_fence = 1.5*(q3 - q1)
    outer_fence_le, outer_fence_ue = self.fences()

    outliers_prob = self._positions_outside(outer_fence_le, outer_fence_ue)
    outliers_poss = self._positions_outside(q1 - inner_fence, q3 + inner_fence)

    return outliers_prob, outliers_poss

  def robust(self):
    """Scale all non-Date columns with RobustScaler (25th-75th percentile range)."""
    model = RobustScaler(
        with_centering=True,
        with_scaling=True,
        quantile_range=(25.0, 75.0),
        copy=True)

    return self._scaled(model)

  def standard(self):
    """Scale all non-Date columns with StandardScaler."""
    return self._scaled(StandardScaler())

  def minmax(self):
    """Scale all non-Date columns with MinMaxScaler."""
    return self._scaled(MinMaxScaler())

  def fences(self):
    """Return the (lower, upper) outer Tukey fences of self.col: quartiles -/+ 3 * IQR."""
    q1 = self.df[self.col].quantile(0.25)
    q3 = self.df[self.col].quantile(0.75)
    outer_fence = 3*(q3 - q1)

    return q1 - outer_fence, q3 + outer_fence

  def winsor(self):
    """Return a deep copy of self.df with two upper-tail winsorized versions of self.col."""
    df_win = self.df.copy(deep=True)

    # limits=(0, p): leave the lower tail alone, clip the top fraction p.
    df_win['Close_95%'] = winsorize(self.df[self.col], limits=(0, 0.05))
    # NOTE(review): 0.075 clips the top 7.5%, which does not match the
    # 'Close_975%' label -- confirm whether 0.025 was intended.
    df_win['Close_975%'] = winsorize(self.df[self.col], limits=(0, 0.075))

    return df_win

  def outliers_to_nan(self, probable_outliers):
    """Overwrite self.col with NaN at the given row positions and return self.df.

    The frame is modified in place, so the caller's frame changes too. A
    single .iloc assignment on the frame replaces the former chained
    `self.df.Close.iloc[...] = None`, which was tied to a column literally
    named Close and is not guaranteed to write through to the frame.
    """
    if len(probable_outliers):
      column_position = self.df.columns.get_loc(self.col)
      self.df.iloc[probable_outliers, column_position] = np.nan

    return self.df

  def thyme_outlier(self):
    """Run ThymeBoost.detect_outliers on self.col; its 'y' output column is renamed to 'Close'."""
    model = tb.ThymeBoost()
    output = model.detect_outliers(
        self.df[self.col],
        trend_estimator='linear',
        seasonal_estimator='fourier',
        global_cost='maicc',
        fit_type='global',
        seasonality_weights='regularize')
    output = output.rename(columns={'y':'Close'})

    return output

  def impute(self):
    """Fill missing values in the non-Date columns with a BayesianRidge IterativeImputer.

    The returned frame keeps the column names but gets a fresh default index.
    """
    imputer = IterativeImputer(
        estimator=BayesianRidge(),
        initial_strategy='mean',
        n_nearest_features=None,
        imputation_order='ascending')
    frame = self._feature_frame()
    imputer.fit(frame)
    df_imp_tf = imputer.transform(frame)
    df_imp = pd.DataFrame(df_imp_tf, columns = frame.columns)

    return df_imp

In [None]:
class Meong:
  """Chronological splitting and model-input preparation for a frame with a 'Close' column.

  Parameters
  ----------
  df : pandas.DataFrame
      Rows in time order; must contain the target column 'Close' and at least
      one other column to serve as a feature.
  ratio : float
      Fraction of rows, taken from the end, that forms the test set.
  """

  def __init__(self, df, ratio):
    self.df = df
    self.ratio = ratio
    # Not used by any method of this class; kept for compatibility.
    self.a = []
    self.b = []

  def data_split(self):
    """Return (train, test): the first (1 - ratio) share of rows and the remainder."""
    data = self.df
    n = int(len(data)*(1 - self.ratio))

    return data[:n], data[n:]

  def _split_xy(self, target='Close', val_fraction=0.25):
    """Build the six pieces Prediction expects, all in time order.

    The test part is exactly the second frame returned by data_split(), so
    predictions made on test_x line up row by row with that frame. The last
    `val_fraction` of the remaining rows becomes the validation part. The
    target column is excluded from the features.

    Returns (train_x, test_x, train_y, test_y, val_x, val_y).
    Raises ValueError when no column other than the target is available.
    """
    features = [name for name in self.df.columns if name != target]
    if not features:
      raise ValueError(f"no feature columns besides the target {target!r}")

    fit_part, test_part = self.data_split()
    n_train = int(len(fit_part)*(1 - val_fraction))
    train_part = fit_part.iloc[:n_train]
    val_part = fit_part.iloc[n_train:]

    return (
        train_part[features], test_part[features],
        train_part[target], test_part[target],
        val_part[features], val_part[target],
    )

  def pred(self):
    """Return a Prediction holding chronological train/validation/test data.

    Earlier versions shuffled the rows with train_test_split and kept 'Close'
    among the features. The shuffled test rows then differed from the frame
    returned by data_split(), and the models were given the value they were
    asked to predict.
    """
    predict = Prediction(*self._split_xy())

    return predict

In [None]:
def acc(a, b):
  """Return (sMAPE in percent rounded to 2 decimals, RMSE, MAE) between a and b.

  Both inputs are flattened to float arrays first, so a pandas Series is
  compared by position and never re-aligned on its index. sMAPE terms whose
  denominator is zero (both values are 0) count as 0 instead of NaN. RMSE and
  MAE are computed with numpy directly, so the result does not depend on the
  `squared=` keyword of sklearn's mean_squared_error, which recent
  scikit-learn releases no longer accept.

  Raises ValueError for empty input or inputs of different length.
  """
  a = np.asarray(a, dtype=float).ravel()
  b = np.asarray(b, dtype=float).ravel()
  if a.size != b.size:
    raise ValueError(f"inputs differ in length: {a.size} vs {b.size}")
  if a.size == 0:
    raise ValueError("cannot score empty inputs")

  error = np.abs(a - b)
  scale = (np.abs(a) + np.abs(b))/2
  # Where scale is 0 the pre-filled 0.0 is kept, avoiding a 0/0 division.
  ratio = np.divide(error, scale, out=np.zeros_like(error), where=scale != 0)

  smape = round(np.mean(ratio)*100, 2)
  rmse = float(np.sqrt(np.mean(error**2)))
  mae = float(np.mean(error))

  return smape, rmse, mae

In [None]:
# Square-root every column of AAPL except company_name, take first
# differences, drop the rows that contain a missing value, and turn the index
# back into a regular column.
diff = (
    np.sqrt(AAPL.loc[:, AAPL.columns != 'company_name'])
    .diff()
    .dropna()
    .reset_index()
)

In [None]:
imp = GetRidofOutlier(diff, 'Close')

# # Scaler
# rob = imp.robust()
# stand = imp.standard()
# minmax = imp.minmax()

# Outlier handling
outlier = imp.thyme_outlier()
# tukeys_method() returns (probable, possible) in that order; the names were
# previously unpacked the other way round.
prob, poss = imp.tukeys_method()

# # Impute outlier
# nan_val = imp.outliers_to_nan(poss)

In [None]:
# imp_val = GetRidofOutlier(nan_val, 'Close')
# meong = imp_val.impute()

In [None]:
# Keep the rows whose 'outliers' flag is not True and drop the flag column,
# both in a single .loc selection.
mew = outlier.loc[outlier['outliers'] != True, outlier.columns != 'outliers']

# diff.drop(index=diff.iloc[prob].index.tolist(), inplace=True)

In [None]:
# Kurtosis and skewness of the differenced Close column
close_kurtosis = kurtosis(diff.Close)
close_skewness = skew(diff.Close)
print(f"Kurtosis = {close_kurtosis}")
print(f"Skewness = {close_skewness}")
print('\n')

Kurtosis = 26.66757703012294
Skewness = -0.24866487024295647




In [None]:
# Square root of the AAPL close (the variable is called df_log, but no
# logarithm is taken), followed by its first difference without the leading NaN.
df_log = AAPL['Close'].pipe(np.sqrt)
df_diff = df_log.diff().dropna()

In [None]:
# ADF test on the differenced series with constant + trend regression
adf = adfuller(df_diff, regression='ct')
print(f'ADF Statistic: {adf[0]}')
print('p-value: %.50f' % adf[1])
print(f'Critical Values: {adf[4]}')
print('\n')

# KPSS test with the same regression. The result goes into kpss_result so the
# imported kpss function is not overwritten and the cell can be run again.
kpss_result = kpss(df_diff, regression='ct')
print(f'KPSS Statistic: {kpss_result[0]}')
print('p-value: %f' % kpss_result[1])
print(f'Critical Values: {kpss_result[3]}')
print('\n')

ADF Statistic: -14.310691100528855
p-value: 0.00000000000000000000060173467569840964474096374744
Critical Values: {'1%': -3.959974372698332, '5%': -3.4110739896805993, '10%': -3.127393899246974}


KPSS Statistic: 0.1452810706670614
p-value: 0.051331
Critical Values: {'10%': 0.119, '5%': 0.146, '2.5%': 0.176, '1%': 0.216}




In [None]:
# plt.figure(figsize=(16,10))
# sns.distplot(diff.Close)
# # AAPL.Close.tail(365).plot()

In [None]:
# plt.figure(figsize=(16,6))
# # sns.distplot(df_diff.tail(365))
# sns.boxplot(diff.Close)

In [None]:
# Show the AAPL frame as this cell's output.
AAPL

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,company_name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1992-01-02,0.497768,0.533482,0.495536,0.531250,0.433689,233632000,APPLE
1992-01-03,0.535714,0.537946,0.520089,0.526786,0.430045,190254400,APPLE
1992-01-06,0.524554,0.526786,0.515625,0.517857,0.422755,114240000,APPLE
1992-01-07,0.513393,0.531250,0.513393,0.527902,0.430956,141467200,APPLE
1992-01-08,0.522321,0.546875,0.522321,0.540179,0.440978,232747200,APPLE
...,...,...,...,...,...,...,...
2021-12-27,177.089996,180.419998,177.070007,180.330002,179.836319,74919600,APPLE
2021-12-28,180.160004,181.330002,178.529999,179.289993,178.799164,79144300,APPLE
2021-12-29,179.330002,180.630005,178.139999,179.380005,178.888916,62348900,APPLE
2021-12-30,179.470001,180.570007,178.089996,178.199997,177.712143,59773000,APPLE


In [None]:
# data_split() yields the first 80% of the outlier-filtered rows as `train`
# and the remaining rows as `test`; pred() returns the Prediction object whose
# model methods are called in the cells below.
pred = Meong(df=mew, ratio=0.2)
train, test = pred.data_split()
predict = pred.pred()

In [None]:
# Score the LightGBM predictions against test.Close
lgbm_forecast = predict.lgbm()
smape, rmse, mae = acc(test.Close, lgbm_forecast)
for template, value in (
    ('Test sMAPE: %.5f', smape),
    ('Test MAE: %.5f', mae),
    ('Test RMSE: %.5f', rmse),
):
    print(template % value)
print('\n')

[1]	valid_0's l1: 0.00902765
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l1: 0.00902765
[3]	valid_0's l1: 0.00902765
[4]	valid_0's l1: 0.00902765
[5]	valid_0's l1: 0.00902765
[6]	valid_0's l1: 0.00902765
[7]	valid_0's l1: 0.00902765
[8]	valid_0's l1: 0.00902765
[9]	valid_0's l1: 0.00902765
[10]	valid_0's l1: 0.00902765
[11]	valid_0's l1: 0.00902737
[12]	valid_0's l1: 0.00902715
[13]	valid_0's l1: 0.00902696
[14]	valid_0's l1: 0.0090268
[15]	valid_0's l1: 0.00902667
[16]	valid_0's l1: 0.00902655
[17]	valid_0's l1: 0.00902644
[18]	valid_0's l1: 0.00902635
[19]	valid_0's l1: 0.00902627
[20]	valid_0's l1: 0.0090262
[21]	valid_0's l1: 0.00902613
[22]	valid_0's l1: 0.00902607
[23]	valid_0's l1: 0.00902601
[24]	valid_0's l1: 0.00902596
[25]	valid_0's l1: 0.00902591
[26]	valid_0's l1: 0.00902587
[27]	valid_0's l1: 0.00902583
[28]	valid_0's l1: 0.00902579
[29]	valid_0's l1: 0.00902576
[30]	valid_0's l1: 0.00902573
[31]	valid_0's l1: 0.00902577
[32]	valid_0's l1: 

In [None]:
# Score the CatBoost predictions against test.Close
catboost_forecast = predict.catboost()
smape, rmse, mae = acc(test.Close, catboost_forecast)
for template, value in (
    ('Test sMAPE: %.10f', smape),
    ('Test MAE: %.10f', mae),
    ('Test RMSE: %.10f', rmse),
):
    print(template % value)
print('\n')

Learning rate set to 0.063521
0:	learn: 0.0164666	test: 0.0167961	best: 0.0167961 (0)	total: 47.4ms	remaining: 47.4s
1:	learn: 0.0154933	test: 0.0157929	best: 0.0157929 (1)	total: 49.1ms	remaining: 24.5s
2:	learn: 0.0145894	test: 0.0148686	best: 0.0148686 (2)	total: 51.2ms	remaining: 17s
3:	learn: 0.0137493	test: 0.0140019	best: 0.0140019 (3)	total: 53.6ms	remaining: 13.3s
4:	learn: 0.0129510	test: 0.0131750	best: 0.0131750 (4)	total: 55.6ms	remaining: 11.1s
5:	learn: 0.0121946	test: 0.0124066	best: 0.0124066 (5)	total: 57.6ms	remaining: 9.55s
6:	learn: 0.0114736	test: 0.0116724	best: 0.0116724 (6)	total: 59.1ms	remaining: 8.38s
7:	learn: 0.0108175	test: 0.0110004	best: 0.0110004 (7)	total: 60.5ms	remaining: 7.5s
8:	learn: 0.0101985	test: 0.0103705	best: 0.0103705 (8)	total: 61.8ms	remaining: 6.81s
9:	learn: 0.0096104	test: 0.0097584	best: 0.0097584 (9)	total: 63.3ms	remaining: 6.27s
10:	learn: 0.0090706	test: 0.0092141	best: 0.0092141 (10)	total: 64.8ms	remaining: 5.83s
11:	learn: 0.0

In [None]:
# Score the XGBoost predictions against test.Close
xgboost_forecast = predict.xgboost()
smape, rmse, mae = acc(test.Close, xgboost_forecast)
for template, value in (
    ('Test sMAPE: %.10f', smape),
    ('Test MAE: %.10f', mae),
    ('Test RMSE: %.10f', rmse),
):
    print(template % value)
print('\n')

[0]	validation_0-rmse:0.449382
Will train until validation_0-rmse hasn't improved in 50 rounds.
[1]	validation_0-rmse:0.40455
[2]	validation_0-rmse:0.364186
[3]	validation_0-rmse:0.327793
[4]	validation_0-rmse:0.295043
[5]	validation_0-rmse:0.265597
[6]	validation_0-rmse:0.23908
[7]	validation_0-rmse:0.215245
[8]	validation_0-rmse:0.193748
[9]	validation_0-rmse:0.174444
[10]	validation_0-rmse:0.157032
[11]	validation_0-rmse:0.141405
[12]	validation_0-rmse:0.127308
[13]	validation_0-rmse:0.114632
[14]	validation_0-rmse:0.103219
[15]	validation_0-rmse:0.09297
[16]	validation_0-rmse:0.083729
[17]	validation_0-rmse:0.075402
[18]	validation_0-rmse:0.067936
[19]	validation_0-rmse:0.061194
[20]	validation_0-rmse:0.055114
[21]	validation_0-rmse:0.049669
[22]	validation_0-rmse:0.044762
[23]	validation_0-rmse:0.040354
[24]	validation_0-rmse:0.036369
[25]	validation_0-rmse:0.032802
[26]	validation_0-rmse:0.029595
[27]	validation_0-rmse:0.026716
[28]	validation_0-rmse:0.024116
[29]	validation_0-rm

In [None]:
# smape, rmse, mae = acc(test.Close, ada)
# # print(company.company_name[1])
# print('Test sMAPE: %.3f' % smape)
# print('Test MAE: %.3f' % mae)
# print('Test RMSE: %.3f' % rmse)
# print('\n')