In [None]:
# XGBoost is a scalable, distributed gradient-boosted decision tree (GBDT) machine learning library

In [None]:
%pip install scikit-learn xgboost

In [121]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objs as go

In [124]:
# Download daily GOOG prices from 2011-01-01 up to the start of 2022
# (roughly 11 years of history) and keep only the closing price.
his_data = yf.download("GOOG", start="2011-01-01", end="2022-1-1")
ten_yr_closing = his_data['Close']
# NOTE(review): recent yfinance releases can return (Price, Ticker) MultiIndex
# columns even for a single ticker, in which case 'Close' selects a one-column
# DataFrame rather than a Series and `.to_frame()` would raise AttributeError.
# Collapse that case to a Series so the rest of the notebook sees one shape.
if isinstance(ten_yr_closing, pd.DataFrame):
    ten_yr_closing = ten_yr_closing.iloc[:, 0]
# Make sure the resulting column is always called 'Close' (a no-op when the
# Series already carries that name); later cells access `df.Close`.
ten_yr_closing = ten_yr_closing.rename("Close")
df = ten_yr_closing.to_frame()
df.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2010-12-31,14.793799
2011-01-03,15.05233
2011-01-04,14.996788
2011-01-05,15.16989
2011-01-06,15.280226


In [126]:
# Build the supervised-learning target: each row's Target is the Close of the
# following row (i.e. the next day's closing price in the downloaded series).
# X = Close, y = Target
# The guard keeps this cell idempotent: without it, every re-run would recompute
# the shift on the already-trimmed frame and silently drop one more trailing row.
if "Target" not in df.columns:
    # shift(-1) pulls the next row's Close up by one position; the final row has
    # no successor, so its Target is NaN and dropna() removes that row.
    df = df.assign(Target=df["Close"].shift(-1)).dropna()
df.head()

Unnamed: 0_level_0,Close,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-12-31,14.793799,15.05233
2011-01-03,15.05233,14.996788
2011-01-04,14.996788,15.16989
2011-01-05,15.16989,15.280226
2011-01-06,15.280226,15.353451


### Univariate Model with regression (Without hyperparameter tuning)

In [127]:
# Train Test Split 
def train_test_split(data:pd.DataFrame,perc_train:float,perc_test:float) -> (np.array, np.array):
    """Split a frame into a leading train block and a trailing test block.

    Rows keep their existing order (no shuffling): the first
    ``int(len(data) * perc_train)`` rows form the training set and all
    remaining rows form the test set.

    Args:
        data: Frame to split; it is converted to a NumPy array, so column
            labels are dropped.
        perc_train: Fraction of the rows assigned to the training set.
        perc_test: Accepted for symmetry with ``perc_train`` but not used;
            the test set is always whatever is left after the training rows.

    Returns:
        ``(train, test)`` as two 2-D NumPy arrays.
    """
    values = data.values  # plain ndarray, column names discarded
    cutoff = int(values.shape[0] * perc_train)
    train_rows = values[:cutoff, :]
    test_rows = values[cutoff:, :]
    return train_rows, test_rows

In [128]:
# Model Training with Prediction
def uni_model_pred(train_data:np.array, test_x:np.array) -> float:
    """Fit an XGBoost regressor on ``train_data`` and predict one sample.

    Args:
        train_data: 2-D array (or a sequence of equal-length rows) whose last
            column is the regression target and whose preceding column(s) are
            the features.
        test_x: Feature value(s) of the single sample to predict; a scalar or
            a 1-D array, reshaped internally to one row.

    Returns:
        The model's prediction for ``test_x`` as a Python float.
    """
    from xgboost import XGBRegressor

    # Accept a list of row arrays as well as an ndarray: a plain list cannot be
    # indexed with [:, :-1].
    train_data = np.asarray(train_data)
    train_X = train_data[:, :-1]
    train_y = train_data[:, -1]

    # Squared-error objective: the fit minimises the squared residuals.
    model = XGBRegressor(objective="reg:squarederror")
    model.fit(train_X, train_y)

    # Predict: the model expects a 2-D (n_samples, n_features) input, so the
    # single sample is reshaped to exactly one row.
    test_x = np.asarray(test_x).reshape(1, -1)
    pred = model.predict(test_x)  # previously missing, so `pred` was undefined

    return float(pred[0])


In [133]:
def rolling_validate(train_data:np.array, test_data:np.array):
    """Walk-forward validation with an expanding training window.

    For test row ``i`` a fresh model is trained on every training row plus
    test rows ``0 .. i-1`` and asked to predict row ``i``'s target, so the
    last prediction comes from a model trained on all data up to the
    second-to-last data point.

    Args:
        train_data: 2-D array (or sequence of rows); features first, target in
            the last column.
        test_data: 2-D array with the same column layout as ``train_data``.

    Returns:
        ``(mae, actual, prediction)``: the mean absolute error over the test
        rows, the true targets (last column of ``test_data``), and the list of
        per-row predictions.

    Raises:
        ValueError: If ``test_data`` contains no rows.
    """
    from sklearn.metrics import mean_absolute_error

    train_data = np.asarray(train_data)
    test_data = np.asarray(test_data)
    if len(test_data) == 0:
        raise ValueError("test_data must contain at least one row")

    # Stack once and slice a growing prefix per step. The model always gets a
    # real 2-D ndarray (a Python list of rows cannot be indexed with [:, :-1]).
    history = np.vstack([train_data, test_data])
    n_train = len(train_data)

    prediction = []
    for i in range(len(test_data)):
        # Train on everything strictly before test row i, then predict row i
        # from its feature column(s) (all columns except the target).
        pred = uni_model_pred(history[:n_train + i], test_data[i, :-1])
        prediction.append(pred)

    # Evaluate: the metric is the mean absolute error, not a squared error.
    actual = test_data[:, -1]
    mae = mean_absolute_error(actual, prediction)

    return mae, actual, prediction

### Multivariate (With hyperparameter tuning)

In [None]:
# Train Validation Test Split [!!! np.array]
def train_valid_test_split(data,perc_train,perc_valid,perc_test):
    """Split a frame into consecutive train / validation / test blocks.

    Rows keep their existing order. The first ``int(len(data) * perc_train)``
    rows are the training set, the next ``int(len(data) * perc_valid)`` rows
    are the validation set, and all remaining rows are the test set.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` slicing
            (e.g. a pandas DataFrame).
        perc_train: Fraction of rows assigned to the training set.
        perc_valid: Fraction of rows assigned to the validation set.
        perc_test: Accepted for symmetry but not used; the test set is always
            the remainder after the train and validation rows.

    Returns:
        ``(train, valid, test)`` as three ``.iloc`` slices of ``data``.
    """
    n_rows = len(data)
    train_end = int(n_rows * perc_train)
    valid_end = train_end + int(n_rows * perc_valid)

    train_part = data.iloc[:train_end, :]
    valid_part = data.iloc[train_end:valid_end, :]
    test_part = data.iloc[valid_end:, :]
    return train_part, valid_part, test_part