In [179]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [2]:
df_train = pd.read_csv('option_train.csv')
df_test = pd.read_csv('option_test_wolabel.csv')

In [385]:
df_train

Unnamed: 0,Value,S,K,tau,r,BS
0,21.670404,431.623898,420,0.341270,0.03013,0
1,0.125000,427.015526,465,0.166667,0.03126,1
2,20.691244,427.762336,415,0.265873,0.03116,0
3,1.035002,451.711658,460,0.063492,0.02972,1
4,39.553020,446.718974,410,0.166667,0.02962,0
...,...,...,...,...,...,...
1675,1.470000,444.397163,475,0.273810,0.03034,1
1676,34.927317,438.453825,410,0.301587,0.02972,0
1677,41.127997,428.042219,390,0.170635,0.03188,0
1678,0.440003,439.081203,480,0.293651,0.02962,1


#### Regression Dataset

In [3]:
df_train.loc[df_train['BS']=='Under','BS']=0
df_train.loc[df_train['BS']=='Over','BS']=1
X_train = df_train.iloc[:, 1:5]
y_train = df_train.iloc[:, 0]

In [6]:
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)

In [7]:
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)

##### Model Performance

In [131]:
models = [LinearRegression(),KNeighborsRegressor(),DecisionTreeRegressor(), RandomForestRegressor(),xgb.XGBRegressor()]

In [132]:
list = ['LinearRegression','KNN','DecisionTree','RandomForest','XGBoost']

In [186]:
kfolds10_regresssion = KFold(n_splits=10, random_state=0, shuffle=True)

In [187]:
kfolds5_regresssion = KFold(n_splits=5, random_state=0, shuffle=True)

In [188]:
result = []
for model in models:
    value1 = np.mean(cross_val_score(model.fit(X_train_norm,y_train), X_train_norm,
                        y_train,cv=kfolds10_regresssion))
    value2 = np.mean(cross_val_score(model.fit(X_train_norm,y_train), X_train_norm,
                        y_train,cv=kfolds5_regresssion))
    value3 = np.mean(cross_val_score(model.fit(X_train_std,y_train), X_train_norm,
                        y_train,cv=kfolds10_regresssion))
    value4 = np.mean(cross_val_score(model.fit(X_train_std,y_train), X_train_norm,
                        y_train,cv=kfolds5_regresssion))
    value5 = np.mean(cross_val_score(model.fit(X_train.values,y_train), X_train_norm,
                        y_train,cv=kfolds10_regresssion))
    value6 = np.mean(cross_val_score(model.fit(X_train.values,y_train), X_train_norm,
                        y_train,cv=kfolds5_regresssion))
    result.append({'Model': model,
                           '10 folds Norm': value1,
                            '5 folds Norm':value2,
                            '10 folds Std': value3,
                            '5 folds Std':value4,
                            '10 folds':value5,
                            '5 folds':value6})
model_performance = pd.DataFrame(result)

In [189]:
model_performance.iloc[:,0] = ['Linear Regression','KNN','Decision Tree','Random Forest',
                              'XGBoost']
model_performance

Unnamed: 0,Model,10 folds Norm,5 folds Norm,10 folds Std,5 folds Std,10 folds,5 folds
0,Linear Regression,0.910783,0.911024,0.910783,0.911024,0.910783,0.911024
1,KNN,0.973689,0.971801,0.973689,0.971801,0.973689,0.971801
2,Decision Tree,0.992295,0.99124,0.992327,0.991318,0.992094,0.991357
3,Random Forest,0.996324,0.995986,0.996373,0.995878,0.996361,0.995903
4,XGBoost,0.998208,0.997947,0.998208,0.997947,0.998208,0.997947


#### GridSearch for XGboost

In [371]:
gbm_param_grid = {
     'colsample_bytree': np.linspace(0.5, 0.9, 5),
     'n_estimators':[1000,1200],
     'max_depth': [2,3,4]
}

In [372]:
XGB_reg = xgb.XGBRegressor()
XGB_reg.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [373]:
grid_search = GridSearchCV(estimator = XGB_reg, param_grid = gbm_param_grid, scoring = 'neg_mean_squared_error', cv = 10, verbose = 1)

In [374]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0, gpu_id=-1,
                                    importance_type='gain',
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs=4,
                                    num_parallel_tree=1, random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', validate_parameters=1,
                                    verbosity=None),
     

In [375]:
grid_search.best_params_

{'colsample_bytree': 0.8, 'max_depth': 3, 'n_estimators': 1200}

In [376]:
final_prediction_model = grid_search.best_estimator_

In [377]:
X_test = df_test.iloc[:, 0:5]
X_test

Unnamed: 0,S,K,tau,r
0,431.618600,460,0.293651,0.03147
1,432.633296,420,0.182540,0.03147
2,432.633296,430,0.182540,0.03147
3,431.618600,415,0.293651,0.03147
4,434.772855,420,0.043651,0.03147
...,...,...,...,...
1115,440.067417,435,0.182540,0.02962
1116,439.081203,485,0.293651,0.02962
1117,439.081203,475,0.293651,0.02962
1118,442.490015,420,0.043651,0.02962


In [378]:
final_reg_predictions = final_prediction_model.predict(df_test)
final_reg_predictions

array([ 1.4864992, 17.880358 , 10.839184 , ...,  1.1317836, 24.381945 ,
       16.417355 ], dtype=float32)

In [379]:
final_prediction = pd.DataFrame(final_reg_predictions,
             columns=['Value'])

In [380]:
final_prediction

Unnamed: 0,Value
0,1.486499
1,17.880358
2,10.839184
3,24.284557
4,15.704806
...,...
1115,12.077401
1116,0.588863
1117,1.131784
1118,24.381945


In [381]:
value = XGB_reg.predict(df_test)
final = pd.DataFrame(value,
             columns=['Value'])
final

Unnamed: 0,Value
0,1.544463
1,17.378826
2,11.381046
3,23.616793
4,15.928064
...,...
1115,12.127989
1116,0.299507
1117,0.673628
1118,24.258938


#### RMSE

In [382]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'],cvres['params']):
    print(np.sqrt(-mean_score),params)

0.5266189762985947 {'colsample_bytree': 0.5, 'max_depth': 2, 'n_estimators': 1000}
0.504594949755119 {'colsample_bytree': 0.5, 'max_depth': 2, 'n_estimators': 1200}
0.45787960355245777 {'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 1000}
0.44934549071377977 {'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 1200}
0.46181123537797863 {'colsample_bytree': 0.5, 'max_depth': 4, 'n_estimators': 1000}
0.4553844549437876 {'colsample_bytree': 0.5, 'max_depth': 4, 'n_estimators': 1200}
0.5266189762985947 {'colsample_bytree': 0.6, 'max_depth': 2, 'n_estimators': 1000}
0.504594949755119 {'colsample_bytree': 0.6, 'max_depth': 2, 'n_estimators': 1200}
0.45787960355245777 {'colsample_bytree': 0.6, 'max_depth': 3, 'n_estimators': 1000}
0.44934549071377977 {'colsample_bytree': 0.6, 'max_depth': 3, 'n_estimators': 1200}
0.46181123537797863 {'colsample_bytree': 0.6, 'max_depth': 4, 'n_estimators': 1000}
0.4553844549437876 {'colsample_bytree': 0.6, 'max_depth': 4, 'n_estimators': 1200}


In [386]:
xgb_reg = RF_reg.fit(X_train,y_train)
xgv_cv = cross_val_score(xgb_reg,X_train,y_train, scoring='neg_mean_squared_error',cv=kfolds10_regresssion)
score = np.sqrt(-xgv_cv)

In [387]:
score.mean()

0.8215915573354072