In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the labelled training data from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer='option_train.csv')
df.head(5)

In [None]:
# Class balance check: number of non-null `S` values for each `BS` label.
df.groupby('BS')['S'].count()

# Data Cleaning

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Encode the label as a 0/1 dummy: 1 for 'Over', 0 for any other label.
# Rows with a missing label are dropped first; otherwise NaN fails the
# == 'Over' comparison, is silently encoded as 0, and then survives the
# later dropna() as a wrongly labelled sample.
# NOTE: not idempotent - re-running this cell once BS is numeric maps every row to 0.
df = (
    df.dropna(subset=['BS'])
      .assign(BS=lambda frame: (frame['BS'] == 'Over').astype(int))
)
df.head()

In [None]:
# (rows, columns) of the frame at this point, for comparison with later shape checks.
df.shape

In [None]:
# Discard every row that has a missing value in any column, then re-check the size.
df = df.dropna(axis='index', how='any')
df.shape

In [None]:
# Remove outliers in a single filter: rows whose tau is 250 or 146,
# and rows whose S is 0.
df = df[~df['tau'].isin([250, 146]) & (df['S'] != 0)]
df.shape

# Version 1

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Features are the columns S, K, tau and r; the target is the BS column.
X = df[['S', 'K', 'tau', 'r']]
y = df['BS']

# Hold out 20% of the rows as a test split, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
# transform, not fit_transform: re-fitting on the test split would scale it with
# its own mean/std, putting train and test on different scales and leaving
# `scaler` fitted on the test data instead of the training data.
scaled_X_test = scaler.transform(X_test)

In [None]:
# Sanity check: shape of the scaled training features, then of the training labels.
print(scaled_X_train.shape, y_train.shape, sep='\n')

# Test Models

In [None]:
from sklearn.model_selection import cross_val_score,KFold ## for regression
from sklearn.model_selection import StratifiedKFold ## recommended for classification

In [None]:
# importing the modules
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Parenthesised so the name list can span two lines; a bare line break after the
# trailing comma is a SyntaxError.
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.utils import class_weight
from sklearn.neural_network import MLPClassifier

### Logistic Regression

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)

# Only solver/penalty pairs that scikit-learn supports are listed: newton-cg and
# lbfgs accept l2 only, liblinear accepts both l1 and l2. Pairing l1 with the
# first two makes those fits fail (scored as NaN under GridSearchCV's default
# error_score) instead of being real candidates.
log_param = [{'penalty': ['l1', 'l2'],
              'C': np.logspace(-3, 3, 7),
              'solver': ['liblinear']},
             {'penalty': ['l2'],
              'C': np.logspace(-3, 3, 7),
              'solver': ['newton-cg', 'lbfgs']}]

log = LogisticRegression()
log_cv = GridSearchCV(log, log_param, cv = kf5, refit=True, verbose=3)
log_cv.fit(X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(log_cv.best_score_)
print(log_cv.best_params_)

In [None]:
# Logistic regression with hard-coded hyper-parameters (presumably the winners
# of the grid search - verify against its printed output).
log = LogisticRegression(solver='liblinear', penalty='l1', C=10)
cv = cross_val_score(estimator=log, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = log.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(log.score(X_test, y_test))

### Random Forest

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Grid over tree depth, leaf/split sizes and number of trees.
rf_param = {'max_depth': np.arange(2, 11),
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': np.arange(50, 201, 50)}

rf = RandomForestClassifier()
rf_cv = GridSearchCV(rf, rf_param, cv=kf5, refit=True, verbose=3)
rf_cv.fit(X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(rf_cv.best_score_)
print(rf_cv.best_params_)

In [None]:
# Random forest with hard-coded hyper-parameters (presumably the grid-search
# winners - verify against its printed output).
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=9,
    min_samples_split=5,
    min_samples_leaf=2,
)
cv = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = rf.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(rf.score(X_test, y_test))

### Gradient Boosting

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Grid over learning rate, tree shape and number of boosting stages.
gb_param = dict(
    learning_rate=[0.001, 0.01, 0.1],
    max_depth=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=[10, 100],
)

gb = GradientBoostingClassifier()
gb_cv = GridSearchCV(estimator=gb, param_grid=gb_param, cv=kf5, refit=True, verbose=3)
gb_cv.fit(X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(gb_cv.best_score_, gb_cv.best_params_, sep='\n')

In [None]:
# Gradient boosting with hard-coded hyper-parameters (presumably the grid-search
# winners - verify against its printed output).
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=2,
)
cv = cross_val_score(estimator=gb, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = gb.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(gb.score(X_test, y_test))

### SVM

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Grid over the regularisation strength C and the RBF kernel width gamma.
svm_param = {'C': [0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel': ['rbf']}

svm = SVC()
svm_cv = GridSearchCV(svm, svm_param, cv=kf5, refit=True, verbose=3)
svm_cv.fit(X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(svm_cv.best_score_)
print(svm_cv.best_params_)

In [None]:
# RBF support-vector classifier with hard-coded hyper-parameters (presumably the
# grid-search winners - verify against its printed output).
svm = SVC(kernel='rbf', C=1000, gamma=0.001)
cv = cross_val_score(estimator=svm, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = svm.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(svm.score(X_test, y_test))

### Xgboost

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Grid over XGBoost's child weight, split penalty (gamma), row/column sampling and depth.
kb_param = {'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5]}

kb = XGBClassifier()
kb_cv = GridSearchCV(kb, kb_param, cv=kf5, refit=True, verbose=3)
kb_cv.fit(X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(kb_cv.best_score_)
print(kb_cv.best_params_)

In [None]:
# XGBoost with hard-coded hyper-parameters (presumably the grid-search winners -
# verify against its printed output).
kb = XGBClassifier(
    max_depth=3,
    min_child_weight=1,
    gamma=2,
    subsample=0.8,
    colsample_bytree=1.0,
)
cv = cross_val_score(estimator=kb, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = kb.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(kb.score(X_test, y_test))

### KNN

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Try every neighbourhood size from 3 to 12 inclusive.
knn_param = {'n_neighbors': list(range(3, 13))}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param, cv=kf5, refit=True, verbose=3)
knn_cv.fit(X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(knn_cv.best_score_, knn_cv.best_params_, sep='\n')

In [None]:
# k-nearest-neighbours with a hard-coded k (presumably the grid-search winner -
# verify against its printed output).
knn = KNeighborsClassifier(n_neighbors=7)
cv = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(knn.score(X_test, y_test))

### Neural Network

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Activation names are case-sensitive in scikit-learn: "tanh", not "Tanh"
# (the capitalised spelling is rejected, so tanh was never actually evaluated).
# NOTE(review): (100, k) means two hidden layers of 100 and k units - confirm
# that is intended rather than k layers of 100 units.
# NOTE(review): scikit-learn documents learning_rate as used only with
# solver='sgd'; with the default solver this axis likely has no effect - confirm.
nn_param = {'learning_rate': ["constant", "invscaling", "adaptive"],
            'hidden_layer_sizes': [(100,1), (100,2), (100,3)],
            'activation': ["logistic", "relu", "tanh"]}

nn = MLPClassifier()
nn_cv = GridSearchCV(nn, nn_param, cv=kf5, refit=True, verbose=3)
nn_cv.fit(X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(nn_cv.best_score_)
print(nn_cv.best_params_)

In [None]:
# Multi-layer perceptron with hard-coded hyper-parameters (presumably the
# grid-search winners - verify against its printed output).
nn = MLPClassifier(
    hidden_layer_sizes=(100, 2),
    activation='logistic',
    learning_rate='invscaling',
)
cv = cross_val_score(estimator=nn, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = nn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(nn.score(X_test, y_test))

### Decision Tree

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Grid over tree depth (2..10) and the minimum leaf / split sizes.
dt_param = dict(
    max_depth=np.arange(2, 11),
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

dt = DecisionTreeClassifier()
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param, cv=kf5, refit=True, verbose=3)
dt_cv.fit(X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(dt_cv.best_score_, dt_cv.best_params_, sep='\n')

In [None]:
# Decision tree with hard-coded hyper-parameters (presumably the grid-search
# winners - verify against its printed output).
dt = DecisionTreeClassifier(
    max_depth=9,
    min_samples_split=2,
    min_samples_leaf=1,
)
cv = cross_val_score(estimator=dt, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = dt.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(dt.score(X_test, y_test))

### LGB

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Grid over tree depth, number of trees and the lambda_l1 / lambda_l2 settings.
lgb_param = dict(
    max_depth=np.arange(2, 11),
    n_estimators=np.arange(50, 201, 50),
    lambda_l1=[0, 1, 1.5],
    lambda_l2=[0, 1],
)

LGB = lgb.LGBMClassifier()
lgb_cv = GridSearchCV(estimator=LGB, param_grid=lgb_param, cv=kf5, refit=True, verbose=3)
lgb_cv.fit(X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(lgb_cv.best_score_, lgb_cv.best_params_, sep='\n')

In [None]:
# LightGBM with hard-coded hyper-parameters (presumably the grid-search winners -
# verify against its printed output).
LGB = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=9,
    lambda_l1=0,
    lambda_l2=1,
)
cv = cross_val_score(estimator=LGB, X=X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the whole training split, then report on the held-out test split.
y_pred = LGB.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(LGB.score(X_test, y_test))

In [None]:
# The nine models for the unscaled features, each with hard-coded
# hyper-parameters. The order must match the 'Model' names in the summary table.
classifiers = [
    LogisticRegression(solver='liblinear', penalty='l1', C=10),
    KNeighborsClassifier(n_neighbors=7),
    SVC(kernel='rbf', C=1000, gamma=0.001),
    DecisionTreeClassifier(max_depth=9, min_samples_split=2, min_samples_leaf=1),
    RandomForestClassifier(
        n_estimators=200, max_depth=9, min_samples_split=5, min_samples_leaf=2,
    ),
    GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5,
        min_samples_split=2, min_samples_leaf=2,
    ),
    MLPClassifier(
        hidden_layer_sizes=(100, 2), activation='logistic', learning_rate='invscaling',
    ),
    XGBClassifier(
        max_depth=3, min_child_weight=1, gamma=2, subsample=0.8, colsample_bytree=1.0,
    ),
    lgb.LGBMClassifier(n_estimators=200, max_depth=9, lambda_l1=0, lambda_l2=1),
]

In [None]:
# One list per metric, filled in the same order as `classifiers`.
acc, pre, rec, f1 = [], [], [], []

for classifier in classifiers:
    # Train on the training split and predict the held-out test split.
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    acc += [accuracy_score(y_test, y_pred)]
    # Precision, recall and F1 use the 'weighted' average across classes.
    pre += [precision_score(y_test, y_pred, average='weighted')]
    rec += [recall_score(y_test, y_pred, average='weighted')]
    f1 += [f1_score(y_test, y_pred, average='weighted')]

In [None]:
# Summary table: one row per model, listed in the same order as `classifiers`,
# displayed from highest to lowest test accuracy.
models = pd.DataFrame(dict(
    Model=[
        'Logistic Regression', 'KNN', 'SVM', 'Decision Tree', 'Random Forest',
        'Gradient Boosting', 'Neural Network', 'Xgboost', 'LGB',
    ],
    Accuracy=acc,
    Precision=pre,
    Recall=rec,
    F1=f1,
))
models.sort_values('Accuracy', ascending=False)

# Standardize (Use scaled_X_train)

### Logistic Regression

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)

# Only solver/penalty pairs that scikit-learn supports are listed: newton-cg and
# lbfgs accept l2 only, liblinear accepts both l1 and l2. Pairing l1 with the
# first two makes those fits fail (scored as NaN under GridSearchCV's default
# error_score) instead of being real candidates.
log_param = [{'penalty': ['l1', 'l2'],
              'C': np.logspace(-3, 3, 7),
              'solver': ['liblinear']},
             {'penalty': ['l2'],
              'C': np.logspace(-3, 3, 7),
              'solver': ['newton-cg', 'lbfgs']}]

log = LogisticRegression()
log_cv = GridSearchCV(log, log_param, cv = kf5, refit=True, verbose=3)
log_cv.fit(scaled_X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(log_cv.best_score_)
print(log_cv.best_params_)

In [None]:
# Logistic regression on the standardised features, with hard-coded
# hyper-parameters (presumably the grid-search winners - verify).
log = LogisticRegression(solver='newton-cg', penalty='l2', C=1)
cv = cross_val_score(estimator=log, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = log.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(log.score(scaled_X_test, y_test))

### Xgboost

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Grid over XGBoost's child weight, split penalty (gamma), row/column sampling and depth.
kb_param = {'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5]}

kb = XGBClassifier()
kb_cv = GridSearchCV(kb, kb_param, cv=kf5, refit=True, verbose=3)
kb_cv.fit(scaled_X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(kb_cv.best_score_)
print(kb_cv.best_params_)

In [None]:
# XGBoost on the standardised features, with hard-coded hyper-parameters
# (presumably the grid-search winners - verify).
kb = XGBClassifier(
    max_depth=4,
    min_child_weight=1,
    gamma=1,
    subsample=0.6,
    colsample_bytree=0.8,
)
cv = cross_val_score(estimator=kb, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = kb.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(kb.score(scaled_X_test, y_test))

### Random Forest

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Grid over tree depth, leaf/split sizes and number of trees.
rf_param = {'max_depth': np.arange(2, 11),
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': np.arange(50, 201, 50)}

rf = RandomForestClassifier()
rf_cv = GridSearchCV(rf, rf_param, cv=kf5, refit=True, verbose=3)
rf_cv.fit(scaled_X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(rf_cv.best_score_)
print(rf_cv.best_params_)

In [None]:
# Random forest on the standardised features, with hard-coded hyper-parameters
# (presumably the grid-search winners - verify).
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
)
cv = cross_val_score(estimator=rf, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = rf.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(rf.score(scaled_X_test, y_test))

### Gradient Boosting

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Grid over learning rate, tree shape and number of boosting stages.
gb_param = dict(
    learning_rate=[0.001, 0.01, 0.1],
    max_depth=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=[10, 100],
)

gb = GradientBoostingClassifier()
gb_cv = GridSearchCV(estimator=gb, param_grid=gb_param, cv=kf5, refit=True, verbose=3)
gb_cv.fit(scaled_X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(gb_cv.best_score_, gb_cv.best_params_, sep='\n')

In [None]:
# Gradient boosting on the standardised features, with hard-coded
# hyper-parameters (presumably the grid-search winners - verify).
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=4,
)
cv = cross_val_score(estimator=gb, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = gb.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(gb.score(scaled_X_test, y_test))

### SVM

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Grid over the regularisation strength C and the RBF kernel width gamma.
svm_param = {'C': [0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel': ['rbf']}

svm = SVC()
svm_cv = GridSearchCV(svm, svm_param, cv=kf5, refit=True, verbose=3)
svm_cv.fit(scaled_X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(svm_cv.best_score_)
print(svm_cv.best_params_)

In [None]:
# RBF support-vector classifier on the standardised features, with hard-coded
# hyper-parameters (presumably the grid-search winners - verify).
svm = SVC(kernel='rbf', C=100, gamma=0.1)
cv = cross_val_score(estimator=svm, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = svm.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(svm.score(scaled_X_test, y_test))

### KNN

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Try every neighbourhood size from 3 to 12 inclusive.
knn_param = {'n_neighbors': list(range(3, 13))}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param, cv=kf5, refit=True, verbose=3)
knn_cv.fit(scaled_X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(knn_cv.best_score_, knn_cv.best_params_, sep='\n')

In [None]:
# k-nearest-neighbours on the standardised features, with a hard-coded k
# (presumably the grid-search winner - verify).
knn = KNeighborsClassifier(n_neighbors=9)
cv = cross_val_score(estimator=knn, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = knn.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(knn.score(scaled_X_test, y_test))

### Neural Network

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(n_splits = 5, shuffle = True)
# Activation names are case-sensitive in scikit-learn: "tanh", not "Tanh"
# (the capitalised spelling is rejected, so tanh was never actually evaluated).
# NOTE(review): (100, k) means two hidden layers of 100 and k units - confirm
# that is intended rather than k layers of 100 units.
# NOTE(review): scikit-learn documents learning_rate as used only with
# solver='sgd'; with the default solver this axis likely has no effect - confirm.
nn_param = {'learning_rate': ["constant", "invscaling", "adaptive"],
            'hidden_layer_sizes': [(100,1), (100,2), (100,3)],
            'activation': ["logistic", "relu", "tanh"]}

nn = MLPClassifier()
nn_cv = GridSearchCV(nn, nn_param, cv=kf5, refit=True, verbose=3)
nn_cv.fit(scaled_X_train, y_train)
# Best mean cross-validated score, then the parameters that produced it.
print(nn_cv.best_score_)
print(nn_cv.best_params_)

In [None]:
# Multi-layer perceptron on the standardised features, with hard-coded
# hyper-parameters (presumably the grid-search winners - verify).
nn = MLPClassifier(
    hidden_layer_sizes=(100, 3),
    activation='relu',
    learning_rate='constant',
)
cv = cross_val_score(estimator=nn, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = nn.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(nn.score(scaled_X_test, y_test))

### Decision Tree

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Grid over tree depth (2..10) and the minimum leaf / split sizes.
dt_param = dict(
    max_depth=np.arange(2, 11),
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

dt = DecisionTreeClassifier()
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param, cv=kf5, refit=True, verbose=3)
dt_cv.fit(scaled_X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(dt_cv.best_score_, dt_cv.best_params_, sep='\n')

In [None]:
# Decision tree on the standardised features, with hard-coded hyper-parameters
# (presumably the grid-search winners - verify).
dt = DecisionTreeClassifier(
    max_depth=9,
    min_samples_split=2,
    min_samples_leaf=2,
)
cv = cross_val_score(estimator=dt, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = dt.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(dt.score(scaled_X_test, y_test))

### LGB

In [None]:
# Shuffled, stratified 5-fold splitter; no random_state is set, so folds vary per run.
kf5 = StratifiedKFold(shuffle=True, n_splits=5)

# Grid over tree depth, number of trees and the lambda_l1 / lambda_l2 settings.
lgb_param = dict(
    max_depth=np.arange(2, 11),
    n_estimators=np.arange(50, 201, 50),
    lambda_l1=[0, 1, 1.5],
    lambda_l2=[0, 1],
)

LGB = lgb.LGBMClassifier()
lgb_cv = GridSearchCV(estimator=LGB, param_grid=lgb_param, cv=kf5, refit=True, verbose=3)
lgb_cv.fit(scaled_X_train, y_train)

# Best mean cross-validated score, then the parameters that produced it.
print(lgb_cv.best_score_, lgb_cv.best_params_, sep='\n')

In [None]:
# LightGBM on the standardised features, with hard-coded hyper-parameters
# (presumably the grid-search winners - verify).
LGB = lgb.LGBMClassifier(
    n_estimators=150,
    max_depth=4,
    lambda_l1=1,
    lambda_l2=1,
)
cv = cross_val_score(estimator=LGB, X=scaled_X_train, y=y_train, cv=kf5)
print(cv, 'mean: ', cv.mean())

# Fit on the scaled training split, then report on the scaled test split.
y_pred = LGB.fit(scaled_X_train, y_train).predict(scaled_X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(LGB.score(scaled_X_test, y_test))

### Final

In [None]:
# The nine models for the standardised features, each with hard-coded
# hyper-parameters. The order must match the 'Model' names in the summary table.
classifiers = [
    LogisticRegression(solver='newton-cg', penalty='l2', C=1),
    KNeighborsClassifier(n_neighbors=9),
    SVC(kernel='rbf', C=100, gamma=0.1),
    DecisionTreeClassifier(max_depth=9, min_samples_split=2, min_samples_leaf=2),
    RandomForestClassifier(
        n_estimators=50, max_depth=10, min_samples_split=5, min_samples_leaf=2,
    ),
    GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5,
        min_samples_split=2, min_samples_leaf=4,
    ),
    MLPClassifier(
        hidden_layer_sizes=(100, 3), activation='relu', learning_rate='constant',
    ),
    XGBClassifier(
        max_depth=4, min_child_weight=1, gamma=1, subsample=0.6, colsample_bytree=0.8,
    ),
    lgb.LGBMClassifier(n_estimators=150, max_depth=4, lambda_l1=1, lambda_l2=1),
]

In [None]:
# One list per metric, filled in the same order as `classifiers`.
acc, pre, rec, f1 = [], [], [], []

for classifier in classifiers:
    # Train on the scaled training split and predict the scaled test split.
    y_pred = classifier.fit(scaled_X_train, y_train).predict(scaled_X_test)
    acc += [accuracy_score(y_test, y_pred)]
    # Precision, recall and F1 use the 'weighted' average across classes.
    pre += [precision_score(y_test, y_pred, average='weighted')]
    rec += [recall_score(y_test, y_pred, average='weighted')]
    f1 += [f1_score(y_test, y_pred, average='weighted')]

In [None]:
# Summary table: one row per model, listed in the same order as `classifiers`,
# displayed from highest to lowest test accuracy.
models = pd.DataFrame(dict(
    Model=[
        'Logistic Regression', 'KNN', 'SVM', 'Decision Tree', 'Random Forest',
        'Gradient Boosting', 'Neural Network', 'Xgboost', 'LGB',
    ],
    Accuracy=acc,
    Precision=pre,
    Recall=rec,
    F1=f1,
))
models.sort_values('Accuracy', ascending=False)

# Version 2

In [None]:
from sklearn.preprocessing import StandardScaler

# Version 2 uses every labelled row: features S, K, tau, r and the BS target.
X = df[['S', 'K', 'tau', 'r']]
y = df['BS']

# Fit the scaler on the full feature matrix, then standardise that same matrix.
scaler = StandardScaler().fit(X)
scaled_X = scaler.transform(X)
print(scaled_X.shape)

In [None]:
from sklearn.model_selection import cross_val_score,KFold ## for regression
from sklearn.model_selection import StratifiedKFold ## recommended for classification

# importing the modules
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils import class_weight
from sklearn.neural_network import MLPClassifier

### Logistic Regression

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(n_splits = 10, shuffle = True)

# Only solver/penalty pairs that scikit-learn supports are listed: newton-cg and
# lbfgs accept l2 only, liblinear accepts both l1 and l2. Pairing l1 with the
# first two makes those fits fail (scored as NaN under GridSearchCV's default
# error_score) instead of being real candidates.
log_param = [{'penalty': ['l1', 'l2'],
              'C': np.logspace(-3, 3, 7),
              'solver': ['liblinear']},
             {'penalty': ['l2'],
              'C': np.logspace(-3, 3, 7),
              'solver': ['newton-cg', 'lbfgs']}]

log = LogisticRegression()
log_cv = GridSearchCV(log, log_param, cv = kf10, refit=True, verbose=3)
log_cv.fit(scaled_X, y)
# Best mean cross-validated score, then the parameters that produced it.
print(log_cv.best_score_)
print(log_cv.best_params_)

### Random Forest

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(n_splits = 10, shuffle = True)
# Grid over tree depth, leaf/split sizes and number of trees.
rf_param = {'max_depth': np.arange(2, 11),
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': np.arange(50, 201, 50)}

rf = RandomForestClassifier()
rf_cv = GridSearchCV(rf, rf_param, cv=kf10, refit=True, verbose=3)
rf_cv.fit(scaled_X, y)
# Best mean cross-validated score, then the parameters that produced it.
print(rf_cv.best_score_)
print(rf_cv.best_params_)

### Gradient Boosting

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(shuffle=True, n_splits=10)

# Grid over learning rate, tree shape and number of boosting stages.
gb_param = dict(
    learning_rate=[0.001, 0.01, 0.1],
    max_depth=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=[10, 100],
)

gb = GradientBoostingClassifier()
gb_cv = GridSearchCV(estimator=gb, param_grid=gb_param, cv=kf10, refit=True, verbose=3)
gb_cv.fit(scaled_X, y)

# Best mean cross-validated score, then the parameters that produced it.
print(gb_cv.best_score_, gb_cv.best_params_, sep='\n')

### SVM

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(n_splits = 10, shuffle = True)
# Grid over the regularisation strength C and the RBF kernel width gamma.
svm_param = {'C': [0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel': ['rbf']}

svm = SVC()
svm_cv = GridSearchCV(svm, svm_param, cv=kf10, refit=True, verbose=3)
svm_cv.fit(scaled_X, y)
# Best mean cross-validated score, then the parameters that produced it.
print(svm_cv.best_score_)
print(svm_cv.best_params_)

### Xgboost

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(n_splits = 10, shuffle = True)
# Grid over XGBoost's child weight, split penalty (gamma), row/column sampling and depth.
kb_param = {'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5]}

kb = XGBClassifier()
kb_cv = GridSearchCV(kb, kb_param, cv=kf10, refit=True, verbose=3)
kb_cv.fit(scaled_X, y)
# Best mean cross-validated score, then the parameters that produced it.
print(kb_cv.best_score_)
print(kb_cv.best_params_)

### KNN

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(shuffle=True, n_splits=10)

# Try every neighbourhood size from 3 to 12 inclusive.
knn_param = {'n_neighbors': list(range(3, 13))}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param, cv=kf10, refit=True, verbose=3)
knn_cv.fit(scaled_X, y)

# Best mean cross-validated score, then the parameters that produced it.
print(knn_cv.best_score_, knn_cv.best_params_, sep='\n')

### Neural Network

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(n_splits = 10, shuffle = True)
# Activation names are case-sensitive in scikit-learn: "tanh", not "Tanh"
# (the capitalised spelling is rejected, so tanh was never actually evaluated).
# NOTE(review): (100, k) means two hidden layers of 100 and k units - confirm
# that is intended rather than k layers of 100 units.
# NOTE(review): scikit-learn documents learning_rate as used only with
# solver='sgd'; with the default solver this axis likely has no effect - confirm.
nn_param = {'learning_rate': ["constant", "invscaling", "adaptive"],
            'hidden_layer_sizes': [(100,1), (100,2), (100,3)],
            'activation': ["logistic", "relu", "tanh"]}

nn = MLPClassifier()
nn_cv = GridSearchCV(nn, nn_param, cv=kf10, refit=True, verbose=3)
nn_cv.fit(scaled_X, y)
# Best mean cross-validated score, then the parameters that produced it.
print(nn_cv.best_score_)
print(nn_cv.best_params_)

### Decision Tree

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(shuffle=True, n_splits=10)

# Grid over tree depth (2..10) and the minimum leaf / split sizes.
dt_param = dict(
    max_depth=np.arange(2, 11),
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

dt = DecisionTreeClassifier()
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param, cv=kf10, refit=True, verbose=3)
dt_cv.fit(scaled_X, y)

# Best mean cross-validated score, then the parameters that produced it.
print(dt_cv.best_score_, dt_cv.best_params_, sep='\n')

### LGB

In [None]:
# Shuffled, stratified 10-fold splitter; no random_state is set, so folds vary per run.
kf10 = StratifiedKFold(shuffle=True, n_splits=10)

# Grid over tree depth, number of trees and the lambda_l1 / lambda_l2 settings.
lgb_param = dict(
    max_depth=np.arange(2, 11),
    n_estimators=np.arange(50, 201, 50),
    lambda_l1=[0, 1, 1.5],
    lambda_l2=[0, 1],
)

LGB = lgb.LGBMClassifier()
lgb_cv = GridSearchCV(estimator=LGB, param_grid=lgb_param, cv=kf10, refit=True, verbose=3)
lgb_cv.fit(scaled_X, y)

# Best mean cross-validated score, then the parameters that produced it.
print(lgb_cv.best_score_, lgb_cv.best_params_, sep='\n')

# Final model: XGBoost in version 2

In [None]:
# Load the unlabelled test set from the working directory and preview it.
test = pd.read_csv(filepath_or_buffer='option_test_wolabel.csv')
test.head(5)

In [None]:
# Standardise the unlabelled test features with the already-fitted scaler.
# The four feature columns are selected explicitly, in the order the scaler was
# fitted on, so an extra column in the CSV (e.g. an id) cannot break or
# misalign the transform.
# NOTE(review): assumes the CSV has columns S, K, tau and r - confirm.
X_test = scaler.transform(test.loc[:, ['S', 'K', 'tau', 'r']])
X_test

In [None]:
# Final model: XGBoost with hard-coded hyper-parameters (presumably the winners
# of the Version 2 grid search - verify), trained on all scaled labelled rows.
kb = XGBClassifier(
    max_depth=4,
    min_child_weight=1,
    gamma=1.5,
    subsample=0.6,
    colsample_bytree=0.8,
)
# Fit, then predict the 0/1 label for every row of the scaled test features.
y_pred = kb.fit(scaled_X, y).predict(X_test)

In [None]:
# Wrap the predictions in a single-column frame named 'BS' and preview it.
BS = pd.DataFrame({'BS': y_pred})
BS.head(5)

In [None]:
# Write the predictions to CSV; the row index is written as the first column.
BS.to_csv('BS_result.csv', index=True)