In [None]:
#Same across all Model Selection notebooks
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

#Hyperparameter search method, metric: "roc_auc"
from sklearn.model_selection import GridSearchCV
#metric
from sklearn.metrics import roc_auc_score as auc

#Different for different Model Selection notebooks
#preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
#model
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration.
# Lines tagged ">>> EDIT" are the values to change per experiment; the lists
# only enumerate the available choices, addressed by position.
# ---------------------------------------------------------------------------

# >>> EDIT: label used in the exported file name
model_name = "KNeighborsClassifier"

# Train files, test files and short dataset labels; positions line up.
train_files = ["DL_train.csv", "Morgan_train.csv", "Both_train.csv"]  # 0, 1, 2
test_files = ["DL_test.csv", "Morgan_test.csv", "Both_test.csv"]  # 0, 1, 2
dataset_names = ["DL", "Morgan", "Both"]  # 0, 1, 2

# >>> EDIT: index of the dataset to load
data_index = 1

# Available normalisers and their labels; positions line up.
norm_type = [StandardScaler(), MinMaxScaler()]  # 0, 1
norm_names = ["Standard-Scaler", "Min-Max"]

# >>> EDIT: index of the normaliser to use
norm_index = 0

# Candidate model classes, plus one default-constructed instance of each.
models = [KNeighborsClassifier]  # 0
model_calls = [model_cls() for model_cls in models]

# >>> EDIT: index of the model to use
model_index = 0

# >>> EDIT: hyperparameter grid; keys are "<pipeline step name>__<parameter>"
param_grid = {"model__n_neighbors": [1, 2]}



# 1. Import the train and test datasets *

In [None]:


# Labels for the selected dataset and normaliser (used in the export file name).
dataset_name = dataset_names[data_index]
norm_name = norm_names[norm_index]

# index_col=0 treats the first CSV column as the row index; drop that argument
# if the files do not carry an index column.
x_train = pd.read_csv(train_files[data_index], index_col=0)

# Split the "ACTIVE" target column off from the feature columns.
y_train = x_train["ACTIVE"]
x_train = x_train.drop(columns=["ACTIVE"])

x_test = pd.read_csv(test_files[data_index], index_col=0)

# 2. Setup Pipeline *

In [None]:
# Two-step pipeline:
#   "norm"  - the normaliser selected by norm_index
#   "model" - the default-constructed estimator selected by model_index
# The step names are the prefixes used by the keys of param_grid.
pipeline = Pipeline(
    steps=[
        ("norm", norm_type[norm_index]),
        ("model", model_calls[model_index]),
    ]
)

# 3. Setup Parameter Grid *

In [None]:
# Display the hyperparameter grid defined in the configuration cell.
param_grid

{'model__n_neighbors': [1, 2]}

# 4. Setup GridSearch

In [None]:
# Exhaustive grid search over param_grid: 10-fold cross-validation scored with
# the "roc_auc" scorer, run with n_jobs=-1.
score_metric = "roc_auc"
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=score_metric,
    cv=10,
    n_jobs=-1,
)


In [None]:
# Run the cross-validated grid search on the training data.
search.fit(X=x_train, y=y_train)

In [None]:
# Report the best mean CV score and the parameter combination that achieved it.
print(
    "Best parameter (CV score=%0.3f):" % search.best_score_,
    search.best_params_,
    sep="\n",
)

Best parameter (CV score=0.613):
{'model__n_neighbors': 2}


In [None]:
# Per-candidate CV results as a table, ordered by rank_test_score (best first).
search_res = pd.DataFrame(search.cv_results_).sort_values(by=["rank_test_score"])
search_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,1.212525,0.159372,43.175926,11.648849,2,{'model__n_neighbors': 2},0.620648,0.60356,0.617228,0.603787,0.591474,0.606271,0.645974,0.602952,0.620257,0.620391,0.613254,0.014331,1
0,1.582617,0.078955,49.625169,1.810173,1,{'model__n_neighbors': 1},0.591978,0.583405,0.586101,0.586522,0.571618,0.589088,0.622961,0.571035,0.591752,0.591946,0.588641,0.013594,2


# 5. Estimation of Model Performance based on CV in GridSearch

In [None]:
# Mean cross-validated score of the top-ranked candidate, as a plain float.
# .iloc[0] selects a single scalar explicitly: calling float() on a Series is
# deprecated in recent pandas and raises TypeError when several candidates tie
# for rank 1 (tied candidates share the same mean_test_score, so the first row
# is representative).
estimated_auc = float(
    search_res.query("rank_test_score == 1").mean_test_score.iloc[0]
)

# 6. Best Hyperparameter *

In [None]:
# "params" entries (one dict per candidate) of the row(s) ranked first.
parms = search_res.loc[search_res["rank_test_score"] == 1, "params"]

In [None]:
# Inspect the selected "params" Series.
parms

1    {'model__n_neighbors': 2}
Name: params, dtype: object

In [None]:
# Same selection as a raw object array of parameter dicts.
parms.values

array([{'model__n_neighbors': 2}], dtype=object)

In [None]:
# Extract the tuned neighbour count from the first best-ranked parameter dict.
n_neighbors = parms.iloc[0]["model__n_neighbors"]
n_neighbors

2

# 7. Train model on full train dataset

In [None]:
# Final estimator: the same scaler + model pipeline that was cross-validated,
# with the tuned n_neighbors. Building only the bare classifier would skip the
# normalisation step, so the fitted model would not be the one whose AUC was
# estimated by the grid search and would predict on unscaled features.
full_model = Pipeline(
    steps=[
        ("norm", norm_type[norm_index]),
        ("model", models[model_index](n_neighbors=n_neighbors)),
    ]
)

In [None]:
# Fit the final estimator on the complete training set.
full_model.fit(X=x_train, y=y_train)

# 8. Prediction score for test dataset

In [None]:
# Column 1 of predict_proba for every test row, as a Series named 1.
# NOTE(review): presumably column 1 is the ACTIVE=1 class; confirm against
# full_model.classes_.
pred_test = pd.Series(full_model.predict_proba(x_test)[:, 1], name=1)


In [None]:
# Preview the first predicted scores.
pred_test.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: 1, dtype: float64

# 9. Export result

In [None]:
# One-row, one-column frame holding the cross-validated AUC estimate.
result = pd.DataFrame(data=[estimated_auc])

In [None]:
# Preview the export frame.
result.head()

Unnamed: 0,0
0,0.613254


In [None]:
# Export layout: first row is the CV AUC estimate, the remaining rows are the
# test-set scores. The frame is rebuilt from its inputs instead of from
# `result` itself, so re-running this cell does not append the predictions a
# second time.
result = pd.concat([pd.DataFrame([estimated_auc]), pred_test])

In [None]:
# Preview the export frame.
result.head()

Unnamed: 0,0
0,0.613254
0,0.0
1,0.0
2,0.0
3,0.0


In [None]:
# Write the result to "<model>_<normaliser>_<dataset>_Result.csv" in the
# working directory.
result.to_csv(path_or_buf=f"{model_name}_{norm_name}_{dataset_name}_Result.csv")