In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import neighbors
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.model_selection import cross_val_score, GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn import datasets
from sklearn import metrics
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

## i. Load Data

In [2]:
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.8,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.3,188.78,130.77,1427.97,28,1,1542.0,0


In [3]:
train = train.drop(columns=['id'])

In [4]:
label = pd.DataFrame(train.label)
label.head()

Unnamed: 0,label
0,1
1,0
2,0
3,0
4,0


In [5]:
train = train.drop(columns=['label'])
train.head()

Unnamed: 0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul)
0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0
1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0
2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0
3,683.0,94.0,419.23,255.8,162.17,72.05,44.68,538.22,55,1,1443.0
4,1032.0,71.0,1102.72,480.27,625.3,188.78,130.77,1427.97,28,1,1542.0


In [6]:
test = pd.read_csv('test.csv')
test = test.drop(columns=['id'])
test.head()

Unnamed: 0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul)
0,2843.0,156.0,1358.52,730.78,637.85,127.06,94.82,1588.62,45,1,3256.0
1,437.0,137.0,509.43,268.05,243.07,390.86,98.24,1002.76,51,1,491.0
2,826.0,82.0,1232.22,493.42,744.08,516.28,320.15,2200.58,32,0,1381.0
3,861.0,50.0,1512.86,925.51,590.07,380.25,25.8,1929.1,50,0,1377.0
4,1160.0,157.0,890.42,403.91,489.53,266.92,87.63,1251.52,43,0,1844.0


## ii. Data Engineering

In [7]:
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
numerical_fea

['MO HLADR+ MFI (cells/ul)',
 'Neu CD64+MFI (cells/ul)',
 'CD3+T (cells/ul)',
 'CD8+T (cells/ul)',
 'CD4+T (cells/ul)',
 'NK (cells/ul)',
 'CD19+ (cells/ul)',
 'CD45+ (cells/ul)',
 'Age',
 'Sex 0M1F',
 'Mono CD64+MFI (cells/ul)']

In [8]:
category_fea = list(filter(lambda x: x not in numerical_fea,list(train.columns)))
category_fea

[]

In [9]:
# supply the dropped data.
train[numerical_fea] = train[numerical_fea].fillna(train[numerical_fea].mean())
train.isnull().sum()

MO HLADR+ MFI (cells/ul)    0
Neu CD64+MFI (cells/ul)     0
CD3+T (cells/ul)            0
CD8+T (cells/ul)            0
CD4+T (cells/ul)            0
NK (cells/ul)               0
CD19+ (cells/ul)            0
CD45+ (cells/ul)            0
Age                         0
Sex 0M1F                    0
Mono CD64+MFI (cells/ul)    0
dtype: int64

### split the dataset

In [10]:
train_x,val_x,train_y,val_y = train_test_split(train,label,test_size=0.2,random_state=0)

In [11]:
train_x.head()

Unnamed: 0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul)
75,1306.0,114.0,1298.62,647.53,608.43,254.15,90.64,1648.75,32,0,1589.0
42,602.0,98.0,770.62,499.81,276.71,214.92,76.34,1071.42,49,1,1566.0
22,860.0,30.0,782.02,362.71,365.66,318.91,110.73,1215.6,41,1,727.0
6,1517.0,76.0,813.73,541.23,276.28,258.81,82.81,1179.27,42,0,1784.0
61,1047.0,76.0,1529.16,664.01,860.33,249.25,155.9,1999.74,34,0,1154.0


In [12]:
train_y.head()

Unnamed: 0,label
75,0
42,0
22,0
6,0
61,0


## XGBClassifier

In [13]:
def use_XGBClassifier(model, train_x, train_y, val_x, val_y):
    model.fit(train_x,train_y)
    y_pred = model.predict(val_x)
    acc = model.score(val_x, val_y)
    classification_report = metrics.classification_report(val_y,y_pred)
    confusion_matrix = metrics.confusion_matrix(val_y,y_pred)
    print(f"\033[1m@avg of accuracy: \033[0m \n{acc}")
    print(f"\033[1m@classification_report: \033[0m\n{classification_report}")
    print(f"\033[1m@confusion_matrix: \033[0m\n{confusion_matrix}")
    return None

def grid_search(model, param, train_x, train_y, cv=5):
    gsearch = GridSearchCV(
        estimator = model,
        param_grid = param,
        scoring='roc_auc',
        n_jobs=4,
        cv=cv)
    gsearch.fit(train_x, train_y)
    best_param = gsearch.best_params_
    print("----- grid search -----")
    print(f"The best param: {best_param}")
    return best_param

def get_XGBClassifier(param_dict):
    """
    param_dict: save params of the XGBClassifier
    """
    lr = param_dict.get('learning_rate', 0.1)
    n_estimators = param_dict.get('n_estimators', 140)
    max_depth = param_dict.get('max_depth', 5)
    min_child_weight = param_dict.get('min_child_weight', 1)
    gamma = param_dict.get('gamma', 0)
    subsample = param_dict.get('subsample', 0.8)
    colsample_bytree = param_dict.get('colsample_bytree', 0.8)
    nthread = param_dict.get('nthread', 4)
    scale_pos_weight = param_dict.get('scale_pos_weight', 1)
    
    model = xgb.XGBClassifier(
        learning_rate =lr,
        n_estimators=n_estimators, 
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective= 'binary:logistic',
        nthread=nthread,
        scale_pos_weight=scale_pos_weight,
        eval_metric='logloss',
        seed=27)

    return model

In [14]:
param_dict = dict()
print("---- baseline -----")
model_baseline = get_XGBClassifier(param_dict=param_dict)
use_XGBClassifier(model=model_baseline, train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y)

---- baseline -----


  return f(*args, **kwargs)


[1m@avg of accuracy: [0m 
0.8333333333333334
[1m@classification_report: [0m
              precision    recall  f1-score   support

           0       0.81      1.00      0.90        13
           1       1.00      0.40      0.57         5

    accuracy                           0.83        18
   macro avg       0.91      0.70      0.73        18
weighted avg       0.86      0.83      0.81        18

[1m@confusion_matrix: [0m
[[13  0]
 [ 3  2]]


In [15]:
# param optimization
param_test1 = {
'max_depth':[i for i in range(3,10,2)],
'min_child_weight':[i for i in range(1,6,2)]
}
param_after_1 = grid_search(model=model_baseline, param=param_test1, train_x=train_x, train_y=train_y)
param_dict.update(param_after_1)
print(param_dict)

print("----- max_depth and min_child_weight optimization-----")
model_opt_1 = get_XGBClassifier(param_dict)
use_XGBClassifier(model=model_opt_1, train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y)

----- grid search -----
The best param: {'max_depth': 3, 'min_child_weight': 3}
{'max_depth': 3, 'min_child_weight': 3}
----- max_depth and min_child_weight optimization-----
[1m@avg of accuracy: [0m 
0.8888888888888888
[1m@classification_report: [0m
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        13
           1       1.00      0.60      0.75         5

    accuracy                           0.89        18
   macro avg       0.93      0.80      0.84        18
weighted avg       0.90      0.89      0.88        18

[1m@confusion_matrix: [0m
[[13  0]
 [ 2  3]]


  return f(*args, **kwargs)


In [16]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
param_after_3 = grid_search(model=model_opt_1, param=param_test3, train_x=train_x, train_y=train_y)
param_dict.update(param_after_3)
print(param_dict)

print("----- gamma optimization-----")
model_opt_3 = get_XGBClassifier(param_dict)
use_XGBClassifier(model=model_opt_3, train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y)

----- grid search -----
The best param: {'gamma': 0.0}
{'max_depth': 3, 'min_child_weight': 3, 'gamma': 0.0}
----- gamma optimization-----
[1m@avg of accuracy: [0m 
0.8888888888888888
[1m@classification_report: [0m
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        13
           1       1.00      0.60      0.75         5

    accuracy                           0.89        18
   macro avg       0.93      0.80      0.84        18
weighted avg       0.90      0.89      0.88        18

[1m@confusion_matrix: [0m
[[13  0]
 [ 2  3]]


  return f(*args, **kwargs)


In [17]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
param_after_4 = grid_search(model=model_opt_3, param=param_test4, train_x=train_x, train_y=train_y)
param_dict.update(param_after_4)
print(param_dict)

print("----- gamma optimization-----")
model_opt_4 = get_XGBClassifier(param_dict)
use_XGBClassifier(model=model_opt_4, train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y)

----- grid search -----
The best param: {'colsample_bytree': 0.6, 'subsample': 0.9}
{'max_depth': 3, 'min_child_weight': 3, 'gamma': 0.0, 'colsample_bytree': 0.6, 'subsample': 0.9}
----- gamma optimization-----
[1m@avg of accuracy: [0m 
0.8888888888888888
[1m@classification_report: [0m
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        13
           1       1.00      0.60      0.75         5

    accuracy                           0.89        18
   macro avg       0.93      0.80      0.84        18
weighted avg       0.90      0.89      0.88        18

[1m@confusion_matrix: [0m
[[13  0]
 [ 2  3]]


  return f(*args, **kwargs)


In [18]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
param_after_6 = grid_search(model=model_opt_4, param=param_test6, train_x=train_x, train_y=train_y)
param_dict.update(param_after_6)
print(param_dict)

print("----- gamma optimization-----")
model_opt_6 = get_XGBClassifier(param_dict)
use_XGBClassifier(model=model_opt_6, train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y)

----- grid search -----
The best param: {'reg_alpha': 1e-05}
{'max_depth': 3, 'min_child_weight': 3, 'gamma': 0.0, 'colsample_bytree': 0.6, 'subsample': 0.9, 'reg_alpha': 1e-05}
----- gamma optimization-----
[1m@avg of accuracy: [0m 
0.8888888888888888
[1m@classification_report: [0m
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        13
           1       1.00      0.60      0.75         5

    accuracy                           0.89        18
   macro avg       0.93      0.80      0.84        18
weighted avg       0.90      0.89      0.88        18

[1m@confusion_matrix: [0m
[[13  0]
 [ 2  3]]


  return f(*args, **kwargs)


In [20]:
best_model = get_XGBClassifier(param_dict)
best_model.fit(train_x,train_y)

pred1 = best_model.predict(train_x)
accuracy1 = metrics.accuracy_score(train_y,pred1)
print('acc on train: %.4f'%accuracy1)

pred2 = best_model.predict(val_x)
accuracy2 = metrics.accuracy_score(val_y,pred2)
print('acc on val: %.4f'%accuracy2)

acc on train: 0.9420
acc on val: 0.8889


In [22]:
# save result
result = best_model.predict(test)
result = pd.DataFrame(result)
result.columns = list('a')
result = result.rename(columns={'a':'label'})
result['id'] = result.index
result = result[['id','label']]
result.to_csv("result/result.csv", index=False)
result.head()

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


### Other model

In [23]:
lr_model = LogisticRegression()
lr_model.fit(train_x,train_y)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [24]:
pred1 = lr_model.predict(train_x)
accuracy1 = accuracy_score(train_y,pred1)
print('acc on train: %.4f'%accuracy1)

pred2 = lr_model.predict(val_x)
accuracy2 = accuracy_score(val_y,pred2)
print('acc on val: %.4f'%accuracy2)

acc on train: 0.9710
acc on val: 0.8889


In [25]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_x,train_y)

pred1 = clf.predict(train_x)
accuracy1 = accuracy_score(train_y,pred1)
print('acc on train: %.4f'%accuracy1)
pred_y_val = clf.predict(val_x)
accuracy = accuracy_score(val_y, pred_y_val)
print("acc on val: %.4f" %accuracy)

acc on train: 0.9710
acc on val: 0.8333


  
