In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings;warnings.filterwarnings('ignore')

In [2]:
from xgboost import XGBClassifier
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import set_config
set_config(display='diagram')

In [3]:
# set seed for reproducibility
SEED = 30
np.random.seed(SEED)

In [4]:
train = pd.read_csv('../data/train.csv').drop(['id'], axis=1)
test = pd.read_csv('../data/test.csv').drop(['id'], axis=1)
print(f'Train data shape: {train.shape}, Test data shape: {test.shape}')

Train data shape: (28322, 35), Test data shape: (85065, 34)


In [5]:
df = pd.concat([train, test])
df.head()

Unnamed: 0,gender,s11,s12,s13,s16,s17,s18,s48,s52,s53,...,n7,n8,n9,n10,n11,n12,n13,n14,n15,label
0,M,Y,N,1,D,D,B,0,1,,...,-9.126056,1.732291,3.698504,4.804517,1.544484,0,0,0.63122,5,0.0
1,M,Y,Y,1,D,D,B,1,1,,...,-9.098287,1.505885,6.791357,6.110416,1.712354,0,0,0.392746,3,1.0
2,M,Y,Y,1,D,D,B,0,1,,...,-9.234894,1.503828,4.109685,3.953226,1.80426,0,0,0.222537,2,0.0
3,F,Y,Y,1,D,D,B,0,1,,...,-9.378025,1.485863,7.265876,4.559419,1.537645,0,0,0.154409,4,0.0
4,M,N,Y,1,B,D,D,1,l,,...,-9.261962,1.61921,3.737647,4.052003,1.637831,0,1,0.73756,1,0.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113387 entries, 0 to 85064
Data columns (total 35 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   gender  113387 non-null  object 
 1   s11     113387 non-null  object 
 2   s12     113387 non-null  object 
 3   s13     113387 non-null  int64  
 4   s16     113387 non-null  object 
 5   s17     113387 non-null  object 
 6   s18     113387 non-null  object 
 7   s48     113387 non-null  int64  
 8   s52     113387 non-null  object 
 9   s53     113387 non-null  object 
 10  s54     10371 non-null   object 
 11  s55     12627 non-null   object 
 12  s56     0 non-null       float64
 13  s57     0 non-null       float64
 14  s58     113387 non-null  object 
 15  s59     0 non-null       float64
 16  s69     113387 non-null  object 
 17  s70     113387 non-null  object 
 18  s71     113387 non-null  object 
 19  n1      113387 non-null  float64
 20  n2      113387 non-null  float64
 21  n3      113

In [7]:
dummy = ['s12','s13','s53','s58','n4','n11','n12','n13','n14','n15','s54','s55','s56','s57','s59']
df.drop(dummy, axis=1, inplace=True)

In [8]:
df['s48'] = df['s48'].astype(object)

# Train test split

In [9]:
dtrain = df[df['label'].notna()]
dtest = df[df['label'].isna()]

X = dtrain.drop('label', axis=1)
y = dtrain['label']

test = dtest.drop('label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, shuffle=True)

# Get numerical & categorical columns

In [10]:
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(df)
numerical_columns.remove('label')
categorical_columns = categorical_columns_selector(df)
print(f'Numerical: {numerical_columns} \nCategorical: {categorical_columns}')

Numerical: ['n1', 'n2', 'n3', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10'] 
Categorical: ['gender', 's11', 's16', 's17', 's18', 's48', 's52', 's69', 's70', 's71']


In [11]:
categorical_preprocessor = OrdinalEncoder()
numerical_preprocessor = StandardScaler()

In [12]:
preprocessor = ColumnTransformer([
    ('label-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [13]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.02, 0.05],
        'n_estimators': [800, 1000, 1500]
}
best_params = {
        'min_child_weight': [5],
        'gamma': [0.5],
        'subsample': [0.6],
        'colsample_bytree': [0.6],
        'max_depth': [5],
        'learning_rate': [0.01]    
}

In [14]:
cv = KFold(n_splits=2, random_state=SEED, shuffle=True)

In [15]:
xgb = XGBClassifier(random_state=SEED,learning_rate=0.02, n_estimators=1000, objective='binary:logistic', nthread=6, tree_method='gpu_hist', eval_metric='auc')
random_search = GridSearchCV(xgb, param_grid=best_params, scoring='roc_auc', n_jobs=8, cv=cv, verbose=10)

In [16]:
model = make_pipeline(preprocessor, random_search)

In [17]:
model.fit(X_train, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


In [18]:
pred = model.predict(X_test)

In [19]:
model.score(X_test, y_test)

0.8707785916987185

In [20]:
roc_auc_score(y_test, pred)

0.7245736482749016

In [21]:
testid = pd.read_csv('../data/test.csv')
test_pred = model.predict(test)
submission = pd.concat([testid['id'],pd.Series(test_pred)],axis=1)
submission.columns = ['id', 'label']
submission.to_csv('submission.csv', index=False)