In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb



# List every file available under the Kaggle input directory so the
# dataset paths used below can be checked at a glance.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


In [2]:
class Config:
    """Notebook-wide settings shared by the preprocessing and modelling cells."""

    # Seed passed to the CV splitter and the XGBoost model.
    random_state = 42
    # Columns removed from the raw frames before any feature processing.
    drop_col = ['id', 'CustomerId', 'Surname']

In [3]:
# Read the three competition files: labelled training rows, unlabelled test
# rows, and the submission template that the predictions are written into.
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e1/sample_submission.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')
train_data = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')

# Preview the first five training rows.
train_data.head(5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Data analysis
# id - can be deleted
# customerId - can be deleted
# surname - drop?
# creditScore - can just leave as-is i think
# geography - look into number of unique values (then decide on how to categorically encode after)
# Gender - turn to a binary
# Age - might not need anything
# Tenure - check unique values - might not need preprocessed
# Balance - n/a
# NumOfProducts - n/a
# HasCrCard - n/a
# isActiveMember
# EstimatedSalary
# 
# Exited - label we are trying to predict 

# Things to investigate ==========
# Data cleaning: no null values found (TODO: this note was left unfinished - record what else was checked).
# Look more into customerId and name etc...

In [5]:
# # print(train_data.nunique())
# # print(train_data.isnull().sum())
# num_cols = len(train_data.columns)

# # Calculate number of subplot rows and columns
# num_rows = (num_cols + 1) // 2

# plt.figure(figsize=(10,10)) 
# for i, col in enumerate(train_data):
#     if pd.api.types.is_numeric_dtype(train_data[col]):
#         print(col)
#         plt.subplot(num_rows,2, i+1)
#         train_data[col].hist(bins=20)
#         plt.xlabel(col)
#         plt.ylabel('frequency')
#     else:
#         print(f'NON NUMERICAL: {col}')

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the columns listed in ``Config.drop_col``."""

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` for chaining."""
        return self

    def transform(self, x):
        """Return a new frame without the ``Config.drop_col`` columns."""
        return x.drop(columns=Config.drop_col)
    
class Encoder(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces the ``Gender`` column with integer codes.

    The label mapping is learned once in ``fit`` and reused by ``transform``,
    so every frame passed through a fitted instance is encoded with the same
    mapping. ``fit`` must be called before ``transform``.
    """

    def fit(self, x, y=None):
        """Learn the ``Gender`` label mapping from ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Gender`` column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.label_encoder_ = LabelEncoder().fit(x['Gender'])
        return self

    def transform(self, x):
        """Return a copy of ``x`` with ``Gender`` encoded by the fitted mapping.

        Raises
        ------
        ValueError
            Raised by ``LabelEncoder.transform`` if ``x['Gender']`` contains a
            label that was not present when ``fit`` was called.
        """
        # Work on a copy so the caller's frame is not modified in place.
        x = x.copy()
        x['Gender'] = self.label_encoder_.transform(x['Gender'])
        return x
    
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that one-hot encodes the ``Geography`` column.

    The set of categories is learned in ``fit`` so that ``transform`` always
    produces the same ``Geography_*`` columns, in the same order, regardless
    of which categories happen to occur in the frame being transformed.
    ``fit`` must be called before ``transform``.

    Note: this class shares its name with scikit-learn's ``OneHotEncoder``
    but is a separate, DataFrame-based implementation.
    """

    def fit(self, x, y=None):
        """Record the sorted distinct non-null ``Geography`` values of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Geography`` column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.categories_ = sorted(x['Geography'].dropna().unique())
        return self

    def transform(self, x):
        """Return a copy of ``x`` with ``Geography`` expanded into dummy columns.

        Values not seen during ``fit`` become missing in the categorical
        column and therefore produce a row of zeros across all dummy columns.
        All boolean-typed columns of the result are converted to ``int``.
        """
        # Work on a copy so the caller's frame is not modified in place.
        x = x.copy()
        # A categorical with fixed categories makes get_dummies emit one
        # column per learned category even if it is absent from this frame.
        x['Geography'] = pd.Categorical(x['Geography'], categories=self.categories_)
        x = pd.get_dummies(x, columns=['Geography'])
        binary_cols = [col for col in x if pd.api.types.is_bool_dtype(x[col])]
        x[binary_cols] = x[binary_cols].astype(int)
        return x
    
class Scaler(BaseEstimator, TransformerMixin):
    """Pipeline step that standardises the continuous feature columns.

    The ``StandardScaler`` statistics are learned in ``fit`` and reused by
    ``transform``, so data transformed later (e.g. the test set) is scaled
    with the statistics of the data the step was fitted on rather than its
    own. ``fit`` must be called before ``transform``.
    """

    # Columns that are standardised; all other columns pass through unchanged.
    _COLS_TO_SCALE = ['Balance', 'EstimatedSalary', 'CreditScore']

    def fit(self, x, y=None):
        """Fit a ``StandardScaler`` on the scaled columns of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing the ``Balance``, ``EstimatedSalary`` and
            ``CreditScore`` columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.scaler_ = StandardScaler().fit(x[self._COLS_TO_SCALE])
        return self

    def transform(self, x):
        """Return a copy of ``x`` with the scaled columns standardised."""
        # Work on a copy so the caller's frame is not modified in place.
        x = x.copy()
        x[self._COLS_TO_SCALE] = self.scaler_.transform(x[self._COLS_TO_SCALE])
        return x

In [7]:
from sklearn.pipeline import Pipeline

# Preprocessing chain, applied in order: drop identifier columns, encode
# Gender, one-hot encode Geography, then standardise the continuous columns.
preprocessing_steps = [
    ('NameDropper', NameDropper()),
    ('Encoder', Encoder()),
    ('OneHotEncoder', OneHotEncoder()),
    ('Scaler', Scaler()),
]
pipe = Pipeline(preprocessing_steps)

# Fit the chain on the training frame and keep the transformed result.
processed_data = pipe.fit_transform(train_data)
processed_data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,0.144135,1,33.0,3,-0.883163,2,1.0,0.0,1.369486,0,1,0,0
1,-0.367706,1,33.0,1,-0.883163,2,1.0,1.0,-1.254085,0,1,0,0
2,0.268974,1,40.0,10,-0.883163,2,1.0,0.0,1.437422,0,1,0,0
3,-0.941966,1,34.0,2,1.486918,1,1.0,1.0,-0.557018,0,1,0,0
4,0.743362,1,33.0,5,-0.883163,2,1.0,1.0,-1.93877,0,0,0,1


In [8]:
# Separate the target column ('Exited') from the model features.
y = processed_data['Exited']
x = processed_data.drop('Exited', axis=1)

In [9]:
from sklearn.metrics import roc_auc_score

# Shuffled 5-fold split, seeded from Config so the folds are reproducible.
CV = KFold(n_splits=5, shuffle=True, random_state=Config.random_state)

model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    seed=Config.random_state,
)

# One ROC AUC value per fold; the mean is printed after the loop.
avg_auc_scores = []
for train_idx, val_idx in CV.split(x):
    # The same estimator object is refitted on each fold's training rows.
    model.fit(x.iloc[train_idx], y.iloc[train_idx])

    # Column 1 of predict_proba is the probability of the positive class.
    val_proba = model.predict_proba(x.iloc[val_idx])[:, 1]

    avg_auc_scores.append(roc_auc_score(y.iloc[val_idx], val_proba))
print(np.mean(avg_auc_scores))

0.8865798645314144


In [10]:
# Apply the preprocessing that was fitted on the training data. Calling
# fit_transform here would re-fit every step on the test set instead.
processed_test_data = pipe.transform(test_data)

# Feed the model exactly the training feature columns, in training order.
# A KeyError here means the test frame is missing a training feature.
processed_test_data = processed_test_data[x.columns]

# After the CV loop `model` is fitted on only the last fold's training split;
# refit on all labelled rows before predicting the test set.
model.fit(x, y)

# Column 1 of predict_proba is the probability of the positive class.
test_y_pred = model.predict_proba(processed_test_data)[:,1]

print(test_y_pred.shape)

(110023,)


In [11]:
# Put the predicted probabilities into the template's 'Exited' column and
# write the result without the index column.
sample_submission = sample_submission.assign(Exited=test_y_pred)
sample_submission.to_csv('submission.csv', index=False)