Notebook này chạy trên local.

# Binary Classification with a Bank Churn Dataset

[Link_to_competition](https://www.kaggle.com/competitions/playground-series-s4e1)

## Exploring datasets

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [2]:
!kaggle competitions download -c playground-series-s4e1

playground-series-s4e1.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
from zipfile import ZipFile
with ZipFile('playground-series-s4e1.zip') as f:
    f.extractall('bank-churn-data')

In [4]:
import pandas as pd

In [5]:
raw_df = pd.read_csv('bank-churn-data/train.csv')
test_df = pd.read_csv('bank-churn-data/test.csv')
sub_df = pd.read_csv('bank-churn-data/sample_submission.csv')

In [6]:
raw_df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [7]:
raw_df.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [8]:
def view_categoricals_feature(cat_col: str):
    total = raw_df[cat_col].value_counts()
    zeros = raw_df[raw_df['Exited']==0][cat_col].value_counts()
    return zeros/total*100

In [9]:
view_categoricals_feature('Geography')

Geography
France     83.471846
Spain      82.782426
Germany    62.104837
Name: count, dtype: float64

In [10]:
view_categoricals_feature('Gender')

Gender
Male      84.094471
Female    72.031328
Name: count, dtype: float64

In [11]:
view_categoricals_feature('HasCrCard')

HasCrCard
1.0    79.356736
0.0    77.257056
Name: count, dtype: float64

In [12]:
view_categoricals_feature('IsActiveMember')

IsActiveMember
0.0    70.291368
1.0    87.465459
Name: count, dtype: float64

In [13]:
raw_df['Age'].describe()

count    165034.000000
mean         38.125888
std           8.867205
min          18.000000
25%          32.000000
50%          37.000000
75%          42.000000
max          92.000000
Name: Age, dtype: float64

In [14]:
raw_df['AgeGroup'] = pd.cut(raw_df['Age'],
                            bins=[17,30,60,100],
                            labels=['Adult','OldAdult','Old'])

In [15]:
test_df['AgeGroup'] = pd.cut(test_df['Age'],
                             bins=[17,30,60,100],
                             labels=['Adult','OldAdult','Old'])

In [16]:
view_categoricals_feature('AgeGroup')

AgeGroup
OldAdult    76.335046
Adult       91.735079
Old         68.575697
Name: count, dtype: float64

In [17]:
view_categoricals_feature('NumOfProducts')

NumOfProducts
2    93.957837
1    65.288081
3    11.748445
4    12.421053
Name: count, dtype: float64

In [18]:
view_categoricals_feature('Tenure')

Tenure
0     74.515678
1     77.386635
2     80.515378
3     77.089597
4     77.361285
5     77.993977
6     80.122614
7     81.240876
8     80.205479
9     78.789874
10    78.727365
Name: count, dtype: float64

## Preprocessing

In [19]:
numeric_cols = ['CreditScore','Tenure','Balance','NumOfProducts','EstimatedSalary']
categorical_cols = ['AgeGroup','Geography','Gender','HasCrCard','IsActiveMember']

In [20]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [21]:
scaler = MinMaxScaler().fit(raw_df[numeric_cols])

In [22]:
raw_df[numeric_cols] = scaler.transform(raw_df[numeric_cols])

In [23]:
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

In [24]:
raw_df[numeric_cols].describe()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,EstimatedSalary
count,165034.0,165034.0,165034.0,165034.0,165034.0
mean,0.612909,0.502035,0.221118,0.184818,0.56287
std,0.160207,0.280616,0.250371,0.182385,0.251488
min,0.0,0.0,0.0,0.0,0.0
25%,0.494,0.3,0.0,0.0,0.373166
50%,0.618,0.5,0.0,0.333333,0.589738
75%,0.72,0.7,0.478041,0.333333,0.775779
max,1.0,1.0,1.0,1.0,1.0


In [25]:
test_df[numeric_cols].describe()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,EstimatedSalary
count,110023.0,110023.0,110023.0,110023.0,110023.0
mean,0.613062,0.499664,0.220542,0.18444,0.561571
std,0.160631,0.280615,0.250255,0.181571,0.251409
min,0.0,0.0,0.0,0.0,0.0
25%,0.494,0.3,0.0,0.0,0.372179
50%,0.62,0.5,0.0,0.333333,0.58916
75%,0.72,0.7,0.478862,0.333333,0.773173
max,1.0,1.0,1.0,1.0,1.0


In [26]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(raw_df[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
encoded_df = pd.DataFrame(data=encoder.transform(raw_df[categorical_cols]), columns=encoded_cols)
encoded_df.index = raw_df.index
raw_df = pd.concat([raw_df, encoded_df], axis=1).copy()

In [27]:
encoded_df = pd.DataFrame(data=encoder.transform(test_df[categorical_cols]), columns=encoded_cols)
encoded_df.index = test_df.index
test_df = pd.concat([test_df, encoded_df], axis=1).copy()

In [28]:
inputs = raw_df[numeric_cols+encoded_cols]
targets = raw_df['Exited']

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets,
                                                                        test_size=0.2,
                                                                        random_state=42)

In [31]:
test_inputs = test_df[numeric_cols+encoded_cols]

## Training model

### ML Model

In [32]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [33]:
def test_params(**params):
    model = XGBClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_acc = model.score(train_inputs, train_targets)
    val_acc = model.score(val_inputs, val_targets)
    return train_acc, val_acc

In [34]:
for i in [None, 0, 42, 1212]:
    train_acc, val_acc = test_params(n_jobs=-1, random_state=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test None: train_acc = 0.857499, val_acc = 0.839307
Test 0: train_acc = 0.857499, val_acc = 0.839307
Test 42: train_acc = 0.857499, val_acc = 0.839307
Test 1212: train_acc = 0.857499, val_acc = 0.839307


In [35]:
for i in [100,150,250,350,500]:
    train_acc, val_acc = test_params(n_jobs=-1, random_state=42,
                                     n_estimators=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 100: train_acc = 0.857499, val_acc = 0.839307
Test 150: train_acc = 0.864603, val_acc = 0.837428
Test 250: train_acc = 0.876752, val_acc = 0.837277
Test 350: train_acc = 0.886455, val_acc = 0.834884
Test 500: train_acc = 0.898142, val_acc = 0.832854


In [36]:
for i in [3,4,5,6,9,12,15]:
    train_acc, val_acc = test_params(n_jobs=-1, random_state=42,
                                     n_estimators=150, max_depth=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 3: train_acc = 0.844388, val_acc = 0.842124
Test 4: train_acc = 0.847342, val_acc = 0.841458
Test 5: train_acc = 0.853416, val_acc = 0.839640
Test 6: train_acc = 0.864603, val_acc = 0.837428
Test 9: train_acc = 0.915449, val_acc = 0.831309
Test 12: train_acc = 0.974982, val_acc = 0.824855
Test 15: train_acc = 0.993994, val_acc = 0.822825


In [37]:
for i in [0.005,0.01,0.05,0.1,0.2,0.4]:
    train_acc, val_acc = test_params(n_jobs=-1, random_state=42,
                                     n_estimators=150, max_depth=4,
                                     learning_rate=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0.005: train_acc = 0.803570, val_acc = 0.804526
Test 0.01: train_acc = 0.833875, val_acc = 0.834096
Test 0.05: train_acc = 0.841987, val_acc = 0.842124
Test 0.1: train_acc = 0.843835, val_acc = 0.841731
Test 0.2: train_acc = 0.845729, val_acc = 0.841731
Test 0.4: train_acc = 0.849228, val_acc = 0.841579


In [38]:
for i in [0.3,0.5,0.7,0.9,0.99]:
    train_acc, val_acc = test_params(n_jobs=-1, random_state=42,
                                     n_estimators=150, max_depth=4,
                                     learning_rate=0.1, subsample=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0.3: train_acc = 0.843827, val_acc = 0.841518
Test 0.5: train_acc = 0.843949, val_acc = 0.842336
Test 0.7: train_acc = 0.843865, val_acc = 0.841973
Test 0.9: train_acc = 0.843850, val_acc = 0.841731
Test 0.99: train_acc = 0.843524, val_acc = 0.841397


In [39]:
xgb_model = XGBClassifier(n_jobs=-1, random_state=42,
                          n_estimators=150, max_depth=4,
                          learning_rate=0.1, subsample=0.7)

In [40]:
xgb_model.fit(train_inputs, train_targets)

In [41]:
xgb_model.score(val_inputs, val_targets)

0.8419729148362468

In [42]:
preds = xgb_model.predict(test_inputs)

In [100]:
preds = xgb_model.predict_proba(test_inputs)

In [103]:
preds[:,1]

array([0.02008939, 0.6231572 , 0.09258449, ..., 0.04302597, 0.40937373,
       0.28447258], dtype=float32)

In [44]:
len(test_inputs), len(preds)

(110023, 110023)

In [104]:
sub_df['Exited'] = preds[:,1]
sub_df.to_csv('bank-churn-data/sub2.csv',index=None)

In [110]:
from lightgbm import LGBMClassifier

In [111]:
def test_params(**params):
    model = LGBMClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_acc = model.score(train_inputs, train_targets)
    val_acc = model.score(val_inputs, val_targets)
    return train_acc, val_acc

In [112]:
test_params()

(0.8457209510175949, 0.8417608386099918)

### Feed Forward NN Model

In [46]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [47]:
inputs.shape, targets.shape

((165034, 17), (165034,))

In [48]:
input_tensors = torch.tensor(inputs.values, dtype=torch.float32)
target_tensors = torch.tensor(targets.values, dtype=torch.float32)

In [49]:
input_tensors.shape, target_tensors.shape

(torch.Size([165034, 17]), torch.Size([165034]))

In [50]:
from torch.utils.data import TensorDataset, DataLoader

In [51]:
raw_ds = TensorDataset(input_tensors, target_tensors)
raw_ds[0]

(tensor([0.6360, 0.3000, 0.0000, 0.3333, 0.9073, 0.0000, 0.0000, 1.0000, 1.0000,
         0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 1.0000, 0.0000]),
 tensor(0.))

In [52]:
random_seed = 42
torch.manual_seed(random_seed)

<torch._C.Generator at 0x2bb00920c30>

In [53]:
val_size = int(len(raw_ds)*0.2)
train_size = len(raw_ds)-val_size
from torch.utils.data import random_split
train_ds, val_ds = random_split(raw_ds, [train_size, val_size])

In [54]:
batch_size = 128

In [55]:
train_dl = DataLoader(raw_ds, batch_size, shuffle=True)
val_dl = DataLoader(raw_ds, batch_size*2)

In [56]:
from sklearn.metrics import accuracy_score
import numpy as np

In [57]:
class BaseModel(nn.Module):
    def training_step(self, batch):
        inputs, targets = batch
        outputs = self(inputs)
        probs = torch.sigmoid(outputs[:,0])
        loss = F.binary_cross_entropy(probs, targets)
        return loss

    def validation_step(self, batch):
        inputs, targets = batch
        outputs = self(inputs)
        probs = torch.sigmoid(outputs[:,0])
        loss = F.binary_cross_entropy(probs, targets)
        preds = (probs > 0.5).int()
        acc = accuracy_score(targets, preds)
        return {'val_loss': loss.item(), 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = np.mean(batch_losses)
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = np.mean(batch_accs)
        return {'val_loss': epoch_loss, 'val_acc': epoch_acc}

    def epoch_end(self, epoch, result):
        print('Epoch {}, val_loss: {:4f}, val_acc: {:4f}'.format(epoch,
                                                                 result['val_loss'],
                                                                 result['val_acc']))

In [76]:
class FFWith4LayerModel(BaseModel):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(17, 64)
        self.linear2 = nn.Linear(64,32)
        self.linear3 = nn.Linear(32,1)
    
    def forward(self, xb):
        out = self.linear1(xb)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.relu(out)
        out = self.linear3(out)
        return out

In [82]:
ff_with4_model = FFWith4LayerModel()

In [83]:
def evaluate(model, val_dl):
    outputs = [model.validation_step(batch) for batch in val_dl]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_dl, val_dl, opt_func=torch.optim.SGD):
    history = []
    opt = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        for batch in train_dl:
            loss = model.training_step(batch)
            loss.backward()
            opt.step()
            opt.zero_grad()
        result = evaluate(model, val_dl)
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [84]:
history = [evaluate(ff_with4_model, val_dl)]
history

[{'val_loss': 0.74759558818137, 'val_acc': 0.21157439865481076}]

In [85]:
history += fit(4, 0.2, ff_with4_model, train_dl, val_dl)

Epoch 0, val_loss: 0.417004, val_acc: 0.813347
Epoch 1, val_loss: 0.372343, val_acc: 0.836612
Epoch 2, val_loss: 0.372295, val_acc: 0.835043
Epoch 3, val_loss: 0.364732, val_acc: 0.839259


In [86]:
history += fit(4, 0.1, ff_with4_model, train_dl, val_dl)

Epoch 0, val_loss: 0.364592, val_acc: 0.839183
Epoch 1, val_loss: 0.365489, val_acc: 0.836885
Epoch 2, val_loss: 0.362969, val_acc: 0.840512
Epoch 3, val_loss: 0.363777, val_acc: 0.838998


In [87]:
history += fit(4, 0.05, ff_with4_model, train_dl, val_dl)

Epoch 0, val_loss: 0.362209, val_acc: 0.840106
Epoch 1, val_loss: 0.362078, val_acc: 0.840533
Epoch 2, val_loss: 0.362181, val_acc: 0.840539
Epoch 3, val_loss: 0.362156, val_acc: 0.840745


In [88]:
test_ds = TensorDataset(torch.tensor(test_inputs.values, dtype=torch.float32))
test_dl = DataLoader(test_ds, batch_size*2)

In [105]:
def predict(test_dl, model):
    all_preds = []
    for batch in test_dl:
        inputs, = batch
        outputs = model(inputs)
        probs = torch.sigmoid(outputs[:,0])
        all_preds += list(probs.detach().numpy())
    return all_preds

In [106]:
preds = predict(test_dl, ff_with4_model)
preds

[0.021964546,
 0.6661228,
 0.07467656,
 0.43269598,
 0.5157162,
 0.04084793,
 0.017103769,
 0.029056933,
 0.3144927,
 0.015317729,
 0.24044013,
 0.038533427,
 0.013477389,
 0.3417657,
 0.58137757,
 0.024719004,
 0.092116594,
 0.70870906,
 0.035172928,
 0.19546402,
 0.044325173,
 0.2111648,
 0.45060343,
 0.019957386,
 0.7513113,
 0.27792042,
 0.70135796,
 0.3035775,
 0.1845761,
 0.34595495,
 0.04633685,
 0.045434188,
 0.22857207,
 0.28670517,
 0.08235702,
 0.02466992,
 0.6359342,
 0.09009592,
 0.21631628,
 0.74663675,
 0.66049707,
 0.2737705,
 0.127582,
 0.014770898,
 0.6571522,
 0.11067122,
 0.026267536,
 0.025350684,
 0.007871462,
 0.55401385,
 0.12791197,
 0.0138231255,
 0.015368891,
 0.44477633,
 0.04452595,
 0.37381294,
 0.22222887,
 0.014047094,
 0.029176062,
 0.3778199,
 0.015519303,
 0.034504123,
 0.24961875,
 0.3611247,
 0.014472051,
 0.02916497,
 0.5773946,
 0.049026307,
 0.15988624,
 0.041737154,
 0.482514,
 0.5301539,
 0.5060219,
 0.049410943,
 0.02825597,
 0.7078681,
 0.194

In [107]:
sub_df['Exited'] = preds
sub_df.to_csv('bank-churn-data/sub3.csv',index=None)

In [93]:
class FFWith5LayerModel(BaseModel):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(17, 64)
        self.linear2 = nn.Linear(64, 128)
        self.linear3 = nn.Linear(128, 64)
        self.linear4 = nn.Linear(64, 32)
        self.linear5 = nn.Linear(32, 1)
    
    def forward(self, xb):
        out = self.linear1(xb)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.relu(out)
        out = self.linear3(out)
        out = F.relu(out)
        out = self.linear4(out)
        out = F.relu(out)
        out = self.linear5(out)
        return out

In [94]:
ff_with5_model = FFWith5LayerModel()

In [96]:
history = [evaluate(ff_with5_model, val_dl)]
history

[{'val_loss': 0.6803122398465179, 'val_acc': 0.7884256013451892}]

In [97]:
history += fit(4, 0.2, ff_with5_model, train_dl, val_dl)

Epoch 0, val_loss: 0.516082, val_acc: 0.724184
Epoch 1, val_loss: 0.367781, val_acc: 0.836388
Epoch 2, val_loss: 0.371466, val_acc: 0.837735
Epoch 3, val_loss: 0.363518, val_acc: 0.839598


In [98]:
history += fit(4, 0.1, ff_with5_model, train_dl, val_dl)

Epoch 0, val_loss: 0.365369, val_acc: 0.838601
Epoch 1, val_loss: 0.363610, val_acc: 0.839625
Epoch 2, val_loss: 0.373678, val_acc: 0.831449
Epoch 3, val_loss: 0.364298, val_acc: 0.839613


In [99]:
history += fit(8, 0.02, ff_with5_model, train_dl, val_dl)

Epoch 0, val_loss: 0.361994, val_acc: 0.840857
Epoch 1, val_loss: 0.361846, val_acc: 0.840521
Epoch 2, val_loss: 0.361950, val_acc: 0.840597
Epoch 3, val_loss: 0.361878, val_acc: 0.840679
Epoch 4, val_loss: 0.361841, val_acc: 0.840966
Epoch 5, val_loss: 0.361747, val_acc: 0.840972
Epoch 6, val_loss: 0.362388, val_acc: 0.840815
Epoch 7, val_loss: 0.361725, val_acc: 0.840688


In [108]:
preds = predict(test_dl, ff_with5_model)
preds

[0.017368143,
 0.674518,
 0.080599196,
 0.45374426,
 0.51657325,
 0.039768185,
 0.012568688,
 0.025300419,
 0.30520895,
 0.009973136,
 0.23646533,
 0.033130467,
 0.010841757,
 0.32679722,
 0.56813955,
 0.025970966,
 0.086183846,
 0.71194667,
 0.026628783,
 0.2224446,
 0.040974785,
 0.17926924,
 0.45933867,
 0.013626237,
 0.7533729,
 0.22639117,
 0.7126588,
 0.30928406,
 0.2184384,
 0.34596804,
 0.04088821,
 0.041841406,
 0.24785058,
 0.22019461,
 0.07644065,
 0.02297487,
 0.6533794,
 0.09205368,
 0.20482522,
 0.74085563,
 0.6784341,
 0.23127358,
 0.11878197,
 0.011565358,
 0.668313,
 0.09888513,
 0.026797073,
 0.022352654,
 0.0069210296,
 0.5565443,
 0.13504162,
 0.012553825,
 0.016069314,
 0.4399982,
 0.033200264,
 0.28308898,
 0.21538335,
 0.013970107,
 0.027635204,
 0.39999512,
 0.015741594,
 0.020619899,
 0.26983118,
 0.34640625,
 0.013597836,
 0.027796905,
 0.57354426,
 0.05391004,
 0.1487234,
 0.03848402,
 0.4985635,
 0.5295539,
 0.52895826,
 0.0495957,
 0.016886147,
 0.71514046,

In [109]:
sub_df['Exited'] = preds
sub_df.to_csv('bank-churn-data/sub4.csv',index=None)