In [5]:
# data process
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# models
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

# metric
from sklearn.metrics import roc_auc_score

# save & load
import pickle
import yaml

RANDOM_SEED = 13

# Columns Description:

- `CustomerId`: Уникальный идентификатор для каждого клиента
- `Surname`: Фамилия клиента
- `CreditScore`: Числовое значение, представляющее кредитный рейтинг клиента
- `Geography`: Страна, в которой проживает клиент (Франция, Испания или Германия)
- `Gender`: Пол клиента (Мужчина или Женщина)
- `Age`: Возраст клиента
- `Tenure`: Количество лет, в течение которых клиент обслуживается в банке
- `Balance`: Баланс на счете клиента
- `NumOfProducts`: Количество банковских продуктов, которыми пользуется клиент (например, сберегательный счет, кредитная карта)
- `HasCrCard`: Есть ли у клиента кредитная карта (1 = да, 0 = нет)
- `IsActiveMember`: Является ли клиент активным членом (1 = да, 0 = нет)
- `EstimatedSalary`: Предполагаемая зарплата клиента
- `Exited`: Покинул ли клиент банк (1 = да, 0 = нет)

In [6]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/train.csv',
    index_col=0,
)

In [7]:
# Preview the raw frame (pandas truncates the rendering to the first and last rows).
display(df)

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [8]:
# Inspect column dtypes and non-null counts before the categorical columns are encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165034 entries, 0 to 165033
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   CustomerId       165034 non-null  int64  
 1   Surname          165034 non-null  object 
 2   CreditScore      165034 non-null  int64  
 3   Geography        165034 non-null  object 
 4   Gender           165034 non-null  object 
 5   Age              165034 non-null  float64
 6   Tenure           165034 non-null  int64  
 7   Balance          165034 non-null  float64
 8   NumOfProducts    165034 non-null  int64  
 9   HasCrCard        165034 non-null  float64
 10  IsActiveMember   165034 non-null  float64
 11  EstimatedSalary  165034 non-null  float64
 12  Exited           165034 non-null  int64  
dtypes: float64(5), int64(5), object(3)
memory usage: 17.6+ MB


In [9]:
# Label-encode the two string columns so the tree models receive numeric input.
geography_mapping = {'France': 0, 'Spain': 1, 'Germany': 2}
gender_mapping = {'Male': 0, 'Female': 1}

for column, mapping in (('Geography', geography_mapping), ('Gender', gender_mapping)):
    # Already encoded (cell re-run): mapping again would turn every value into NaN.
    if df[column].isin(list(mapping.values())).all():
        continue

    encoded = df[column].map(mapping)

    # `Series.map` yields NaN for labels missing from the mapping; fail loudly
    # instead of silently training on NaN-filled float columns.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(f'Unmapped values in column {column!r}: {unknown}')

    df[column] = encoded

In [10]:
# Column the models predict.
target = 'Exited'

# Candidate input columns (identifier-like columns CustomerId and Surname are left out).
features = [
    'CreditScore', 'Geography', 'Gender',
    'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember',
    'EstimatedSalary',
]

# Shuffled 75/25 hold-out split, stratified on the target column.
train, test = train_test_split(
    df,
    test_size=0.25,
    shuffle=True,
    stratify=df[target],
    random_state=RANDOM_SEED,
)

# Create Models

In [11]:
# Baseline: random forest trained on every candidate feature.
rf_params = dict(
    n_estimators=200,
    criterion='gini',
    max_depth=10,
    min_samples_split=10,
    random_state=RANDOM_SEED,
)
model1 = RandomForestClassifier(**rf_params)

# The fitted estimator is the cell's last expression, so its repr is displayed.
model1.fit(train[features], train[target])

RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=200,
                       random_state=13)

In [12]:
# Hold-out ROC AUC of the random forest, scored on the last predict_proba column.
rf_test_proba = model1.predict_proba(test[features])[:, -1]
roc_auc_score(test[target], rf_test_proba)

0.8868771673540963

In [13]:
# Random-forest feature importances, most important first.
rf_importances = pd.DataFrame({
    'feature': model1.feature_names_in_,
    'value': model1.feature_importances_,
})
rf_importances.sort_values(by='value', ascending=False)

Unnamed: 0,feature,value
3,Age,0.361596
6,NumOfProducts,0.315792
8,IsActiveMember,0.106024
5,Balance,0.06449
1,Geography,0.062629
2,Gender,0.0324
9,EstimatedSalary,0.022766
0,CreditScore,0.02188
4,Tenure,0.009305
7,HasCrCard,0.003116


In [14]:
# Reduced feature set: the six columns ranked highest by the random-forest importances.
features_upd = [
    'Age',
    'NumOfProducts',
    'IsActiveMember',
    'Balance',
    'Geography',
    'Gender',
]

lgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=10,
    min_child_samples=5,
    random_state=RANDOM_SEED,
)
model2 = LGBMClassifier(**lgb_params)

# The fitted estimator is the cell's last expression, so its repr is displayed.
model2.fit(train[features_upd], train[target])

[LightGBM] [Info] Number of positive: 26191, number of negative: 97584
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 336
[LightGBM] [Info] Number of data points in the train set: 123775, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211602 -> initscore=-1.315298
[LightGBM] [Info] Start training from score -1.315298


LGBMClassifier(max_depth=10, min_child_samples=5, random_state=13)

In [15]:
# Hold-out ROC AUC of the LightGBM model, scored on the last predict_proba column.
lgb_test_proba = model2.predict_proba(test[features_upd])[:, -1]
roc_auc_score(test[target], lgb_test_proba)

0.886819794986354

In [35]:
# Re-check dtypes on the training split: Geography and Gender are integer-encoded at this point.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 123775 entries, 39356 to 58978
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   CustomerId       123775 non-null  int64  
 1   Surname          123775 non-null  object 
 2   CreditScore      123775 non-null  int64  
 3   Geography        123775 non-null  int64  
 4   Gender           123775 non-null  int64  
 5   Age              123775 non-null  float64
 6   Tenure           123775 non-null  int64  
 7   Balance          123775 non-null  float64
 8   NumOfProducts    123775 non-null  int64  
 9   HasCrCard        123775 non-null  float64
 10  IsActiveMember   123775 non-null  float64
 11  EstimatedSalary  123775 non-null  float64
 12  Exited           123775 non-null  int64  
dtypes: float64(5), int64(7), object(1)
memory usage: 13.2+ MB


In [16]:
# LightGBM feature importances, most important first.
lgb_importances = pd.DataFrame({
    'feature': model2.feature_names_in_,
    'value': model2.feature_importances_,
})
lgb_importances.sort_values(by='value', ascending=False)

Unnamed: 0,feature,value
3,Balance,1140
0,Age,827
1,NumOfProducts,348
4,Geography,246
2,IsActiveMember,231
5,Gender,208


# Test

In [37]:
# Smoke-test the LightGBM model on one hand-made customer.
# The row is keyed by feature name and ordered by `features_upd`, so it cannot
# drift out of sync with the training columns (a bare array is matched purely by position).
sample_customer = pd.DataFrame(
    [{
        'Age': 57,
        'NumOfProducts': 3,
        'IsActiveMember': 1,
        'Balance': 250000,
        'Geography': 1,  # Spain in geography_mapping
        'Gender': 0,     # Male in gender_mapping
    }],
    columns=features_upd,
)

model2.predict_proba(X=sample_customer)

array([[0.11945435, 0.88054565]])

# Save

In [31]:
# Hold-out ROC AUC of the LightGBM model, stored as a plain float for the YAML export.
lgb_holdout_proba = model2.predict_proba(test[features_upd])[:, -1]
roc_auc = roc_auc_score(test[target], lgb_holdout_proba)

metrics = dict(roc_auc_score=float(roc_auc))

In [21]:
# Allowed input values per feature, derived from the training split.
# Every pandas aggregate is cast to a built-in int/float: pandas returns numpy
# scalars, which `yaml.dump` serialises as `!!python/object/apply:numpy...` tags
# that `yaml.safe_load` refuses to read.
borders = {
    'Age': {
        'min': int(train['Age'].min()),
        'max': int(train['Age'].max()),
    },
    'NumOfProducts': {
        'min': int(train['NumOfProducts'].min()),
        'max': int(train['NumOfProducts'].max()),
    },
    'IsActiveMember': {'Да': 1, 'Нет': 0},
    'Balance': {
        'min': float(train['Balance'].min()),
        'max': float(train['Balance'].max()),
    },
    # Label -> code pairs; the codes match the label encoding applied to the data.
    'Geography': {'Франция': 0, 'Испания': 1, 'Германия': 2},
    'Gender': {'Мужчина': 0, 'Женщина': 1},
}

In [32]:
# Persist the evaluation metrics.
# The file encoding is set on `open`: PyYAML ignores its own `encoding=` argument
# when the stream is a text-mode file. `safe_dump` guarantees plain YAML and
# raises if a non-builtin object (e.g. a numpy scalar) slips into `metrics`.
with open('../configs/metrics.yaml', 'w', encoding='utf-8') as f:
    yaml.safe_dump(metrics, f)

In [91]:
# Persist the feature constraints.
# The file encoding is set on `open`: PyYAML ignores its own `encoding=` argument
# when the stream is a text-mode file. `allow_unicode=True` writes the Cyrillic
# labels as readable text instead of "\uXXXX" escapes.
# NOTE: readers must open this file with encoding='utf-8'.
with open('../configs/feature_constraints.yaml', 'w', encoding='utf-8') as f:
    yaml.dump(borders, f, allow_unicode=True)

In [86]:
# Serialise both fitted models with pickle.
model_files = (
    ('../models/RF_model.pkl', model1),
    ('../models/LGB_model.pkl', model2),
)

for model_path, fitted_model in model_files:
    with open(model_path, 'wb') as f:
        pickle.dump(fitted_model, f)