In [1]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import mutual_info_score
from sklearn.metrics import roc_curve


import pickle

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
#!pip install xgboost
import xgboost as xgb

In [2]:
# Load the raw churn table and drop the customer identifier in the same
# expression, so the column never reaches the feature matrix.
df = pd.read_csv("data/bank_churn_data.csv").drop(columns=['customer_id'])

In [3]:
# Two-stage split: hold out 20% of all rows for test, then take 25% of the
# remaining rows for validation (i.e. 60% / 20% / 20% of the original data).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Renumber the rows of every subset from 0 so positions and labels line up.
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() returns the target column and removes it from the frame in one step,
# leaving feature-only frames and plain numpy label arrays.
y_train = df_train.pop('churn').values
y_test = df_test.pop('churn').values
y_val = df_val.pop('churn').values
y_full_train = df_full_train.pop('churn').values

In [4]:
# Booster configuration handed to xgb.train().
xgb_params = dict(
    # tree growth
    eta=0.01,
    max_depth=8,
    min_child_weight=30,
    # learning task
    objective='binary:logistic',
    eval_metric='auc',
    # runtime
    nthread=8,
    seed=1,
    verbosity=1,
)

In [5]:
def train(df, y, params):
    df = df_full_train.to_dict(orient='records')
 
    dv = DictVectorizer(sparse=False)
    X_full_train = dv.fit_transform(df)

    feature_names = list(dv.get_feature_names_out())

    feature_names = list(dv.get_feature_names_out())
    dfulltrain = xgb.DMatrix(X_full_train, label=y_full_train,
                    feature_names=feature_names)
 
    model = xgb.train(params, dfulltrain, num_boost_round=200,
                  verbose_eval=5,)

    return (dv, model)
    

In [6]:
def valid(model, dv, df, y):
    dicts_test = df.to_dict(orient='records')
    X_test = dv.transform(dicts_test)

    feature_names = list(dv.get_feature_names_out())
    dtest = xgb.DMatrix(X_test, feature_names=feature_names)

    y_pred = model.predict(dtest)
    return roc_auc_score(y, y_pred)

    

In [7]:
### save model
dv, model = train(df_full_train, y_train, xgb_params)

In [16]:
# NOTE: df_val was split out of df_full_train, and the model is fitted on
# df_full_train, so these rows were seen during training. This AUC is
# optimistic; the test-set score is the unbiased one.
valid(model, dv, df_val, y_val)

0.86801261829653

In [9]:
# Held-out score: df_test was split off before the training rows were chosen.
valid(model, dv, df_test, y_test)

0.8743004826878491

In [10]:
## Persist the fitted vectorizer and the booster together in one pickle,
## so that loading the file gives back the (dv, model) pair.
output_file = 'xboos_model.bin'

with open(output_file, mode='wb') as f_out:
    pickle.dump((dv, model), f_out)

print(f"The model is save to {output_file}")

The model is save to xboos_model.bin


In [23]:
# Sample payload: the row at position 1 of the full-train features as a plain
# dict. Only that one row is converted (iloc[[1]] keeps it as a one-row frame,
# so each column keeps its own dtype) instead of the whole frame.
# The variable name keeps its original spelling.
custumer = df_full_train.iloc[[1]].to_dict(orient='records')[0]
custumer

{'credit_score': 626,
 'country': 'France',
 'gender': 'Female',
 'age': 29,
 'tenure': 4,
 'balance': 105767.28,
 'products_number': 2,
 'credit_card': 0,
 'active_member': 0,
 'estimated_salary': 41104.82}

In [17]:
# One-row DataFrame (the first test row) used as input for predict().
X = df_test.iloc[:1]

In [18]:
# Show the one-row frame that is scored by predict() further down.
X

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,550,France,Male,47,2,0.0,2,1,1,97057.28


In [13]:
def predict(df, model, dv):
    """Return the model output for the first row of ``df``.

    The rows of ``df`` are converted to dicts, encoded with the fitted
    vectorizer ``dv``, wrapped in an ``xgb.DMatrix`` labelled with the
    vectorizer's feature names, and passed to ``model.predict``. Only the
    first element of the resulting predictions is returned.
    """
    records = df.to_dict(orient='records')
    encoded = dv.transform(records)
    columns = list(dv.get_feature_names_out())

    dmatrix = xgb.DMatrix(encoded, feature_names=columns)
    scores = model.predict(dmatrix)
    return scores[0]

In [14]:
# Score the single sample row with the trained model.
predict(X, model, dv)

0.0942261