In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.preprocessing import LabelEncoder , StandardScaler , OneHotEncoder , OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

In [2]:
mlflow.set_tracking_uri("http://mlflow_server:5000")
mlflow.autolog()
mlflow.set_experiment("bank_churn")
mlflow.set_tag('mlflow.runName', 'bank_chunr_v1')

2024/12/15 13:30:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



In [3]:
data = pd.read_csv('data/train.csv')

In [4]:
data = data.drop(['id','CustomerId','Surname'] , axis = 1)

In [5]:
category_features = ['Tenure', 'NumOfProducts','HasCrCard','IsActiveMember','Geography','Gender']

In [6]:
def convert_to_category(data,features):
    for i in features:
        data[i] = data[i].astype('category')

In [7]:
convert_to_category(data,category_features)

In [8]:
numberic_features = data.drop(category_features , axis = 1).columns
numberic_features = data[numberic_features].drop('Exited' , axis = 1).columns
numberic_features

Index(['CreditScore', 'Age', 'Balance', 'EstimatedSalary'], dtype='object')

In [9]:
balance_cut = pd.cut(data['Balance'] ,bins = [-1 , 50000 , 100000 ,150000, data['Balance'].max()],
                     labels = ['Low','Normal','High','Very High'])

In [10]:
exited_old_age = data.loc[(data.Age >= 40) & (data.Exited == 1)]['Age'] 
exited_young_age = data.loc[(data.Age < 40) & (data.Exited == 1)]['Age']

In [11]:
mean1, mean2 = np.mean(exited_old_age), np.mean(exited_young_age)
std1, std2 = np.std(exited_old_age.std(), ddof=1), np.std(exited_young_age.std(), ddof=1)
n1, n2 = len(exited_old_age), len(exited_young_age)

z = (mean1 - mean2) / np.sqrt((std1**2/n1) + (std2**2/n2))

# alpha = 0.05 2 phía
if z <= 1.645:
    result = "H0"
else:
    result = "H1"

print(f'Result: {result}')

Result: H1


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


#### P(exited) : Mean Old age people > Mean Young age people (alpha = 0.05)

In [12]:
x_train,x_test,y_train,y_test = train_test_split(data.drop('Exited', axis = 1),data['Exited'], 
                                test_size = 0.2, 
                                shuffle = True , random_state = 42)

print(x_train.shape , y_train.shape)
print(x_test.shape , y_test.shape)

(132027, 10) (132027,)
(33007, 10) (33007,)


In [13]:
transformer = ColumnTransformer(transformers=[
    ('num_transformer' , StandardScaler() , numberic_features),
    ('ord_transformer' , OrdinalEncoder() , ['Gender']),
    ('nom_transformer' , OneHotEncoder(drop = 'first') , ['Geography'])
] , remainder='passthrough')

x_train_new = transformer.fit_transform(x_train)
x_test_new = transformer.transform(x_test)
# data_test_new = transformer.transform(data_test)

In [14]:
# import xgboost as xgb

# xgboost = xgb.XGBClassifier(n_estimators=300, random_state=42)
# xgboost.fit(x_train_new, y_train)

In [15]:
# from sklearn.ensemble import RandomForestClassifier

# # Khởi tạo mô hình Random Forest
# rf = RandomForestClassifier(random_state=42)

# # Huấn luyện mô hình
# rf.fit(x_train_new, y_train)

In [27]:
from sklearn.metrics import classification_report , precision_score , recall_score , accuracy_score , f1_score

models = [rf]

def evalution(models ,x_test, y_test):
    for i in models:
        y_pred = i.predict(x_test)
        
        print(i)
        
        accuracy = accuracy_score(y_test,y_pred)
        print(accuracy)
        
        precision = precision_score(y_test,y_pred)
        print(precision)

        recall = recall_score(y_test,y_pred)
        print(recall)

        f1 = f1_score(y_test,y_pred)
        print(f1)

        result = classification_report(y_test,y_pred)
        print(result)


In [17]:
# evalution(models , x_test_new , y_test)

In [18]:
# data_test = data_test.drop(['id','CustomerId','Surname'], axis = 1)
# data_test_new = transformer.transform(data_test)

In [19]:
# y_pred_proba = xgboost.predict(data_test_new)

In [20]:
# submission_pd = pd.DataFrame({'id': data_test.iloc[:,0] , 'Excited': y_pred_proba})

# submission = submission_pd.to_csv('submission.csv' , index = False)

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
mlflow.end_run()
with mlflow.start_run():
    # Khởi tạo mô hình Random Forest
    rf = RandomForestClassifier(random_state=42)
    # Huấn luyện mô hình
    rf.fit(x_train_new, y_train)
    predictions = rf.predict(x_test_new)
    signature = infer_signature(x_test_new, predictions)
    mlflow.sklearn.log_model(rf, "model", signature=signature)
    print(f"Model saved in run {mlflow.active_run().info.run_uuid}")
# rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
# rf.fit(x_train_new, y_train)




🏃 View run likeable-vole-69 at: http://mlflow_server:5000/#/experiments/0/runs/27ebdac888a74458a2d13d176bae222c
🧪 View experiment at: http://mlflow_server:5000/#/experiments/0
Model saved in run 1b637062020d4da79e3c7866bfd6a3aa
🏃 View run rumbling-hawk-497 at: http://mlflow_server:5000/#/experiments/0/runs/1b637062020d4da79e3c7866bfd6a3aa
🧪 View experiment at: http://mlflow_server:5000/#/experiments/0


Model saved in run 27ebdac888a74458a2d13d176bae222c


In [35]:
logged_model = 'runs:/1b637062020d4da79e3c7866bfd6a3aa/model'
# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
# Predict on a Pandas DataFrame.
pre = loaded_model.predict(x_test_new)

In [37]:
len(pre)

33007