In [113]:
import numpy as np
import pandas as pd
import plotly.express as px
from datetime import datetime as dt

from xgboost import XGBClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE

# Read Data

In [2]:
df_credits = pd.read_csv('../data/User Credits Student Access.csv', encoding='utf-8')
df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')

In [3]:
df_payments[(df_payments['user'].isna())&(df_payments['credits']>=500)]
df_payments[(df_payments['user'].isna())&(df_payments['credits']<500)]
df_payments[(df_payments['user'].isna())&(df_payments['credits'].isna())]

df_payments[df_payments['id'].isnull()]

Unnamed: 0,id,created_at,changed_at,user,batch,credits,state,type
1424,,Anna,Prchalová,,1299589028/3030,,,


In [4]:
df_atlas.columns

Index(['user_id', 'all_intro-2022', 'atlas_vzorek-2023', 'atlas_vzorek-2024',
       'all_intro-trendaro_panel_all', 'sex-woman', 'sex-man', 'age-15_17',
       'age-18_24', 'age-25_34',
       ...
       'check-tv', 'check-radio', 'check-household', 'check-overdraft',
       'check-naramek', 'isic2_age-21_26', 'isic2_age-15_20', 'isic_age-15_17',
       'isic_age-18_22', 'isic_age-23_26'],
      dtype='object', length=1121)

In [80]:
df_credits

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
0,STUD66006,25,1,1,0,Peníze
1,STUD22095,51,1,1,0,Peníze
2,STUD77411,0,1,1,1,Peníze
3,STUD56329,0,1,0,0,
4,STUD23516,30,1,1,0,Peníze
...,...,...,...,...,...,...
89906,STUD54678,789,1,1,0,Peníze
89907,STUD43709,0,0,1,0,
89908,STUD21881,0,0,0,0,
89909,STUD72037,16,1,1,0,Lékaři bez hranic


# Negative Values investigation & Chekcing other things

In [5]:
df_c_negative = df_credits[df_credits['credits']<0]
df_credits[df_credits.user.isin(df_c_negative.user) == True]

# IDK what I'm doing with payments
df_payments[df_payments.user.isin(df_c_negative.user)==True].sort_values(['user','created_at'])
df_payments.state.unique()
df_payments[df_payments.user=='STUD54678']
df_credits[df_credits.user=='STUD54678']

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
89906,STUD54678,789,1,1,0,Peníze


In [6]:
df_c_negative # from df_credits

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
15877,STUD26214,-416,1,1,0,Peníze
28959,STUD16141,-10,1,1,0,Peníze
71540,STUD91415,-452,1,1,0,Peníze
72659,STUD99479,-441,1,1,0,Peníze
73458,STUD10440,-4,1,1,0,Peníze


In [7]:
df_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30087 entries, 0 to 30086
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          30086 non-null  float64
 1   created_at  30087 non-null  object 
 2   changed_at  30087 non-null  object 
 3   user        27652 non-null  object 
 4   batch       27502 non-null  object 
 5   credits     30086 non-null  float64
 6   state       30086 non-null  object 
 7   type        30086 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.8+ MB


# Data Cleaning

In [8]:
##### In the dataset we found out that in payments dataset values that are being cancelled has creadits = 0 and NaN user having credit 430 and still being PAID. However, that user doesn't have user_id

df_payments[df_payments['credits']<500].groupby('state').size()

state
CANCELLED    2000
PAID            1
dtype: int64

In [9]:
df_credits_cleaned = df_credits[df_credits['credits']>0]


"""
df_payments_cleaned = df_payments[(df_payments['user'].isna())&(df_payments['credits']<500)]
df_payments_cleaned = df_payments[(df_payments['user'].isna())&(df_payments['credits']>=500)]
df_payments_cleaned[(df_payments['user'].isna())&(df_payments['credits'].isna())]

-> everything cane be done using: df_payments_cleaned = df_payments[df_payments['user'].notna()]

"""

df_payments_cleaned = df_payments[df_payments['user'].notna()] # we threw out from payments 2345 observations


In [10]:
df_payments_cleaned

Unnamed: 0,id,created_at,changed_at,user,batch,credits,state,type
0,30116.0,2025-04-07 10:32:05.073604+00:00,2025-04-07 10:32:05.073633+00:00,STUD83031,,509.0,REQUEST,MONEY
1,30115.0,2025-04-07 10:30:09.304166+00:00,2025-04-07 10:30:09.304196+00:00,STUD356,,538.0,REQUEST,MONEY
2,30114.0,2025-04-07 09:57:04.343935+00:00,2025-04-07 09:57:04.343985+00:00,STUD25955,,542.0,REQUEST,MONEY
3,30113.0,2025-04-07 09:15:39.069868+00:00,2025-04-07 09:15:39.069910+00:00,STUD45618,,501.0,REQUEST,MONEY
4,30112.0,2025-04-07 08:28:47.838506+00:00,2025-04-07 08:28:47.838578+00:00,STUD44458,,504.0,REQUEST,MONEY
...,...,...,...,...,...,...,...,...
30080,36.0,2017-08-11 14:55:18.396493+00:00,2024-01-10 17:31:11.782802+00:00,STUD49275,5,506.0,PAID,MONEY
30081,35.0,2017-08-11 14:54:35.456650+00:00,2024-01-10 17:31:11.801715+00:00,STUD53465,5,523.0,PAID,MONEY
30082,34.0,2017-08-11 14:06:49.193910+00:00,2024-01-10 17:31:11.820777+00:00,STUD29324,5,511.0,PAID,MONEY
30085,31.0,2017-08-10 21:58:35.586238+00:00,2024-01-10 17:31:11.867765+00:00,STUD92308,,511.0,PAID,MEDICINS_SANS_FRONTIERS


# What is core table for merge? - Payments // Can't I just use inner join? Therefore I dont' have to work with nul values - I don't want them in model anyway

In [11]:
#I have an assumption that all users should be in Atlas Cechu. However, after cheking my assumption it tourns out not to be true. 

# Convert the boolean series to a DataFrame with a column name
atlas_in_credits = df_credits_cleaned.user.isin(df_atlas.user_id).rename('User in Atlas')
atlas_in_payments = df_payments_cleaned.user.isin(df_atlas.user_id)

# Plotting
fig_ac = px.bar(atlas_in_credits.value_counts(), title='How many users from Atlas Cechu are in the credits table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})
fig_ap = px.bar(atlas_in_payments.value_counts(), title='How many users from Atlas Cechu are in the payments table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})

fig_ac.show()
fig_ap.show()

In [12]:
df_atlas.user_id.isin(df_credits_cleaned.user).value_counts()

user_id
True     40672
False     2861
Name: count, dtype: int64

In [13]:
df_atlas.user_id.isin(df_payments_cleaned.user).value_counts()

user_id
False    34824
True      8709
Name: count, dtype: int64

Since we want to find out when who pulls what, it makes sense to use the payments table and connect data to it - even though there are users in payments tables that are not in Atlas Cechu

In [88]:
#df_merge_credits = pd.merge(df_payments_cleaned, df_credits_cleaned, how="inner", left_on='user', right_on='user')
#df_merge_credits[df_merge_credits.user=='STUD92308']

"""
I don't find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions
"""

'\nI don\'t find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions\n'

In [14]:
df_merge_full = pd.merge(df_payments_cleaned, df_atlas, how='inner', left_on='user', right_on='user_id')
df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})
df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})


# df_merge_full.to_csv('../data_output/df_merge_full.csv')


df_merge_full['created_at'] = pd.to_datetime(df_merge_full['created_at'])

df_merge_full['day'] = df_merge_full['created_at'].dt.day
df_merge_full['month'] = df_merge_full['created_at'].dt.month
df_merge_full['year'] = df_merge_full['created_at'].dt.year
df_merge_full['hour'] = df_merge_full['created_at'].dt.hour
df_merge_full['weekday'] = df_merge_full['created_at'].dt.weekday


df_merge_full = df_merge_full.drop(columns=['created_at'])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [15]:
df_merge_full

Unnamed: 0,id,changed_at,user,batch,credits,state,user_id,all_intro-2022,atlas_vzorek-2023,atlas_vzorek-2024,...,isic_age-23_26,type_HANDIPET,type_MEDICINS_SANS_FRONTIERS,type_MONEY,type_TRENDARO,day,month,year,hour,weekday
0,30116.0,2025-04-07 10:32:05.073633+00:00,STUD83031,,509.0,REQUEST,STUD83031,1,0,0,...,0,0,0,1,0,7,4,2025,10,0
1,30115.0,2025-04-07 10:30:09.304196+00:00,STUD356,,538.0,REQUEST,STUD356,1,0,1,...,0,0,0,1,0,7,4,2025,10,0
2,30114.0,2025-04-07 09:57:04.343985+00:00,STUD25955,,542.0,REQUEST,STUD25955,1,0,0,...,0,0,0,1,0,7,4,2025,9,0
3,30113.0,2025-04-07 09:15:39.069910+00:00,STUD45618,,501.0,REQUEST,STUD45618,1,0,0,...,0,0,0,1,0,7,4,2025,9,0
4,30112.0,2025-04-07 08:28:47.838578+00:00,STUD44458,,504.0,REQUEST,STUD44458,1,0,0,...,0,0,0,1,0,7,4,2025,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26874,36.0,2024-01-10 17:31:11.782802+00:00,STUD49275,5,506.0,PAID,STUD49275,1,0,0,...,0,0,0,1,0,11,8,2017,14,4
26875,35.0,2024-01-10 17:31:11.801715+00:00,STUD53465,5,523.0,PAID,STUD53465,1,0,0,...,0,0,0,1,0,11,8,2017,14,4
26876,34.0,2024-01-10 17:31:11.820777+00:00,STUD29324,5,511.0,PAID,STUD29324,1,0,1,...,0,0,0,1,0,11,8,2017,14,4
26877,31.0,2024-01-10 17:31:11.867765+00:00,STUD92308,,511.0,PAID,STUD92308,1,1,0,...,0,0,1,0,0,10,8,2017,21,3


In [166]:
# split dataset into three categories
def get_split_values(df_merge_full, value:int):
    col_name = f'credits_{value}+'
    df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user', 'batch', 'state', 'user_id'])
    df_modeling[col_name] = (df_modeling['credits'] > value).astype(int)
    df_modeling = df_modeling.drop(columns=['credits'])

    fig = px.bar(pd.DataFrame(df_modeling[col_name].value_counts().reset_index()), x=col_name, y='count', title='Credits Category Distribution')
    fig.show()
    
    df_modeling = df_modeling[df_modeling[col_name].isnull()==False] # clean data from null values
    return df_modeling, col_name

In [176]:
df_modeling, colname = get_split_values(df_merge_full, 660)
df_modeling


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Unnamed: 0,all_intro-2022,atlas_vzorek-2023,atlas_vzorek-2024,all_intro-trendaro_panel_all,sex-woman,sex-man,age-15_17,age-18_24,age-25_34,age-35_44,...,type_HANDIPET,type_MEDICINS_SANS_FRONTIERS,type_MONEY,type_TRENDARO,day,month,year,hour,weekday,credits_660+
0,1,0,0,1,1,0,0,0,0,1,...,0,0,1,0,7,4,2025,10,0,0
1,1,0,1,1,1,0,0,0,0,0,...,0,0,1,0,7,4,2025,10,0,0
2,1,0,0,1,1,0,0,0,0,1,...,0,0,1,0,7,4,2025,9,0,0
3,1,0,0,1,1,0,0,0,0,0,...,0,0,1,0,7,4,2025,9,0,0
4,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,7,4,2025,8,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26874,1,0,0,1,0,1,0,0,1,0,...,0,0,1,0,11,8,2017,14,4,0
26875,1,0,0,1,0,1,0,0,0,1,...,0,0,1,0,11,8,2017,14,4,0
26876,1,0,1,1,0,1,0,0,0,1,...,0,0,1,0,11,8,2017,14,4,0
26877,1,1,0,1,0,1,0,0,1,0,...,0,1,0,0,10,8,2017,21,3,0


# Modeling

In [178]:
y = df_modeling[colname]
X = df_modeling.drop(columns=[colname])

def split_data(X, y, test_size=0.2, stratify=True, random_state=42):
    """
    Split data into training and testing sets with optional stratification.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target variable.
        test_size (float): Proportion of data to use as test set.
        stratify (bool): Whether to stratify split based on y.
        random_state (int): Seed for reproducibility.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test)
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y if stratify else None,
        random_state=random_state
    )



X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y , train_size=0.8, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



# Grid Search

In [193]:
# catboost clf

grid_cat = {
    'depth':np.array(range(3, 12, 3)),
    'learning_rate': np.array(range(5, 15, 5))/100,
    'loss_function':['CrossEntropy','Logloss'],
}

model_cat = CatBoostClassifier(
    iterations=1500,
    eval_metric='Accuracy',
    verbose=200 # control how often the model will print out output - e.g. when verbose is set to 50 = every 50 iteretion print out current status
)

grid_cat = RandomizedSearchCV(estimator=model_cat, param_distributions=grid_cat, cv=None, scoring='accuracy', n_iter=10, n_jobs=-1)
grid_cat.fit(X_train_smote, y_train_smote)

0:	learn: 0.7188318	total: 193ms	remaining: 4m 48s
0:	learn: 0.7660751	total: 240ms	remaining: 5m 59s
0:	learn: 0.7135947	total: 419ms	remaining: 10m 28s
0:	learn: 0.7115944	total: 422ms	remaining: 10m 32s
0:	learn: 0.7131583	total: 421ms	remaining: 10m 30s
0:	learn: 0.6670425	total: 132ms	remaining: 3m 18s
0:	learn: 0.6508583	total: 250ms	remaining: 6m 15s
0:	learn: 0.6380201	total: 135ms	remaining: 3m 22s
0:	learn: 0.6393294	total: 97.1ms	remaining: 2m 25s
0:	learn: 0.6399113	total: 175ms	remaining: 4m 22s
200:	learn: 0.9194065	total: 17.9s	remaining: 1m 55s
200:	learn: 0.8532150	total: 17s	remaining: 1m 49s
200:	learn: 0.8360489	total: 16.8s	remaining: 1m 48s
200:	learn: 0.8350669	total: 15.8s	remaining: 1m 42s
200:	learn: 0.8363762	total: 15.7s	remaining: 1m 41s
400:	learn: 0.9387547	total: 33.2s	remaining: 1m 31s
400:	learn: 0.8794006	total: 33s	remaining: 1m 30s
400:	learn: 0.8652531	total: 31.4s	remaining: 1m 26s
400:	learn: 0.8641621	total: 31s	remaining: 1m 24s
400:	learn: 0.8


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



400:	learn: 0.8550698	total: 27.1s	remaining: 1m 14s
0:	learn: 0.6900276	total: 116ms	remaining: 2m 53s
400:	learn: 0.8367763	total: 26.5s	remaining: 1m 12s
400:	learn: 0.8356852	total: 25.1s	remaining: 1m 8s
0:	learn: 0.6859543	total: 177ms	remaining: 4m 25s
200:	learn: 0.8082267	total: 11.4s	remaining: 1m 13s
0:	learn: 0.6870090	total: 205ms	remaining: 5m 7s
600:	learn: 0.9319174	total: 37.6s	remaining: 56.2s
600:	learn: 0.8697992	total: 36.2s	remaining: 54.2s
600:	learn: 0.8541970	total: 35.2s	remaining: 52.7s
1499:	learn: 0.9999636	total: 15m 29s	remaining: 0us
600:	learn: 0.8530332	total: 34s	remaining: 50.8s
400:	learn: 0.8372491	total: 19.9s	remaining: 54.6s
0:	learn: 0.6762438	total: 169ms	remaining: 4m 12s
200:	learn: 0.9547571	total: 21.8s	remaining: 2m 20s
800:	learn: 0.9385001	total: 45.6s	remaining: 39.8s
800:	learn: 0.8796552	total: 44s	remaining: 38.4s
800:	learn: 0.8649258	total: 42.8s	remaining: 37.4s
800:	learn: 0.8652168	total: 41.4s	remaining: 36.1s
600:	learn: 0.85

In [185]:
model_cat_test = CatBoostClassifier(
    iterations=1500,
    eval_metric='Accuracy',
    verbose=300, # control how often the model will print out output - e.g. when verbose is set to 50 = every 50 iteretion print out current status
    learning_rate=0.1,
    loss_function='CrossEntropy', 
    depth=10
)

model_cat_test.fit(X_train_smote, y_train_smote, eval_set=(X_test, y_test))

In [187]:
y_pred = model_cat_test.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      4297
           1       0.60      0.30      0.40      1079

    accuracy                           0.82      5376
   macro avg       0.72      0.62      0.65      5376
weighted avg       0.79      0.82      0.79      5376



In [194]:
# XGBClassifier

xgb.set_config(verbosity=1)

param_grid = {
    'n_estimators': np.array(range(100, 2000, 100)),
    'max_depth': np.array(range(1, 20, 1)),
    'learning_rate': np.array(range(5, 25, 5))/100,
    'eval_metric': ['logloss', 'error', 'auc'],
}

model = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

grid_XGB = RandomizedSearchCV(model, param_grid, cv=None, scoring='accuracy', n_iter=10, n_jobs=-1)
grid_XGB.fit(X_train_smote, y_train_smote)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [195]:
##### logistic regression model
model_logit = LogisticRegression()
model_logit.fit(X_train_smote, y_train_smote)

##### Decision Tree
param_grid = {
    'max_depth': np.array(range(1, 20, 1)),
    'criterion': ['gini', 'entropy']
}

model_dt = DecisionTreeClassifier(random_state=42)

grid_dt = RandomizedSearchCV(model_dt, param_grid, cv=None, scoring='accuracy', n_iter=10)
grid_dt.fit(X_train_smote, y_train_smote)

##### Random Forest
param_grid = {
    'n_estimators': np.array(range(100, 2000, 100)),
    'max_depth': np.array(range(1, 20, 1)),
    'criterion': ['gini', 'entropy']
}

model_rf = RandomForestClassifier(random_state=42)

grid_rf = RandomizedSearchCV(model_rf, param_grid, cv=None, scoring='accuracy', n_iter=10)
grid_rf.fit(X_train_smote, y_train_smote)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [196]:
grid_rf.best_params_

{'n_estimators': np.int64(1000),
 'max_depth': np.int64(19),
 'criterion': 'gini'}

In [201]:
best_dt = grid_dt.best_estimator_
best_rf = grid_rf.best_estimator_
best_cat = grid_cat.best_estimator_
best_XGB = grid_XGB.best_estimator_

In [213]:
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

def evaluate_model_separate(list_model, list_model_name, X_test, y_test):
    """
    Evaluates classification models and returns:
    - DataFrame with metrics for class '0' and '1'
    - DataFrame with overall accuracy per model
    - Dictionary with confusion matrices per model

    Parameters:
    - list_model: list of trained model objects
    - list_model_name: list of corresponding model names
    - X_test: test features
    - y_test: test labels

    Returns:
    - df_class: DataFrame with metrics for class '0' and '1'
    - df_accuracy: DataFrame with overall accuracy per model
    - cm_dict: Dictionary of confusion matrices {model_name: matrix}
    """
    class_rows = []
    accuracy_rows = []
    cm_dict = {}

    for model, name in zip(list_model, list_model_name):
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)
        cm_dict[name] = cm  # store confusion matrix in dictionary

        for label, metrics in report.items():
            if label in ['0', '1']:
                class_rows.append({
                    'model': name,
                    'class': label,
                    'precision': metrics['precision'],
                    'recall': metrics['recall'],
                    'f1-score': metrics['f1-score'],
                    'support': metrics['support']
                })
            elif label == 'accuracy':
                accuracy_rows.append({
                    'model': name,
                    'accuracy': metrics
                })

    df_class = pd.DataFrame(class_rows)
    df_accuracy = pd.DataFrame(accuracy_rows)
    return df_class, df_accuracy, cm_dict

In [216]:
list_model = [model_logit, best_dt, best_rf, best_cat, best_XGB]
list_model_name = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'CatBoost', 'XGBoost']

df_class, df_accuracy, cm_dict = evaluate_model_separate(list_model, list_model_name, X_test, y_test)

In [217]:
cm_dict

{'Logistic Regression': array([[3874,  423],
        [ 883,  196]]),
 'Decision Tree': array([[3580,  717],
        [ 626,  453]]),
 'Random Forest': array([[3898,  399],
        [ 591,  488]]),
 'CatBoost': array([[4073,  224],
        [ 759,  320]]),
 'XGBoost': array([[4018,  279],
        [ 690,  389]])}

In [205]:
from sklearn import 

Unnamed: 0,model,accuracy
0,Logistic Regression,0.757068
1,Decision Tree,0.750186
2,Random Forest,0.815848
3,CatBoost,0.81715
4,XGBoost,0.819754


In [None]:
best_