In [113]:
import numpy as np
import pandas as pd
import plotly.express as px
from datetime import datetime as dt

from xgboost import XGBClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE

# Read Data

In [2]:
df_credits = pd.read_csv('../data/User Credits Student Access.csv', encoding='utf-8')
df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')

In [3]:
df_payments[(df_payments['user'].isna())&(df_payments['credits']>=500)]
df_payments[(df_payments['user'].isna())&(df_payments['credits']<500)]
df_payments[(df_payments['user'].isna())&(df_payments['credits'].isna())]

df_payments[df_payments['id'].isnull()]

Unnamed: 0,id,created_at,changed_at,user,batch,credits,state,type
1424,,Anna,Prchalová,,1299589028/3030,,,


In [4]:
df_atlas.columns

Index(['user_id', 'all_intro-2022', 'atlas_vzorek-2023', 'atlas_vzorek-2024',
       'all_intro-trendaro_panel_all', 'sex-woman', 'sex-man', 'age-15_17',
       'age-18_24', 'age-25_34',
       ...
       'check-tv', 'check-radio', 'check-household', 'check-overdraft',
       'check-naramek', 'isic2_age-21_26', 'isic2_age-15_20', 'isic_age-15_17',
       'isic_age-18_22', 'isic_age-23_26'],
      dtype='object', length=1121)

In [80]:
df_credits

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
0,STUD66006,25,1,1,0,Peníze
1,STUD22095,51,1,1,0,Peníze
2,STUD77411,0,1,1,1,Peníze
3,STUD56329,0,1,0,0,
4,STUD23516,30,1,1,0,Peníze
...,...,...,...,...,...,...
89906,STUD54678,789,1,1,0,Peníze
89907,STUD43709,0,0,1,0,
89908,STUD21881,0,0,0,0,
89909,STUD72037,16,1,1,0,Lékaři bez hranic


# Negative Values investigation & Chekcing other things

In [5]:
df_c_negative = df_credits[df_credits['credits']<0]
df_credits[df_credits.user.isin(df_c_negative.user) == True]

# IDK what I'm doing with payments
df_payments[df_payments.user.isin(df_c_negative.user)==True].sort_values(['user','created_at'])
df_payments.state.unique()
df_payments[df_payments.user=='STUD54678']
df_credits[df_credits.user=='STUD54678']

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
89906,STUD54678,789,1,1,0,Peníze


In [6]:
df_c_negative # from df_credits

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
15877,STUD26214,-416,1,1,0,Peníze
28959,STUD16141,-10,1,1,0,Peníze
71540,STUD91415,-452,1,1,0,Peníze
72659,STUD99479,-441,1,1,0,Peníze
73458,STUD10440,-4,1,1,0,Peníze


In [7]:
df_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30087 entries, 0 to 30086
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          30086 non-null  float64
 1   created_at  30087 non-null  object 
 2   changed_at  30087 non-null  object 
 3   user        27652 non-null  object 
 4   batch       27502 non-null  object 
 5   credits     30086 non-null  float64
 6   state       30086 non-null  object 
 7   type        30086 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.8+ MB


# Data Cleaning

In [8]:
##### In the dataset we found out that in payments dataset values that are being cancelled has creadits = 0 and NaN user having credit 430 and still being PAID. However, that user doesn't have user_id

df_payments[df_payments['credits']<500].groupby('state').size()

state
CANCELLED    2000
PAID            1
dtype: int64

In [9]:
df_credits_cleaned = df_credits[df_credits['credits']>0]


"""
df_payments_cleaned = df_payments[(df_payments['user'].isna())&(df_payments['credits']<500)]
df_payments_cleaned = df_payments[(df_payments['user'].isna())&(df_payments['credits']>=500)]
df_payments_cleaned[(df_payments['user'].isna())&(df_payments['credits'].isna())]

-> everything cane be done using: df_payments_cleaned = df_payments[df_payments['user'].notna()]

"""

df_payments_cleaned = df_payments[df_payments['user'].notna()] # we threw out from payments 2345 observations


In [10]:
df_payments_cleaned

Unnamed: 0,id,created_at,changed_at,user,batch,credits,state,type
0,30116.0,2025-04-07 10:32:05.073604+00:00,2025-04-07 10:32:05.073633+00:00,STUD83031,,509.0,REQUEST,MONEY
1,30115.0,2025-04-07 10:30:09.304166+00:00,2025-04-07 10:30:09.304196+00:00,STUD356,,538.0,REQUEST,MONEY
2,30114.0,2025-04-07 09:57:04.343935+00:00,2025-04-07 09:57:04.343985+00:00,STUD25955,,542.0,REQUEST,MONEY
3,30113.0,2025-04-07 09:15:39.069868+00:00,2025-04-07 09:15:39.069910+00:00,STUD45618,,501.0,REQUEST,MONEY
4,30112.0,2025-04-07 08:28:47.838506+00:00,2025-04-07 08:28:47.838578+00:00,STUD44458,,504.0,REQUEST,MONEY
...,...,...,...,...,...,...,...,...
30080,36.0,2017-08-11 14:55:18.396493+00:00,2024-01-10 17:31:11.782802+00:00,STUD49275,5,506.0,PAID,MONEY
30081,35.0,2017-08-11 14:54:35.456650+00:00,2024-01-10 17:31:11.801715+00:00,STUD53465,5,523.0,PAID,MONEY
30082,34.0,2017-08-11 14:06:49.193910+00:00,2024-01-10 17:31:11.820777+00:00,STUD29324,5,511.0,PAID,MONEY
30085,31.0,2017-08-10 21:58:35.586238+00:00,2024-01-10 17:31:11.867765+00:00,STUD92308,,511.0,PAID,MEDICINS_SANS_FRONTIERS


# What is core table for merge? - Payments // Can't I just use inner join? Therefore I dont' have to work with nul values - I don't want them in model anyway

In [11]:
#I have an assumption that all users should be in Atlas Cechu. However, after cheking my assumption it tourns out not to be true. 

# Convert the boolean series to a DataFrame with a column name
atlas_in_credits = df_credits_cleaned.user.isin(df_atlas.user_id).rename('User in Atlas')
atlas_in_payments = df_payments_cleaned.user.isin(df_atlas.user_id)

# Plotting
fig_ac = px.bar(atlas_in_credits.value_counts(), title='How many users from Atlas Cechu are in the credits table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})
fig_ap = px.bar(atlas_in_payments.value_counts(), title='How many users from Atlas Cechu are in the payments table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})

fig_ac.show()
fig_ap.show()

In [12]:
df_atlas.user_id.isin(df_credits_cleaned.user).value_counts()

user_id
True     40672
False     2861
Name: count, dtype: int64

In [13]:
df_atlas.user_id.isin(df_payments_cleaned.user).value_counts()

user_id
False    34824
True      8709
Name: count, dtype: int64

Since we want to find out when who pulls what, it makes sense to use the payments table and connect data to it - even though there are users in payments tables that are not in Atlas Cechu

In [88]:
#df_merge_credits = pd.merge(df_payments_cleaned, df_credits_cleaned, how="inner", left_on='user', right_on='user')
#df_merge_credits[df_merge_credits.user=='STUD92308']

"""
I don't find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions
"""

'\nI don\'t find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions\n'

In [14]:
df_merge_full = pd.merge(df_payments_cleaned, df_atlas, how='inner', left_on='user', right_on='user_id')
df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})
df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})


# df_merge_full.to_csv('../data_output/df_merge_full.csv')


df_merge_full['created_at'] = pd.to_datetime(df_merge_full['created_at'])

df_merge_full['day'] = df_merge_full['created_at'].dt.day
df_merge_full['month'] = df_merge_full['created_at'].dt.month
df_merge_full['year'] = df_merge_full['created_at'].dt.year
df_merge_full['hour'] = df_merge_full['created_at'].dt.hour
df_merge_full['weekday'] = df_merge_full['created_at'].dt.weekday


df_merge_full = df_merge_full.drop(columns=['created_at'])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [15]:
df_merge_full

Unnamed: 0,id,changed_at,user,batch,credits,state,user_id,all_intro-2022,atlas_vzorek-2023,atlas_vzorek-2024,...,isic_age-23_26,type_HANDIPET,type_MEDICINS_SANS_FRONTIERS,type_MONEY,type_TRENDARO,day,month,year,hour,weekday
0,30116.0,2025-04-07 10:32:05.073633+00:00,STUD83031,,509.0,REQUEST,STUD83031,1,0,0,...,0,0,0,1,0,7,4,2025,10,0
1,30115.0,2025-04-07 10:30:09.304196+00:00,STUD356,,538.0,REQUEST,STUD356,1,0,1,...,0,0,0,1,0,7,4,2025,10,0
2,30114.0,2025-04-07 09:57:04.343985+00:00,STUD25955,,542.0,REQUEST,STUD25955,1,0,0,...,0,0,0,1,0,7,4,2025,9,0
3,30113.0,2025-04-07 09:15:39.069910+00:00,STUD45618,,501.0,REQUEST,STUD45618,1,0,0,...,0,0,0,1,0,7,4,2025,9,0
4,30112.0,2025-04-07 08:28:47.838578+00:00,STUD44458,,504.0,REQUEST,STUD44458,1,0,0,...,0,0,0,1,0,7,4,2025,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26874,36.0,2024-01-10 17:31:11.782802+00:00,STUD49275,5,506.0,PAID,STUD49275,1,0,0,...,0,0,0,1,0,11,8,2017,14,4
26875,35.0,2024-01-10 17:31:11.801715+00:00,STUD53465,5,523.0,PAID,STUD53465,1,0,0,...,0,0,0,1,0,11,8,2017,14,4
26876,34.0,2024-01-10 17:31:11.820777+00:00,STUD29324,5,511.0,PAID,STUD29324,1,0,1,...,0,0,0,1,0,11,8,2017,14,4
26877,31.0,2024-01-10 17:31:11.867765+00:00,STUD92308,,511.0,PAID,STUD92308,1,1,0,...,0,0,1,0,0,10,8,2017,21,3


In [16]:
# split dataset into three categories

df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user', 'batch', 'state', 'user_id'])
df_modeling['credits_601+'] = (df_modeling['credits'] > 600).astype(int)
df_modeling = df_modeling.drop(columns=['credits'])

fig = px.bar(pd.DataFrame(df_modeling['credits_601+'].value_counts().reset_index()), x='credits_601+', y='count', title='Credits Category Distribution')
fig.show()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [17]:
df_modeling['credits_601+'].unique()

array([0, 1])

In [20]:
df_modeling = df_modeling[df_modeling['credits_601+'].isnull()==False] # clean data from null values

# Modeling

In [21]:
y = df_modeling['credits_601+']
X = df_modeling.drop(columns=['credits_601+'])

def split_data(X, y, test_size=0.2, stratify=True, random_state=42):
    """
    Split data into training and testing sets with optional stratification.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target variable.
        test_size (float): Proportion of data to use as test set.
        stratify (bool): Whether to stratify split based on y.
        random_state (int): Seed for reproducibility.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test)
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y if stratify else None,
        random_state=random_state
    )



X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y , train_size=0.8, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Grid Search

In [26]:
# catboost clf

grid_cat = {
    'depth':np.array(range(1, 20, 1)),
    'learning_rate': np.array(range(5, 25, 5))/100,
    'loss_function':['CrossEntropy','Logloss'],
}

model_cat = CatBoostClassifier(
    iterations=500,
    eval_metric='Accuracy',
    verbose=100 # control how often the model will print out output - e.g. when verbose is set to 50 = every 50 iteretion print out current status
)

grid_cat = RandomizedSearchCV(estimator=model_cat, param_distributions=grid_cat, cv=3, scoring='accuracy', n_iter=15, n_jobs=-1)
grid_cat.fit(X_train_smote, y_train_smote)

#model.fit(X_train, y_train, eval_set=(X_test, y_test))
print(f'best params: {grid_cat.best_params_}')
print(f'best accuracy: {grid_cat.best_score_}')

0:	learn: 0.7880712	total: 1.4s	remaining: 11m 39s
100:	learn: 0.9771189	total: 1m 47s	remaining: 7m 4s
200:	learn: 0.9988141	total: 3m 34s	remaining: 5m 18s
300:	learn: 1.0000000	total: 5m 22s	remaining: 3m 33s
400:	learn: 1.0000000	total: 7m 6s	remaining: 1m 45s
499:	learn: 1.0000000	total: 8m 46s	remaining: 0us
0:	learn: 0.7750262	total: 1.01s	remaining: 8m 25s
100:	learn: 0.9761423	total: 1m 40s	remaining: 6m 35s
200:	learn: 0.9995814	total: 3m 21s	remaining: 4m 59s
300:	learn: 1.0000000	total: 5m	remaining: 3m 18s
400:	learn: 1.0000000	total: 6m 40s	remaining: 1m 38s
499:	learn: 1.0000000	total: 8m 18s	remaining: 0us
0:	learn: 0.7638114	total: 9.61ms	remaining: 4.8s
100:	learn: 0.9777483	total: 9.61ms	remaining: 4.8s
200:	learn: 0.9995117	total: 9.61ms	remaining: 4.8s
300:	learn: 0.9999302	total: 9.61ms	remaining: 4.8s
400:	learn: 0.9999302	total: 12.2s	remaining: 1m 26s
499:	learn: 0.9999302	total: 1m 44s	remaining: 0us
0:	learn: 0.7642135	total: 12.3ms	remaining: 6.13s
100:	lear



9 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/vanhieuvu/Documents/school/Magisterský studium/2.semestr/datovy_projekt/dp_env/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/vanhieuvu/Documents/school/Magisterský studium/2.semestr/datovy_projekt/dp_env/lib/python3.9/site-packages/catboost/core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/Users/vanhieuvu/Documents/scho

0:	learn: 0.7642655	total: 13.9ms	remaining: 6.94s
100:	learn: 0.7816119	total: 1.4s	remaining: 5.53s
200:	learn: 0.8228154	total: 2.7s	remaining: 4.02s
300:	learn: 0.8495559	total: 4.04s	remaining: 2.67s
400:	learn: 0.8731805	total: 5.34s	remaining: 1.32s
499:	learn: 0.8911780	total: 6.64s	remaining: 0us
best params: {'loss_function': 'Logloss', 'learning_rate': np.float64(0.1), 'depth': np.int64(6)}
best accuracy: 0.7901688322117079


In [None]:
# XGBClassifier

xgb.set_config(verbosity=1)

param_grid = {
    'n_estimators': np.array(range(100, 2000, 100)),
    'max_depth': np.array(range(1, 20, 1)),
    'learning_rate': np.array(range(5, 25, 5))/100,
    'eval_metric': ['logloss', 'error', 'auc'],
}

model = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

grid_XGB = RandomizedSearchCV(model, param_grid, cv=3, scoring='accuracy', n_iter=20, n_jobs=-1)
grid_XGB.fit(X_train_smote, y_train_smote)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [78]:
##### logistic regression model
model_logit = LogisticRegression()
model_logit.fit(X_train_smote, y_train_smote)

##### Decision Tree
param_grid = {
    'max_depth': np.array(range(1, 20, 1)),
    'criterion': ['gini', 'entropy']
}

model_dt = DecisionTreeClassifier(random_state=42)

grid_dt = RandomizedSearchCV(model_dt, param_grid, cv=3, scoring='accuracy', n_iter=20)
grid_dt.fit(X_train_smote, y_train_smote)

##### Random Forest
param_grid = {
    'n_estimators': np.array(range(100, 2000, 100)),
    'max_depth': np.array(range(1, 20, 1)),
    'criterion': ['gini', 'entropy']
}

model_rf = RandomForestClassifier(random_state=42)

grid_rf = RandomizedSearchCV(model_rf, param_grid, cv=3, scoring='accuracy', n_iter=20)
grid_rf.fit(X_train_smote, y_train_smote)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [79]:
best_dt = grid_dt.best_estimator_
best_rf = grid_rf.best_estimator_
best_cat = grid_cat.best_estimator_
best_XGB = grid_XGB.best_estimator_

In [105]:
def evaluate_model_separate(list_model, list_model_name, X_test, y_test):
    """
    Evaluates classification models and returns two separate DataFrames:
    one with class-level metrics ('0' and '1'), and another with overall accuracy.

    Parameters:
    - list_model: list of trained model objects
    - list_model_name: list of corresponding model names
    - X_test: test features
    - y_test: test labels

    Returns:
    - df_class: DataFrame with metrics for class '0' and '1'
    - df_accuracy: DataFrame with overall accuracy per model
    """
    class_rows = []
    accuracy_rows = []

    for model, name in zip(list_model, list_model_name):
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)

        for label, metrics in report.items():
            if label in ['0', '1']:
                class_rows.append({
                    'model': name,
                    'class': label,
                    'precision': metrics['precision'],
                    'recall': metrics['recall'],
                    'f1-score': metrics['f1-score'],
                    'support': metrics['support']
                })
            elif label == 'accuracy':
                accuracy_rows.append({
                    'model': name,
                    'accuracy': metrics
                })

    df_class = pd.DataFrame(class_rows)
    df_accuracy = pd.DataFrame(accuracy_rows)
    return df_class, df_accuracy

In [None]:
list_model = [model_logit, best_dt, best_rf, best_cat, best_XGB]
list_model_name = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'CatBoost', 'XGBoost']

df_class, df_accuracy = evaluate_model_separate(list_model, list_model_name, X_test, y_test)


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [108]:
df_class

Unnamed: 0,model,class,precision,recall,f1-score,support
0,Logistic Regression,0,0.766773,0.993668,0.865599,4106.0
1,Logistic Regression,1,0.527273,0.022835,0.043774,1270.0
2,Decision Tree,0,0.763765,1.0,0.866062,4106.0
3,Decision Tree,1,0.0,0.0,0.0,1270.0
4,Random Forest,0,0.81601,0.948368,0.877225,4106.0
5,Random Forest,1,0.649007,0.308661,0.418356,1270.0
6,CatBoost,0,0.813665,0.957136,0.879588,4106.0
7,CatBoost,1,0.677656,0.291339,0.407489,1270.0
8,XGBoost,0,0.820535,0.934243,0.873705,4106.0
9,XGBoost,1,0.614836,0.33937,0.437341,1270.0
