In [244]:
import numpy as np
import pandas as pd
import plotly.express as px
from datetime import datetime as dt

from xgboost import XGBClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 

from catboost import CatBoostClassifier


# Read Data

In [181]:
df_credits = pd.read_csv('../data/User Credits Student Access.csv', encoding='utf-8')
df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')

In [150]:
df_payments[(df_payments['user'].isna())&(df_payments['credits']>=500)]
df_payments[(df_payments['user'].isna())&(df_payments['credits']<500)]
df_payments[(df_payments['user'].isna())&(df_payments['credits'].isna())]

df_payments[df_payments['id'].isnull()]

Unnamed: 0,id,created_at,changed_at,user,batch,credits,state,type
1572,28545.0,2025-03-03 14:18:05.497799+00:00,2025-04-04 00:15:42.107009+00:00,,,693.0,APPROVED,MEDICINS_SANS_FRONTIERS
2093,28024.0,2025-02-18 09:46:31.403111+00:00,2025-03-31 00:15:56.283352+00:00,,285,505.0,PAID,MONEY
2740,27377.0,2025-02-02 13:16:34.341371+00:00,2025-03-09 01:18:13.861829+00:00,,281,506.0,PAID,MONEY
3095,27022.0,2025-01-24 12:28:51.076215+00:00,2025-03-22 01:17:47.235645+00:00,,280,505.0,PAID,MONEY
3501,26616.0,2025-01-12 20:28:30.111860+00:00,2025-02-16 01:15:56.035881+00:00,,278,525.0,PAID,MONEY
...,...,...,...,...,...,...,...,...
30072,44.0,2017-08-19 21:56:31.771008+00:00,2024-01-10 17:31:11.673796+00:00,,5,526.0,PAID,MONEY
30074,42.0,2017-08-13 20:27:00.570002+00:00,2024-01-10 17:31:11.696886+00:00,,5,531.0,PAID,MONEY
30079,37.0,2017-08-11 21:11:51.521635+00:00,2024-01-10 17:31:11.778460+00:00,,5,541.0,PAID,MONEY
30083,33.0,2017-08-11 03:05:08.659811+00:00,2024-12-22 01:16:00.780947+00:00,,,536.0,PAID,CHILDHOOD_HOME


In [79]:
df_atlas.columns

Index(['user_id', 'all_intro-2022', 'atlas_vzorek-2023', 'atlas_vzorek-2024',
       'all_intro-trendaro_panel_all', 'sex-woman', 'sex-man', 'age-15_17',
       'age-18_24', 'age-25_34',
       ...
       'check-tv', 'check-radio', 'check-household', 'check-overdraft',
       'check-naramek', 'isic2_age-21_26', 'isic2_age-15_20', 'isic_age-15_17',
       'isic_age-18_22', 'isic_age-23_26'],
      dtype='object', length=1121)

In [80]:
df_credits

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
0,STUD66006,25,1,1,0,Peníze
1,STUD22095,51,1,1,0,Peníze
2,STUD77411,0,1,1,1,Peníze
3,STUD56329,0,1,0,0,
4,STUD23516,30,1,1,0,Peníze
...,...,...,...,...,...,...
89906,STUD54678,789,1,1,0,Peníze
89907,STUD43709,0,0,1,0,
89908,STUD21881,0,0,0,0,
89909,STUD72037,16,1,1,0,Lékaři bez hranic


# Negative Values investigation & Chekcing other things

In [81]:
df_c_negative = df_credits[df_credits['credits']<0]
df_credits[df_credits.user.isin(df_c_negative.user) == True]

# IDK what I'm doing with payments
df_payments[df_payments.user.isin(df_c_negative.user)==True].sort_values(['user','created_at'])
df_payments.state.unique()
df_payments[df_payments.user=='STUD54678']
df_credits[df_credits.user=='STUD54678']

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
89906,STUD54678,789,1,1,0,Peníze


In [133]:
df_c_negative # from df_credits

Unnamed: 0,user,credits,is_active,is_verified,is_locked,wage
15877,STUD26214,-416,1,1,0,Peníze
28959,STUD16141,-10,1,1,0,Peníze
71540,STUD91415,-452,1,1,0,Peníze
72659,STUD99479,-441,1,1,0,Peníze
73458,STUD10440,-4,1,1,0,Peníze


In [82]:
df_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30087 entries, 0 to 30086
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          30086 non-null  float64
 1   created_at  30087 non-null  object 
 2   changed_at  30087 non-null  object 
 3   user        27652 non-null  object 
 4   batch       27502 non-null  object 
 5   credits     30086 non-null  float64
 6   state       30086 non-null  object 
 7   type        30086 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.8+ MB


# Data Cleaning

In [83]:
##### In the dataset we found out that in payments dataset values that are being cancelled has creadits = 0 and NaN user having credit 430 and still being PAID. However, that user doesn't have user_id

df_payments[df_payments['credits']<500].groupby('state').size()

state
CANCELLED    2000
PAID            1
dtype: int64

In [154]:
df_credits_cleaned = df_credits[df_credits['credits']>0]


"""
df_payments_cleaned = df_payments[(df_payments['user'].isna())&(df_payments['credits']<500)]
df_payments_cleaned = df_payments[(df_payments['user'].isna())&(df_payments['credits']>=500)]
df_payments_cleaned[(df_payments['user'].isna())&(df_payments['credits'].isna())]

-> everything cane be done using: df_payments_cleaned = df_payments[df_payments['user'].notna()]

"""

df_payments_cleaned = df_payments[df_payments['user'].notna()] # we threw out from payments 2345 observations


In [185]:
df_payments_cleaned

Unnamed: 0,id,created_at,changed_at,user,batch,credits,state,type
0,30116.0,2025-04-07 10:32:05.073604+00:00,2025-04-07 10:32:05.073633+00:00,STUD83031,,509.0,REQUEST,MONEY
1,30115.0,2025-04-07 10:30:09.304166+00:00,2025-04-07 10:30:09.304196+00:00,STUD356,,538.0,REQUEST,MONEY
2,30114.0,2025-04-07 09:57:04.343935+00:00,2025-04-07 09:57:04.343985+00:00,STUD25955,,542.0,REQUEST,MONEY
3,30113.0,2025-04-07 09:15:39.069868+00:00,2025-04-07 09:15:39.069910+00:00,STUD45618,,501.0,REQUEST,MONEY
4,30112.0,2025-04-07 08:28:47.838506+00:00,2025-04-07 08:28:47.838578+00:00,STUD44458,,504.0,REQUEST,MONEY
...,...,...,...,...,...,...,...,...
30080,36.0,2017-08-11 14:55:18.396493+00:00,2024-01-10 17:31:11.782802+00:00,STUD49275,5,506.0,PAID,MONEY
30081,35.0,2017-08-11 14:54:35.456650+00:00,2024-01-10 17:31:11.801715+00:00,STUD53465,5,523.0,PAID,MONEY
30082,34.0,2017-08-11 14:06:49.193910+00:00,2024-01-10 17:31:11.820777+00:00,STUD29324,5,511.0,PAID,MONEY
30085,31.0,2017-08-10 21:58:35.586238+00:00,2024-01-10 17:31:11.867765+00:00,STUD92308,,511.0,PAID,MEDICINS_SANS_FRONTIERS


# What is core table for merge? - Payments // Can't I just use inner join? Therefore I dont' have to work with nul values - I don't want them in model anyway

In [85]:
#I have an assumption that all users should be in Atlas Cechu. However, after cheking my assumption it tourns out not to be true. 

# Convert the boolean series to a DataFrame with a column name
atlas_in_credits = df_credits_cleaned.user.isin(df_atlas.user_id).rename('User in Atlas')
atlas_in_payments = df_payments_cleaned.user.isin(df_atlas.user_id)

# Plotting
fig_ac = px.bar(atlas_in_credits.value_counts(), title='How many users from Atlas Cechu are in the credits table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})
fig_ap = px.bar(atlas_in_payments.value_counts(), title='How many users from Atlas Cechu are in the payments table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})

fig_ac.show()
fig_ap.show()

In [86]:
df_atlas.user_id.isin(df_credits_cleaned.user).value_counts()

user_id
True     40672
False     2861
Name: count, dtype: int64

In [87]:
df_atlas.user_id.isin(df_payments_cleaned.user).value_counts()

user_id
False    34824
True      8709
Name: count, dtype: int64

Since we want to find out when who pulls what, it makes sense to use the payments table and connect data to it - even though there are users in payments tables that are not in Atlas Cechu

In [88]:
#df_merge_credits = pd.merge(df_payments_cleaned, df_credits_cleaned, how="inner", left_on='user', right_on='user')
#df_merge_credits[df_merge_credits.user=='STUD92308']

"""
I don't find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions
"""

'\nI don\'t find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions\n'

In [186]:
df_merge_full = pd.merge(df_payments_cleaned, df_atlas, how='inner', left_on='user', right_on='user_id')
df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})
df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})


# df_merge_full.to_csv('../data_output/df_merge_full.csv')


df_merge_full['created_at'] = pd.to_datetime(df_merge_full['created_at'])

df_merge_full['day'] = df_merge_full['created_at'].dt.day
df_merge_full['month'] = df_merge_full['created_at'].dt.month
df_merge_full['year'] = df_merge_full['created_at'].dt.year
df_merge_full['hour'] = df_merge_full['created_at'].dt.hour
df_merge_full['weekday'] = df_merge_full['created_at'].dt.weekday


df_merge_full = df_merge_full.drop(columns=['created_at'])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [187]:
df_merge_full

Unnamed: 0,id,changed_at,user,batch,credits,state,user_id,all_intro-2022,atlas_vzorek-2023,atlas_vzorek-2024,...,isic_age-23_26,type_HANDIPET,type_MEDICINS_SANS_FRONTIERS,type_MONEY,type_TRENDARO,day,month,year,hour,weekday
0,30116.0,2025-04-07 10:32:05.073633+00:00,STUD83031,,509.0,REQUEST,STUD83031,1,0,0,...,0,0,0,1,0,7,4,2025,10,0
1,30115.0,2025-04-07 10:30:09.304196+00:00,STUD356,,538.0,REQUEST,STUD356,1,0,1,...,0,0,0,1,0,7,4,2025,10,0
2,30114.0,2025-04-07 09:57:04.343985+00:00,STUD25955,,542.0,REQUEST,STUD25955,1,0,0,...,0,0,0,1,0,7,4,2025,9,0
3,30113.0,2025-04-07 09:15:39.069910+00:00,STUD45618,,501.0,REQUEST,STUD45618,1,0,0,...,0,0,0,1,0,7,4,2025,9,0
4,30112.0,2025-04-07 08:28:47.838578+00:00,STUD44458,,504.0,REQUEST,STUD44458,1,0,0,...,0,0,0,1,0,7,4,2025,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26874,36.0,2024-01-10 17:31:11.782802+00:00,STUD49275,5,506.0,PAID,STUD49275,1,0,0,...,0,0,0,1,0,11,8,2017,14,4
26875,35.0,2024-01-10 17:31:11.801715+00:00,STUD53465,5,523.0,PAID,STUD53465,1,0,0,...,0,0,0,1,0,11,8,2017,14,4
26876,34.0,2024-01-10 17:31:11.820777+00:00,STUD29324,5,511.0,PAID,STUD29324,1,0,1,...,0,0,0,1,0,11,8,2017,14,4
26877,31.0,2024-01-10 17:31:11.867765+00:00,STUD92308,,511.0,PAID,STUD92308,1,1,0,...,0,0,1,0,0,10,8,2017,21,3


In [167]:
# split dataset into three categories

df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user', 'batch', 'state', 'user_id'])
df_modeling['credits_601+'] = (df_modeling['credits'] > 600).astype(int)
df_modeling = df_modeling.drop(columns=['credits'])

fig = px.bar(pd.DataFrame(df_modeling['credits_601+'].value_counts().reset_index()), x='credits_601+', y='count', title='Credits Category Distribution')
fig.show()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [191]:
df_modeling['credits_601+'].unique()

array([0, 1])

In [95]:
df_modeling = df_modeling[df_modeling['credits_category'].isnull()==False] # clean data from null values

# Modeling

In [209]:
y = df_modeling['credits_601+']
X = df_modeling.drop(columns=['credits_601+'])

def split_data(X, y, test_size=0.2, stratify=True, random_state=42):
    """
    Split data into training and testing sets with optional stratification.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target variable.
        test_size (float): Proportion of data to use as test set.
        stratify (bool): Whether to stratify split based on y.
        random_state (int): Seed for reproducibility.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test)
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y if stratify else None,
        random_state=random_state
    )



X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y , train_size=0.8, random_state=42)

# Grid Search

In [245]:
# catboost clf

grid_cat = {
    'depth':[3,6,9],
    'learning_rate': np.array(range(5, 25, 5))/100,
    'loss_function':['CrossEntropy','Logloss'],
}

model_cat = CatBoostClassifier(
    iterations=500,
    eval_metric='Accuracy',
    verbose=100 # control how often the model will print out output - e.g. when verbose is set to 50 = every 50 iteretion print out current status
)

grid_cat = GridSearchCV(estimator=model_cat, param_grid=grid_cat, cv=3, scoring='accuracy')
grid_cat.fit(X_train, y_train)

#model.fit(X_train, y_train, eval_set=(X_test, y_test))
print(f'best params: {grid_cat.best_params_}')
print(f'best accuracy: {grid_cat.best_score_}')

0:	learn: 0.7479899	total: 22.9ms	remaining: 11.4s
100:	learn: 0.7480643	total: 468ms	remaining: 1.85s
200:	learn: 0.7514145	total: 920ms	remaining: 1.37s
300:	learn: 0.7550625	total: 1.4s	remaining: 924ms
400:	learn: 0.7612418	total: 1.81s	remaining: 446ms
499:	learn: 0.7681656	total: 2.21s	remaining: 0us
0:	learn: 0.7479342	total: 7.02ms	remaining: 3.5s
100:	learn: 0.7479342	total: 424ms	remaining: 1.68s
200:	learn: 0.7486786	total: 829ms	remaining: 1.23s
300:	learn: 0.7529964	total: 1.33s	remaining: 880ms
400:	learn: 0.7583563	total: 1.73s	remaining: 428ms
499:	learn: 0.7629718	total: 2.14s	remaining: 0us
0:	learn: 0.7480086	total: 7.16ms	remaining: 3.57s
100:	learn: 0.7480086	total: 418ms	remaining: 1.65s
200:	learn: 0.7499442	total: 805ms	remaining: 1.2s
300:	learn: 0.7552297	total: 1.2s	remaining: 796ms
400:	learn: 0.7602918	total: 1.58s	remaining: 391ms
499:	learn: 0.7666195	total: 1.99s	remaining: 0us
0:	learn: 0.7479899	total: 4.01ms	remaining: 2s
100:	learn: 0.7480643	total: 

In [195]:
# CatBoost
model_cat = CatBoostClassifier(
    iterations=1000,
    eval_metric='Accuracy',
    verbose=200,
    depth=7,
    learning_rate=0.1,
    loss_function='CrossEntropy'
)
model_cat.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 0.7487220	test: 0.7481143	best: 0.7481143 (0)	total: 27.4ms	remaining: 27.3s
200:	learn: 0.8483299	test: 0.7814609	best: 0.7814609 (200)	total: 5.61s	remaining: 22.3s
400:	learn: 0.9163730	test: 0.7888051	best: 0.7911870 (376)	total: 11s	remaining: 16.5s
600:	learn: 0.9591047	test: 0.7941644	best: 0.7943628 (596)	total: 16.3s	remaining: 10.8s
800:	learn: 0.9820338	test: 0.7929734	best: 0.7963478 (697)	total: 21.9s	remaining: 5.43s
999:	learn: 0.9927044	test: 0.7939659	best: 0.7963478 (697)	total: 27.1s	remaining: 0us

bestTest = 0.796347757
bestIteration = 697

Shrink model to first 698 iterations.


<catboost.core.CatBoostClassifier at 0x32038f850>

In [243]:
# XGBClassifier

xgb.set_config(verbosity=1)

param_grid = {
    'n_estimators': [500, 1000, 1500],
    'max_depth': [3, 9, 15],
    'learning_rate': [0.05, 0.1, 0.15],
    'eval_metric': ['logloss', 'error', 'auc'],
}

model = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

grid_search = RandomizedSearchCV(model, param_grid, cv=3, scoring='accuracy', n_iter=60)
grid_search.fit(X_train, y_train)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_:.4f}')

# ✅ Predict and evaluate using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

NameError: name 'RandomizedSearchCV' is not defined

In [210]:
# logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

#Accuracy: 0.7457324335053592

              precision    recall  f1-score   support

           0       0.76      0.97      0.85      3768
           1       0.48      0.09      0.15      1270

    accuracy                           0.75      5038
   macro avg       0.62      0.53      0.50      5038
weighted avg       0.69      0.75      0.67      5038

Accuracy: 0.7457324335053592



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [237]:

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'criterion': ['gini', 'entropy']
}

# Define the base model
model_dt = DecisionTreeClassifier(random_state=42)

# Run grid search
grid_search = GridSearchCV(model_dt, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get best model and predict
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test)

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.81      0.81      3768
           1       0.45      0.46      0.45      1270

    accuracy                           0.72      5038
   macro avg       0.63      0.63      0.63      5038
weighted avg       0.72      0.72      0.72      5038

Accuracy: 0.7213179833267169
Node Count: 6473
Max Depth: 30


In [235]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'criterion': ['gini', 'entropy']
}

# Base model
model_rf = RandomForestClassifier(random_state=42)

# Grid search
grid_search = GridSearchCV(model_rf, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Use the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Best Parameters:", grid_search.best_params_)

              precision    recall  f1-score   support

           0       0.82      0.91      0.87      3768
           1       0.61      0.42      0.50      1270

    accuracy                           0.79      5038
   macro avg       0.72      0.67      0.68      5038
weighted avg       0.77      0.79      0.77      5038

Accuracy: 0.7874156411274316
