In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
import catboost
from catboost import CatBoostClassifier

In [2]:
# Read the three raw CSV exports (all UTF-8 encoded) from the shared data folder.
df_credits = pd.read_csv(
    '../data/User Credits Student Access.csv',
    encoding='utf-8',
)
df_payments = pd.read_csv(
    '../data/Payments Student Access.csv',
    encoding='utf-8',
)
df_atlas = pd.read_csv(
    '../data/Atlas Cechu Student Access.csv',
    encoding='utf-8',
)

In [3]:
# Keep only payment rows that are attributed to a user.
df_payments_cleaned = df_payments.loc[df_payments['user'].notna()]
# Keep only credit rows with a strictly positive balance.
df_credits_cleaned = df_credits.loc[df_credits['credits'].gt(0)]

In [4]:
# Attach the Atlas row of the matching user to every payment. The inner join
# keeps only payments whose `user` has a matching `user_id` in the Atlas data.
df_merge_full = pd.merge(df_payments_cleaned, df_atlas, how='inner', left_on='user', right_on='user_id')

# DataFrame.rename returns a new frame, so its result must be assigned back;
# the bare call silently did nothing. Labels that are absent from the frame are
# ignored by default, so this is harmless when the merge produced no suffixed
# `credits_x` / `credits_y` columns.
df_merge_full = df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})

# One-hot encode `type` (first level dropped) and store every boolean column
# as 0/1 integers.
df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})

# Parse the timestamp and derive both calendar features in one `assign` call
# instead of inserting columns one at a time. Later keyword arguments can see
# the columns produced by earlier ones, so `month`/`year` read the parsed
# `created_at`.
# NOTE(review): the warning echoed under this cell pointed at the two column
# insertions, presumably pandas' fragmentation PerformanceWarning; confirm it
# no longer appears after this change.
df_merge_full = df_merge_full.assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at']),
    month=lambda frame: frame['created_at'].dt.month,
    year=lambda frame: frame['created_at'].dt.year,
)

  df_merge_full['month'] = df_merge_full['created_at'].dt.month
  df_merge_full['year'] = df_merge_full['created_at'].dt.year


In [5]:
# The raw timestamp is not used as a model feature; only the derived
# `month` / `year` columns are kept.
df_merge_ml = df_merge_full.drop('created_at', axis=1)

In [6]:
# Display the merged frame (payments joined with Atlas, `created_at` removed)
# as this cell's output for a visual sanity check.
df_merge_ml

Unnamed: 0,id,changed_at,user,batch,credits,state,user_id,all_intro-2022,atlas_vzorek-2023,atlas_vzorek-2024,...,isic2_age-15_20,isic_age-15_17,isic_age-18_22,isic_age-23_26,type_HANDIPET,type_MEDICINS_SANS_FRONTIERS,type_MONEY,type_TRENDARO,month,year
0,30116.0,2025-04-07 10:32:05.073633+00:00,STUD83031,,509.0,REQUEST,STUD83031,1,0,0,...,0,0,0,0,0,0,1,0,4,2025
1,30115.0,2025-04-07 10:30:09.304196+00:00,STUD356,,538.0,REQUEST,STUD356,1,0,1,...,0,0,0,0,0,0,1,0,4,2025
2,30114.0,2025-04-07 09:57:04.343985+00:00,STUD25955,,542.0,REQUEST,STUD25955,1,0,0,...,0,0,0,0,0,0,1,0,4,2025
3,30113.0,2025-04-07 09:15:39.069910+00:00,STUD45618,,501.0,REQUEST,STUD45618,1,0,0,...,0,0,0,0,0,0,1,0,4,2025
4,30112.0,2025-04-07 08:28:47.838578+00:00,STUD44458,,504.0,REQUEST,STUD44458,1,0,0,...,0,0,0,0,0,0,1,0,4,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26874,36.0,2024-01-10 17:31:11.782802+00:00,STUD49275,5,506.0,PAID,STUD49275,1,0,0,...,0,0,0,0,0,0,1,0,8,2017
26875,35.0,2024-01-10 17:31:11.801715+00:00,STUD53465,5,523.0,PAID,STUD53465,1,0,0,...,0,0,0,0,0,0,1,0,8,2017
26876,34.0,2024-01-10 17:31:11.820777+00:00,STUD29324,5,511.0,PAID,STUD29324,1,0,1,...,0,0,0,0,0,0,1,0,8,2017
26877,31.0,2024-01-10 17:31:11.867765+00:00,STUD92308,,511.0,PAID,STUD92308,1,1,0,...,0,0,0,0,0,1,0,0,8,2017


In [7]:
def get_split_values(df_merge_full, value:int, show_plot: bool = True):
    """
    Build a modeling frame whose target flags rows with credits above a threshold.

    Args:
        df_merge_full (pd.DataFrame): Input frame. Must contain a ``credits``
            column plus the columns ``id``, ``changed_at``, ``batch``,
            ``state`` and ``user_id``, which are dropped. The input frame
            itself is not modified.
        value (int): Threshold. The target is 1 where ``credits > value`` and
            0 otherwise.
        show_plot (bool): When True (default) a plotly bar chart of the target
            distribution is shown. Pass False to skip plotting.

    Returns:
        Tuple[pd.DataFrame, str]: The modeling frame (``credits`` replaced by
        the binary target as the last column, rows with missing ``credits``
        removed) and the name of the target column, ``f'credits_{value}+'``.

    Raises:
        KeyError: If any of the dropped columns or ``credits`` is missing.
    """
    col_name = f'credits_{value}+'
    df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'batch', 'state', 'user_id'])

    # Remove rows with unknown credits BEFORE thresholding. `NaN > value`
    # evaluates to False, so thresholding first would silently label those rows
    # as 0 and a null check on the resulting integer column could never match.
    df_modeling = df_modeling[df_modeling['credits'].notna()]

    # Replace the raw amount with the binary target in one step so the target
    # ends up as the last column.
    target = (df_modeling['credits'] > value).astype(int)
    df_modeling = df_modeling.drop(columns=['credits']).assign(**{col_name: target})

    if show_plot:
        # value_counts().reset_index() yields the columns [col_name, 'count'].
        class_counts = df_modeling[col_name].value_counts().reset_index()
        fig = px.bar(class_counts, x=col_name, y='count', title='Credits Category Distribution')
        fig.show()

    return df_modeling, col_name

# Build the modeling frame with a 700-credit threshold; `colname` holds the
# name of the resulting binary target column.
df_modeling, colname = get_split_values(df_merge_ml, value=700)

  df_modeling[col_name] = (df_modeling['credits'] > value).astype(int)


In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

# Feature matrix: everything except the target and the `user` identifier.
X = df_modeling.drop(columns=[colname, 'user'])
# Target vector: the binary threshold label.
y = df_modeling.loc[:, colname]

def split_data(X, y, test_size=0.2, stratify=True, random_state=42):
    """
    Partition features and target into a training part and a test part.

    Thin wrapper around ``train_test_split`` that turns a boolean flag into
    the ``stratify`` argument.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target variable.
        test_size (float): Share of the rows assigned to the test set.
        stratify (bool): If True, ``y`` is passed as the stratification
            labels; if False, no stratification is requested.
        random_state (int): Seed forwarded to ``train_test_split``.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test)
    """
    stratify_labels = y if stratify else None
    return train_test_split(X, y, test_size=test_size, stratify=stratify_labels, random_state=random_state)

# Use the split_data helper defined in this cell instead of repeating the
# train_test_split call inline: same 80/20 proportion, stratified on y, seed 42.
# NOTE(review): the split is made per row, while the same `user` occurs in many
# rows and `user` has been dropped from X. Rows of one user can therefore land
# in both train and test, which may inflate the test metrics; consider a
# group-aware split keyed on `user`.
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, stratify=True, random_state=42)

def get_smote_train(X_train, y_train):
    """
    Resample the training data with SMOTE using a fixed seed of 42.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        Tuple: (resampled features, resampled target) as produced by
        ``SMOTE.fit_resample``.
    """
    sampler = SMOTE(random_state=42)
    resampled_X, resampled_y = sampler.fit_resample(X_train, y_train)
    return resampled_X, resampled_y

# Resample the training split only; the test split is left as it is.
X_train_smote, y_train_smote = get_smote_train(X_train=X_train, y_train=y_train)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



In [9]:
from sklearn.ensemble import RandomForestClassifier
# Random forest fitted on the SMOTE-resampled training data.
# random_state is fixed (42, the same seed used for the split and for SMOTE)
# so that re-running the notebook rebuilds the same forest and reproduces the
# reported metrics; without it every run trains a different model.
model_rf_test = RandomForestClassifier(
    n_estimators=250,
    max_depth=15,
    criterion='gini',
    class_weight='balanced',
    random_state=42,
)
model_rf_test.fit(X_train_smote, y_train_smote)

In [10]:
from sklearn.metrics import classification_report
# Score the forest on the held-out test split (which was not resampled).
y_pred_rf = model_rf_test.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      4373
           1       0.50      0.44      0.47      1003

    accuracy                           0.81      5376
   macro avg       0.69      0.67      0.68      5376
weighted avg       0.81      0.81      0.81      5376



In [11]:
# Count, per user, how many distinct target values occur across their rows:
# 1 -> the user's label is consistent, 2 -> the user has both 0 and 1.
# The column is addressed through `colname` (returned by get_split_values)
# rather than a hard-coded 'credits_700+', so the cell keeps working if the
# threshold is changed.
df_modeling.groupby('user')[colname].nunique().value_counts()

credits_700+
1    6968
2    1741
Name: count, dtype: int64

In [12]:
# For each user, take the most frequent value of the target label.
# This collapses users who appear multiple times with conflicting labels
# (both 0 and 1) into a single label per user. On a tie, idxmax picks
# whichever value value_counts lists first.
# The label column is referenced through `colname` instead of a hard-coded
# 'credits_700+', so the cell follows the threshold chosen earlier.
df_target = (
    df_modeling
    .groupby('user')[colname]
    .agg(lambda x: x.value_counts().idxmax())
    .reset_index()
)

# Attach the per-user label to the credit data. The left join keeps every
# credit row; users without any row in df_modeling get NaN as their label.
df_credits_with_target = pd.merge(df_credits, df_target, on='user', how='left')

In [13]:
# Keep only users whose current balance is above 500 CZK, i.e. the ones treated
# here as eligible to withdraw.
# TODO: consider lowering the cutoff to 450, depending on how quickly users can
# increase their credits.
# .copy() makes the subset an independent frame so that columns can be added
# to it later.
eligible_users = df_credits_with_target.loc[df_credits_with_target['credits'].gt(500)].copy()

In [14]:
# Flag who is *likely* to withdraw:
# - label 0: the user's typical withdrawal is at or below the threshold, and
#   the current balance is still at or below it;
# - label 1: the user typically waits until the balance exceeds the threshold,
#   and it currently does.
# The label is defined as `credits > threshold`, so a balance of exactly 700
# belongs to class 0. The first clause therefore uses `<=`; with `<` a balance
# of exactly 700 matched neither clause.
# Users with no label (NaN after the left join) match neither clause and get 0.
withdraw_threshold = 700  # must equal the value passed to get_split_values
eligible_users['possible_withdraw'] = (
    ((eligible_users[colname] == 0) & (eligible_users['credits'] <= withdraw_threshold)) |
    ((eligible_users[colname] == 1) & (eligible_users['credits'] > withdraw_threshold))
).astype(int)

In [15]:
# Select the eligible users that were flagged as possible withdrawals.
likely_to_withdraw = eligible_users.loc[eligible_users['possible_withdraw'].eq(1)]

# Add up their current balances: the amount that might be withdrawn.
total_possible_withdrawal = likely_to_withdraw['credits'].sum()

print(f"💰 Estimated amount that could be withdrawn soon: {total_possible_withdrawal:,.0f} CZK")

💰 Estimated amount that could be withdrawn soon: 948,318 CZK
