In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
import catboost
from catboost import CatBoostClassifier
from collections import defaultdict
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

In [2]:
df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')
df_credits = pd.read_csv('../data/User Credits Student Access.csv', encoding='utf-8')

#Cleared discrepancies in the data
df_credits_cleaned = df_credits[df_credits['credits']>0]
df_payments_cleaned = df_payments[df_payments['user'].notna()]

grouped_cols = defaultdict(dict)

for col in df_atlas.columns:
    if "-" in col:
        group, key = col.split('-', 1)
        grouped_cols[group][key] = col
    else:
        grouped_cols[col][col] = col

structured_data = []
for _, row in df_atlas.iterrows():
    entry = {}
    for group, mapping in grouped_cols.items():
        entry[group] = [key for key, col in mapping.items() if row[col] == 1]
    structured_data.append(entry)

df_atlas_numeric_values = pd.DataFrame(structured_data)
df_atlas_numeric_values = df_atlas_numeric_values.drop(columns=['user_id'])

df_atlas_nv = pd.concat([df_atlas['user_id'], df_atlas_numeric_values], axis=1, join='inner')

mapping_dicts = {}

for col in df_atlas_nv.columns:
    if col == 'user_id':
        continue

    # Convert lists to tuples to make them hashable (dict keys)
    unique_lists = df_atlas_nv[col].apply(lambda x: tuple(sorted(x))).unique()
    
    # Create mapping: list → ID
    mapping_dicts[col] = {lst: idx + 1 for idx, lst in enumerate(unique_lists)}

    # Apply mapping
    df_atlas_nv[col] = df_atlas_nv[col].apply(lambda x: mapping_dicts[col][tuple(sorted(x))])

In [3]:
df_credits['user'] = df_credits['user'].astype(str)
df_payments['user'] = df_payments['user'].astype(str)
df_atlas['user_id'] = df_atlas['user_id'].astype(str)
df_atlas_nv['user_id'] = df_atlas_nv['user_id'].astype(str)

In [4]:
withdrawals = df_payments[
    (df_payments['credits'] >= 500) & 
    (df_payments['state'].isin(['PAID', 'APPROVED']))
]

withdrawal_stats = withdrawals.groupby('user').agg(
    num_withdrawals=('credits', 'count'),
    avg_withdrawal=('credits', 'mean'),
    total_withdrawn=('credits', 'sum')
).reset_index()

#Real-time users, which have above 500 credits (potentially can withdraw the money)
eligible_now = df_credits[df_credits['credits'] >= 500].copy()
eligible_now = eligible_now.merge(withdrawal_stats, on='user', how='left')

eligible_now[['num_withdrawals', 'avg_withdrawal', 'total_withdrawn']] = eligible_now[
    ['num_withdrawals', 'avg_withdrawal', 'total_withdrawn']
].fillna(0)

In [5]:
def assign_behavior(row):
    if row['num_withdrawals'] == 0:
        return 'new'
    elif row['num_withdrawals'] <= 2:
        return 'occasional'
    else:
        return 'regular'

eligible_now['withdrawal_segment'] = eligible_now.apply(assign_behavior, axis=1)
segment_mapping = {'new': 0, 'occasional': 1, 'regular': 2}
eligible_now['withdrawal_segment_code'] = eligible_now['withdrawal_segment'].map(segment_mapping)

eligible_now.drop(columns=['withdrawal_segment', 'wage'], inplace=True, errors='ignore')

In [6]:
def likely_to_withdraw_now(row):
    if row['num_withdrawals'] == 0:
        return 0
    expected_threshold = max(0.9 * row['avg_withdrawal'], 450)
    return int(row['credits'] >= expected_threshold)

eligible_now['target'] = eligible_now.apply(likely_to_withdraw_now, axis=1)

eligible_now = eligible_now.drop(columns=['wage'], errors='ignore')

In [7]:
# Prepare atlas structured features
# Simplify categorical encoding for now (flatten groups)
df_atlas_clean_nv = df_atlas_nv.copy()
df_atlas_clean = df_atlas.copy()

In [8]:
df_atlas_clean_nv.rename(columns={'user_id': 'user'}, inplace=True)
df_atlas_clean.rename(columns={'user_id': 'user'}, inplace=True)

MODEL WITHOUT JOINING THE COLUMNS

In [9]:
# Merge user features
df_model = pd.merge(eligible_now, df_atlas_clean, on='user', how='left')

In [11]:
# Prepare features
X = df_model.drop(columns=[
    'target', 'user', 'credits',
    'num_withdrawals', 'avg_withdrawal', 'total_withdrawn', 
    'is_active', 'is_verified', 'is_locked' #always 1
])
y = df_model['target']

X = X.fillna(0)

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, test_size=0.2, random_state=42)

# Build model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# Predict
y_pred = (model.predict(X_test) > 0.5).astype(int).flatten()

# Evaluation
print(classification_report(y_test, y_pred, output_dict=False))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
              precision    recall  f1-score   support

           0       0.83      0.90      0.87       560
           1       0.47      0.32      0.38       151

    accuracy                           0.78       711
   macro avg       0.65      0.61      0.62       711
weighted avg       0.76      0.78      0.76       711



In [12]:
# Prepare prediction input
X_pred = df_model.drop(columns=['credits', 'target', 'user'], errors='ignore')
X_pred = X_pred.fillna(0)
X_pred = X_pred.reindex(columns=X.columns, fill_value=0)

# Scale using previously fitted scaler
X_pred_scaled = scaler.transform(X_pred)

# Predict cashout probability
df_model['prediction'] = (model.predict(X_pred_scaled) > 0.5).astype(int)

# Estimate total withdrawal amount
predicted_cashouts = df_model[df_model['prediction'] == 1]
total_predicted_credits = predicted_cashouts['credits'].sum()

print(f"💰 Estimated reserve needed for upcoming withdrawals: {total_predicted_credits:,.0f} CZK")

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
💰 Estimated reserve needed for upcoming withdrawals: 851,301 CZK


MODEL WITH JOINED COLUMNS

In [9]:
# Merge user features
df_model2 = pd.merge(eligible_now, df_atlas_clean_nv, on='user', how='left')

In [12]:
# Prepare features
X2 = df_model2.drop(columns=[
    'target', 'user', 'credits',
    'num_withdrawals', 'avg_withdrawal', 'total_withdrawn', 
    'is_active', 'is_verified', 'is_locked' #always 1
])
y2 = df_model2['target']

X2 = X2.fillna(0)

# Normalize
scaler2 = StandardScaler()
X_scaled2 = scaler2.fit_transform(X2)

# Train/test split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_scaled2, y2, stratify=y2, test_size=0.2, random_state=42)

# Build model
model2 = Sequential([
    Dense(64, activation='relu', input_shape=(X_train2.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model2.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train
history2 = model2.fit(X_train2, y_train2, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

# Predict
y_pred2 = (model2.predict(X_test2) > 0.33).astype(int).flatten()

# Evaluation
print(classification_report(y_test2, y_pred2, output_dict=False))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       560
           1       0.57      0.61      0.59       151

    accuracy                           0.82       711
   macro avg       0.73      0.74      0.74       711
weighted avg       0.82      0.82      0.82       711



In [43]:
# Prepare prediction input
X_pred = df_model2.drop(columns=['credits', 'target', 'user'], errors='ignore')
X_pred = X_pred.fillna(0)
X_pred = X_pred.reindex(columns=X2.columns, fill_value=0)

# Scale using previously fitted scaler
X_pred_scaled = scaler2.transform(X_pred)

# Predict cashout probability
df_model2['prediction'] = (model2.predict(X_pred_scaled) > 0.5).astype(int)

# Estimate total withdrawal amount
predicted_cashouts = df_model2[df_model2['prediction'] == 1]
total_predicted_credits = predicted_cashouts['credits'].sum()

print(f"💰 Estimated reserve needed for upcoming withdrawals: {total_predicted_credits:,.0f} CZK")

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
💰 Estimated reserve needed for upcoming withdrawals: 868,071 CZK


WITH JOINED COLUMNS THE MODEL PREDICTS BETTER, I'LL TRY SOME OTHER MODELS WITH SAME LOGIC

In [44]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=15, class_weight='balanced', random_state=42)
rf_model.fit(X_train2, y_train2)

y_pred_rf = rf_model.predict(X_test2)

print("Random Forest:\n")
print(classification_report(y_test2, y_pred_rf))

Random Forest:

              precision    recall  f1-score   support

           0       0.88      0.91      0.90       560
           1       0.63      0.54      0.58       151

    accuracy                           0.83       711
   macro avg       0.75      0.73      0.74       711
weighted avg       0.83      0.83      0.83       711



In [45]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=6, scale_pos_weight=3, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train2, y_train2)

y_pred_xgb = xgb_model.predict(X_test2)

print("XGBoost:\n")
print(classification_report(y_test2, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



XGBoost:

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       560
           1       0.62      0.56      0.59       151

    accuracy                           0.84       711
   macro avg       0.76      0.74      0.74       711
weighted avg       0.83      0.84      0.83       711



In [46]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr_model.fit(X_train2, y_train2)

y_pred_lr = lr_model.predict(X_test2)

print("Logistic Regression:\n")
print(classification_report(y_test2, y_pred_lr))


Logistic Regression:

              precision    recall  f1-score   support

           0       0.90      0.89      0.89       560
           1       0.60      0.62      0.61       151

    accuracy                           0.83       711
   macro avg       0.75      0.75      0.75       711
weighted avg       0.83      0.83      0.83       711



TRYING XGB AND TUNING OF THE MODEL

In [19]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tqdm import tqdm

# Create DMatrices
dtrain = xgb.DMatrix(X_train2, label=y_train2)
dvalid = xgb.DMatrix(X_test2, label=y_test2)

# Define search space
param_dist = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25],
    'max_depth': [6, 7, 8],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.9, 1.0],
    'scale_pos_weight': [1.0, 1.5, 2.0],
}

param_list = list(ParameterSampler(param_dist, n_iter=200, random_state=42))

# Custom F1 evaluation for class 1
def f1_eval(preds, dtrain):
    labels = dtrain.get_label()
    preds_binary = (preds > 0.5).astype(int)
    return 'f1', f1_score(labels, preds_binary)

# Tracking best model
best_f1 = 0
best_threshold = 0.38
best_booster = None
best_params = None
best_report = ""
results = []

# Training loop
for params in tqdm(param_list, desc="Searching for best model"):
    full_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': 42,
        **params
    }

    booster = xgb.train(
        full_params,
        dtrain,
        num_boost_round=200,
        evals=[(dvalid, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=False,
        feval=f1_eval
    )

    # Predict probabilities
    y_proba = booster.predict(dvalid)

    # Try multiple thresholds
    for thresh in np.linspace(0.3, 0.55, 30):
        y_pred = (y_proba > thresh).astype(int)
        f1 = f1_score(y_test2, y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = thresh
            best_booster = booster
            best_params = full_params
            best_report = classification_report(y_test2, y_pred, output_dict=False)

# Print best results
print(f"\n✅ Best F1: {best_f1:.4f} at threshold={best_threshold:.2f}")
print("🔧 Best Params:", best_params)
print("\n📄 Best Report:\n", best_report)

Searching for best model: 100%|██████████| 200/200 [00:26<00:00,  7.54it/s]


✅ Best F1: 0.7282 at threshold=0.38
🔧 Best Params: {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42, 'subsample': 1.0, 'scale_pos_weight': 2.0, 'max_depth': 6, 'learning_rate': 0.25, 'colsample_bytree': 0.9}

📄 Best Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89       560
           1       0.57      0.99      0.73       151

    accuracy                           0.84       711
   macro avg       0.79      0.90      0.81       711
weighted avg       0.91      0.84      0.85       711






In [14]:
# Prepare prediction input
X_pred = df_model2.drop(columns=['credits', 'target', 'user'], errors='ignore')
X_pred = X_pred.fillna(0)

dpred = xgb.DMatrix(X_pred)

# Predict using best threshold
y_pred_proba_final = booster.predict(dpred)
df_model2['prediction'] = (y_pred_proba_final > best_threshold).astype(int)

# Estimate total withdrawal amount
predicted_cashouts = df_model2[df_model2['prediction'] == 1]
total_predicted_credits = predicted_cashouts['credits'].sum()

print(f"\n💰 Estimated reserve needed for upcoming withdrawals: {total_predicted_credits:,.0f} CZK")


XGBoostError: [12:55:08] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\learner.cc:1462: Check failed: learner_model_param_.num_feature >= p_fmat->Info().num_col_ (231 vs. 237) : Number of columns does not match number of features in booster.