In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
import catboost
from catboost import CatBoostClassifier
from collections import defaultdict
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

In [2]:
df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')
df_credits = pd.read_csv('../data/User Credits Student Access.csv', encoding='utf-8')

#Cleared discrepancies in the data
df_credits_cleaned = df_credits[df_credits['credits']>0]
df_payments_cleaned = df_payments[df_payments['user'].notna()]

grouped_cols = defaultdict(dict)

for col in df_atlas.columns:
    if "-" in col:
        group, key = col.split('-', 1)
        grouped_cols[group][key] = col
    else:
        grouped_cols[col][col] = col

structured_data = []
for _, row in df_atlas.iterrows():
    entry = {}
    for group, mapping in grouped_cols.items():
        entry[group] = [key for key, col in mapping.items() if row[col] == 1]
    structured_data.append(entry)

df_atlas_numeric_values = pd.DataFrame(structured_data)
df_atlas_numeric_values = df_atlas_numeric_values.drop(columns=['user_id'])

df_atlas_nv = pd.concat([df_atlas['user_id'], df_atlas_numeric_values], axis=1, join='inner')

mapping_dicts = {}

for col in df_atlas_nv.columns:
    if col == 'user_id':
        continue

    # Convert lists to tuples to make them hashable (dict keys)
    unique_lists = df_atlas_nv[col].apply(lambda x: tuple(sorted(x))).unique()
    
    # Create mapping: list → ID
    mapping_dicts[col] = {lst: idx + 1 for idx, lst in enumerate(unique_lists)}

    # Apply mapping
    df_atlas_nv[col] = df_atlas_nv[col].apply(lambda x: mapping_dicts[col][tuple(sorted(x))])

In [3]:
df_credits['user'] = df_credits['user'].astype(str)
df_payments['user'] = df_payments['user'].astype(str)
df_atlas_nv['user_id'] = df_atlas_nv['user_id'].astype(str)

#Unique users, which withdrawn money in the past
positive_users = df_payments[
    (df_payments['credits'] >= 500) &
    (df_payments['state'].isin(['PAID', 'APPROVED']))
]['user'].unique()

#Real-time users, which have above 500 credits (potentially can withdraw the money)
eligible_now = df_credits[df_credits['credits'] >= 500].copy()

#Users who have withdrawn money in the past and are eligible now (filtered wage Trendaro (I think that means that they donate money to Trendaro))
eligible_now['target'] = (
    eligible_now['user'].isin(positive_users) &
    (eligible_now['wage'] != 'Trendaro')
).astype(int)

In [4]:
sum(eligible_now[eligible_now['target'] == 1]['credits'])

1410312

In [5]:
withdrawals = df_payments[
    (df_payments['credits'] >= 500) & 
    (df_payments['state'].isin(['PAID', 'APPROVED']))
]

withdrawal_stats = withdrawals.groupby('user').agg(
    num_withdrawals=('credits', 'count'),
    avg_withdrawal=('credits', 'mean'),
    total_withdrawn=('credits', 'sum')
).reset_index()

# Merge statistics into eligible_now
eligible_now = eligible_now.merge(withdrawal_stats, on='user', how='left')
eligible_now[['num_withdrawals', 'avg_withdrawal', 'total_withdrawn']] = eligible_now[
    ['num_withdrawals', 'avg_withdrawal', 'total_withdrawn']
].fillna(0)

# Behavioral segmentation
def assign_behavior(row):
    if row['num_withdrawals'] == 0:
        return 'new'
    elif row['num_withdrawals'] <= 2:
        return 'occasional'
    else:
        return 'regular'

eligible_now['withdrawal_segment'] = eligible_now.apply(assign_behavior, axis=1)

segment_mapping = {
    'new': 0,
    'occasional': 1,
    'regular': 2
}
eligible_now['withdrawal_segment_code'] = eligible_now['withdrawal_segment'].map(segment_mapping)

eligible_now.drop(columns='withdrawal_segment', inplace=True)

In [6]:
eligible_now = eligible_now.drop(columns=['wage'], errors='ignore')

In [7]:
# Prepare atlas structured features
# Simplify categorical encoding for now (flatten groups)
df_atlas_clean = df_atlas_nv.copy()

In [8]:
df_atlas_clean

Unnamed: 0,user_id,all_intro,atlas_vzorek,sex,age,education_level,city_size,kraj,income_level,ea,...,sportshops_halfyear,sport_halfyear,klientstvi,intro,seen,kecalkove,nakupuje,check,isic2_age,isic_age
0,STUD29866,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,STUD35256,2,1,2,2,2,2,2,1,1,...,1,1,1,1,2,1,1,1,2,2
2,STUD29684,1,1,3,3,3,1,3,1,1,...,1,1,1,1,1,1,1,1,2,2
3,STUD11967,1,1,1,4,4,1,1,1,1,...,1,1,1,1,3,1,1,1,3,3
4,STUD57644,1,1,3,5,1,1,3,1,1,...,1,1,1,1,1,1,1,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43528,STUD33520,1,1,1,5,4,1,5,2,1,...,1,1,1,1,344,1,1,1,2,2
43529,STUD64071,1,1,3,9,3,1,5,1,1,...,1,1,1,1,2,1,1,1,2,2
43530,STUD82916,3,4,3,5,13,4,12,3,19,...,2,1,2,1,20,1,2,1,2,2
43531,STUD63130,2,1,2,2,5,1,5,1,57,...,1,1,1,1,2,1,1,1,2,2


In [9]:
df_atlas_clean.rename(columns={'user_id': 'user'}, inplace=True)

In [10]:
# Merge user features
df_model = pd.merge(eligible_now, df_atlas_clean, on='user', how='left')

In [12]:
# Prepare features
X = df_model.drop(columns=[
    'target', 'user', 'credits',
    'num_withdrawals', 'avg_withdrawal', 'total_withdrawn', 
    'is_active', 'is_verified', 'is_locked' #always 1
])
y = df_model['target']

X = X.fillna(0)

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, test_size=0.2, random_state=42)

# Build model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

# Predict
y_pred = (model.predict(X_test) > 0.5).astype(int).flatten()

# Evaluation
report = classification_report(y_test, y_pred, output_dict=False)
report


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


'              precision    recall  f1-score   support\n\n           0       0.95      0.99      0.97       450\n           1       0.98      0.91      0.94       261\n\n    accuracy                           0.96       711\n   macro avg       0.96      0.95      0.96       711\nweighted avg       0.96      0.96      0.96       711\n'

In [13]:
# Prepare prediction input
X_pred = df_model.drop(columns=['credits', 'target', 'user'], errors='ignore')
X_pred = X_pred.fillna(0)
X_pred = X_pred.reindex(columns=X.columns, fill_value=0)

# Scale using previously fitted scaler
X_pred_scaled = scaler.transform(X_pred)

# Predict cashout probability
df_model['prediction'] = (model.predict(X_pred_scaled) > 0.5).astype(int)

# Estimate total withdrawal amount
predicted_cashouts = df_model[df_model['prediction'] == 1]
total_predicted_credits = predicted_cashouts['credits'].sum()

print(f"💰 Estimated reserve needed for upcoming withdrawals: {total_predicted_credits:,.0f} CZK")

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
💰 Estimated reserve needed for upcoming withdrawals: 1,378,974 CZK
