In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
import catboost
from catboost import CatBoostClassifier

In [2]:
df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')

In [3]:
len(df_atlas.columns)

1121

In [4]:
df_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30087 entries, 0 to 30086
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          30086 non-null  float64
 1   created_at  30087 non-null  object 
 2   changed_at  30087 non-null  object 
 3   user        27652 non-null  object 
 4   batch       27502 non-null  object 
 5   credits     30086 non-null  float64
 6   state       30086 non-null  object 
 7   type        30086 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.8+ MB


# Data Cleaning

In [5]:
df_payments_cleaned = df_payments[df_payments['id'].notna()]

# What is core table for merge? - Payments // Can't I just use inner join? Therefore I dont' have to work with nul values - I don't want them in model anyway

In [6]:
atlas_in_payments = df_payments_cleaned.user.isin(df_atlas.user_id)

# Plotting
fig_ap = px.bar(atlas_in_payments.value_counts(), title='How many users from Atlas Cechu are in the payments table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})
fig_ap.show()

In [7]:
df_atlas.user_id.isin(df_payments_cleaned.user).value_counts()

user_id
False    34824
True      8709
Name: count, dtype: int64

Since we want to find out when who pulls what, it makes sense to use the payments table and connect data to it - even though there are users in payments tables that are not in Atlas Cechu

In [9]:
#df_merge_credits = pd.merge(df_payments_cleaned, df_credits_cleaned, how="inner", left_on='user', right_on='user')
#df_merge_credits[df_merge_credits.user=='STUD92308']

"""
I don't find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions
"""

'\nI don\'t find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions\n'

In [8]:
df_merge_full = pd.merge(df_payments_cleaned, df_atlas, how='inner', left_on='user', right_on='user_id')
df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})
df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})

df_merge_full = df_merge_full[df_merge_full['state']!='CANCELLED']

# df_merge_full.to_csv('../data_output/df_merge_full.csv')

In [9]:
##### In the dataset we found out that in payments dataset values that are being cancelled has creadits = 0 and NaN user having credit 430 and still being PAID. However, that user doesn't have user_id

df_payments[df_payments['credits']<500].groupby('state').size()

state
CANCELLED    2000
PAID            1
dtype: int64

In [10]:
df_merge_full['created_at'] = pd.to_datetime(df_merge_full['created_at'])

df_merge_full['day'] = df_merge_full['created_at'].dt.day
df_merge_full['month'] = df_merge_full['created_at'].dt.month
df_merge_full['year'] = df_merge_full['created_at'].dt.year
df_merge_full['hour'] = df_merge_full['created_at'].dt.hour
df_merge_full['weekday'] = df_merge_full['created_at'].dt.weekday


df_merge_full = df_merge_full.drop(columns=['created_at'])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [11]:
df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user','batch','state','user_id'])
df_modeling['credits_category'] = pd.cut(df_modeling['credits'], bins=[500, 600, 1000, max(df_modeling['credits'])], labels=['500-600', '601-1000', '1001+'])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [13]:
df_modeling = df_modeling[df_modeling['credits_category'].notnull()]

In [14]:
df_modeling['credits_category'].unique()

['500-600', '601-1000', '1001+']
Categories (3, object): ['500-600' < '601-1000' < '1001+']

In [21]:
df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user','batch','state','user_id'])
df_modeling['credits_category'] = pd.cut(df_modeling['credits'], bins=[500, 600, 1000, max(df_modeling['credits'])], labels=['500-600', '601-1000', '1001+'])
df_modeling = df_modeling.drop(columns=['credits'])

fig = px.bar(pd.DataFrame(df_modeling['credits_category'].value_counts().reset_index()), x='credits_category', y='count', title='Credits Category Distribution')
fig.show()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



# Modeling

In [32]:
df_cleaned = df_modeling[df_modeling['credits_category'].notnull()]

y = df_cleaned['credits_category']
X = df_cleaned.drop(columns=['credits_category'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [33]:
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    verbose=50
)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 0.7330660	test: 0.7311181	best: 0.7311181 (0)	total: 39.7ms	remaining: 19.8s
50:	learn: 0.7328550	test: 0.7319620	best: 0.7319620 (1)	total: 2.43s	remaining: 21.4s
100:	learn: 0.7404516	test: 0.7331751	best: 0.7331751 (98)	total: 4.88s	remaining: 19.3s
150:	learn: 0.7600760	test: 0.7340717	best: 0.7341772 (143)	total: 7.37s	remaining: 17s
200:	learn: 0.7837096	test: 0.7362869	best: 0.7362869 (200)	total: 9.96s	remaining: 14.8s
250:	learn: 0.8090314	test: 0.7368143	best: 0.7371308 (245)	total: 12.8s	remaining: 12.7s
300:	learn: 0.8339312	test: 0.7364451	best: 0.7376055 (286)	total: 15.8s	remaining: 10.4s
350:	learn: 0.8596750	test: 0.7359705	best: 0.7376055 (286)	total: 18.8s	remaining: 7.96s
400:	learn: 0.8818316	test: 0.7363397	best: 0.7376055 (286)	total: 21.5s	remaining: 5.31s
450:	learn: 0.9035662	test: 0.7348101	best: 0.7376055 (286)	total: 24.2s	remaining: 2.63s
499:	learn: 0.9198143	test: 0.7338608	best: 0.7376055 (286)	total: 26.8s	remaining: 0us

bestTest = 0.7376054

<catboost.core.CatBoostClassifier at 0x240f0a09130>

In [130]:
# Get importance values
importances = model.get_feature_importance()
feature_names = X_train.columns

# Combine into DataFrame
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

fig = px.bar(feature_importance_df.head(10), x='Importance', y='Feature', title='Top 20 Feature Importances')
fig.show()

## PCA to reduce attributes - After using PCA it won't be interpretable as leaving as it is

In [122]:
# This doesn't work anymore - from the consultation with VS it is bad idea to do PCA as it lose the interpretability

"""from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

X = df_merge_full.drop(columns=['id', 'user', 'batch', 'state', 'user_id', 'changed_at', 'created_at'])

# PCA is sensitive to scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=0.95) # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

print(f"Original shape: {X.shape}")
print(f"Reduced shape: {X_pca.shape}")"""

'from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\n\nX = df_merge_full.drop(columns=[\'id\', \'user\', \'batch\', \'state\', \'user_id\', \'changed_at\', \'created_at\'])\n\n# PCA is sensitive to scaling\nscaler = StandardScaler()\nX_scaled = scaler.fit_transform(X)\n\npca = PCA(n_components=0.95) # Retain 95% of variance\nX_pca = pca.fit_transform(X_scaled)\n\nprint(f"Original shape: {X.shape}")\nprint(f"Reduced shape: {X_pca.shape}")'

In [140]:
df_atlas

Unnamed: 0,user_id,all_intro-2022,atlas_vzorek-2023,atlas_vzorek-2024,all_intro-trendaro_panel_all,sex-woman,sex-man,age-15_17,age-18_24,age-25_34,...,check-tv,check-radio,check-household,check-overdraft,check-naramek,isic2_age-21_26,isic2_age-15_20,isic_age-15_17,isic_age-18_22,isic_age-23_26
0,STUD29866,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
1,STUD35256,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,STUD29684,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,STUD11967,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,STUD57644,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43528,STUD33520,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43529,STUD64071,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43530,STUD82916,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43531,STUD63130,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
y_train.value_counts(normalize=True)

credits_category
500-600     0.720364
1001+       0.148180
601-1000    0.131456
Name: proportion, dtype: float64

In [19]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,year,1.561861
1,day,1.553487
2,weekday,1.497899
3,hour,1.183224
4,investments-retirement_savings,1.032167
...,...,...
1124,isic2_age-15_20,0.000000
1125,isic_age-15_17,0.000000
1126,isic_age-23_26,0.000000
1127,type_HANDIPET,0.000000


In [20]:
print("NaNs in X:", X.isnull().sum().sum())
print("NaNs in y:", y.isnull().sum())

NaNs in X: 0
NaNs in y: 1488


In [28]:
df_modeling[df_modeling['credits_category'].isna()]['credits']

KeyError: 'credits'

In [24]:
y = df_modeling['credits_category']
X = df_modeling.drop(columns=['credits_category'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, stratify=y, random_state=42
)

ValueError: Input contains NaN