In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
import catboost
from catboost import CatBoostClassifier

In [7]:
df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')
df_credits = pd.read_csv('../data/User Credits Student Access.csv', encoding='utf-8')

In [3]:
df_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30087 entries, 0 to 30086
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          30086 non-null  float64
 1   created_at  30087 non-null  object 
 2   changed_at  30087 non-null  object 
 3   user        27652 non-null  object 
 4   batch       27502 non-null  object 
 5   credits     30086 non-null  float64
 6   state       30086 non-null  object 
 7   type        30086 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.8+ MB


In [8]:
df_credits_cleaned = df_credits[df_credits['credits']>0]
df_payments_cleaned = df_payments[df_payments['user'].notna()]

In [5]:
list(df_atlas.columns)

['user_id',
 'all_intro-2022',
 'atlas_vzorek-2023',
 'atlas_vzorek-2024',
 'all_intro-trendaro_panel_all',
 'sex-woman',
 'sex-man',
 'age-15_17',
 'age-18_24',
 'age-25_34',
 'age-35_44',
 'age-45_54',
 'age-55_64',
 'age-65_plus',
 'education_level-basic',
 'education_level-high_bezmat',
 'education_level-high_smat',
 'education_level-university',
 'city_size-0_2',
 'city_size-2_10',
 'city_size-10_50',
 'city_size-over50',
 'kraj-stredocesky',
 'kraj-jihocesky',
 'kraj-plzensky',
 'kraj-karlovarsky',
 'kraj-ustecky',
 'kraj-liberecky',
 'kraj-kralovehradecky',
 'kraj-pardubicky',
 'kraj-olomoucky',
 'kraj-moravskoslezsky',
 'kraj-jihomoravsky',
 'kraj-zlinsky',
 'kraj-vysocina',
 'income_level-high',
 'income_level-mid',
 'income_level-low',
 'ea-student',
 'ea-maternity_leave',
 'ea-unemployed',
 'ea-pensioner',
 'ea-fulltime',
 'elected_2021-no',
 'elected_2021-ano_2011',
 'elected_2021-spolu',
 'elected_2021-pirstan',
 'elected_2021-spd',
 'nuts2-praha',
 'nuts2-stredocesky',
 '

In [9]:
from collections import defaultdict

grouped_cols = defaultdict(dict)

for col in df_atlas.columns:
    if "-" in col:
        group, key = col.split('-', 1)
        grouped_cols[group][key] = col
    else:
        grouped_cols[col][col] = col

structured_data = []
for _, row in df_atlas.iterrows():
    entry = {}
    for group, mapping in grouped_cols.items():
        entry[group] = [key for key, col in mapping.items() if row[col] == 1]
    structured_data.append(entry)

In [10]:
df_test = pd.DataFrame(structured_data)

In [11]:
df_test

Unnamed: 0,user_id,all_intro,atlas_vzorek,sex,age,education_level,city_size,kraj,income_level,ea,...,sportshops_halfyear,sport_halfyear,klientstvi,intro,seen,kecalkove,nakupuje,check,isic2_age,isic_age
0,[],[trendaro_panel_all],[],[woman],[15_17],[basic],[over50],[vysocina],[],[],...,[],[],[],[],[o_mne],[],[],[],[15_20],[15_17]
1,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2,[],[trendaro_panel_all],[],[man],"[55_64, 45_64]",[high_bezmat],[over50],[plzensky],[],[],...,[],[],[],[],[o_mne],[],[],[],[],[]
3,[],[trendaro_panel_all],[],[woman],"[25_34, 18_29]","[university, university_mgr]",[over50],[vysocina],[],[],...,[],[],[],[],"[o_mne, nazory, media]",[],[],[],[21_26],[23_26]
4,[],[trendaro_panel_all],[],[man],"[35_44, 30_44]",[basic],[over50],[plzensky],[],[],...,[],[],[],[],[o_mne],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43528,[],[trendaro_panel_all],[],[woman],"[35_44, 30_44]","[university, university_mgr]",[over50],"[hlavni, praha]",[low],[],...,[],[],[],[],[cestovani],[],[],[],[],[]
43529,[],[trendaro_panel_all],[],[man],[65_plus],[high_bezmat],[over50],"[hlavni, praha]",[],[],...,[],[],[],[],[],[],[],[],[],[]
43530,[],"[2022, trendaro_panel_all]","[2023, 2024]",[man],"[35_44, 30_44]","[university, university_phd]",[0_2],[jihocesky],[mid],"[fulltime, osvc_own_firm, own_firm]",...,[decathlon],[],[eon],[],"[znacky, nakupovani, technologie, o_mne, u_me_...",[],[partner],[],[],[]
43531,[],[],[],[],[],[high_smat],[over50],"[hlavni, praha]",[],"[osvc_own_firm, own_firm]",...,[],[],[],[],[],[],[],[],[],[]


In [13]:
df_test = df_test.drop(columns=['user_id'])

In [14]:
df_test3 = pd.concat([df_atlas['user_id'], df_test], axis=1, join='inner')

In [15]:
df_test3

Unnamed: 0,user_id,all_intro,atlas_vzorek,sex,age,education_level,city_size,kraj,income_level,ea,...,sportshops_halfyear,sport_halfyear,klientstvi,intro,seen,kecalkove,nakupuje,check,isic2_age,isic_age
0,STUD29866,[trendaro_panel_all],[],[woman],[15_17],[basic],[over50],[vysocina],[],[],...,[],[],[],[],[o_mne],[],[],[],[15_20],[15_17]
1,STUD35256,[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2,STUD29684,[trendaro_panel_all],[],[man],"[55_64, 45_64]",[high_bezmat],[over50],[plzensky],[],[],...,[],[],[],[],[o_mne],[],[],[],[],[]
3,STUD11967,[trendaro_panel_all],[],[woman],"[25_34, 18_29]","[university, university_mgr]",[over50],[vysocina],[],[],...,[],[],[],[],"[o_mne, nazory, media]",[],[],[],[21_26],[23_26]
4,STUD57644,[trendaro_panel_all],[],[man],"[35_44, 30_44]",[basic],[over50],[plzensky],[],[],...,[],[],[],[],[o_mne],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43528,STUD33520,[trendaro_panel_all],[],[woman],"[35_44, 30_44]","[university, university_mgr]",[over50],"[hlavni, praha]",[low],[],...,[],[],[],[],[cestovani],[],[],[],[],[]
43529,STUD64071,[trendaro_panel_all],[],[man],[65_plus],[high_bezmat],[over50],"[hlavni, praha]",[],[],...,[],[],[],[],[],[],[],[],[],[]
43530,STUD82916,"[2022, trendaro_panel_all]","[2023, 2024]",[man],"[35_44, 30_44]","[university, university_phd]",[0_2],[jihocesky],[mid],"[fulltime, osvc_own_firm, own_firm]",...,[decathlon],[],[eon],[],"[znacky, nakupovani, technologie, o_mne, u_me_...",[],[partner],[],[],[]
43531,STUD63130,[],[],[],[],[high_smat],[over50],"[hlavni, praha]",[],"[osvc_own_firm, own_firm]",...,[],[],[],[],[],[],[],[],[],[]


In [16]:
mapping_dicts = {}

for col in df_test3.columns:
    if col == 'user_id':
        continue

    # Convert lists to tuples to make them hashable (dict keys)
    unique_lists = df_test3[col].apply(lambda x: tuple(sorted(x))).unique()
    
    # Create mapping: list → ID
    mapping_dicts[col] = {lst: idx + 1 for idx, lst in enumerate(unique_lists)}

    # Apply mapping
    df_test3[col] = df_test3[col].apply(lambda x: mapping_dicts[col][tuple(sorted(x))])


In [17]:
df_test3

Unnamed: 0,user_id,all_intro,atlas_vzorek,sex,age,education_level,city_size,kraj,income_level,ea,...,sportshops_halfyear,sport_halfyear,klientstvi,intro,seen,kecalkove,nakupuje,check,isic2_age,isic_age
0,STUD29866,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,STUD35256,2,1,2,2,2,2,2,1,1,...,1,1,1,1,2,1,1,1,2,2
2,STUD29684,1,1,3,3,3,1,3,1,1,...,1,1,1,1,1,1,1,1,2,2
3,STUD11967,1,1,1,4,4,1,1,1,1,...,1,1,1,1,3,1,1,1,3,3
4,STUD57644,1,1,3,5,1,1,3,1,1,...,1,1,1,1,1,1,1,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43528,STUD33520,1,1,1,5,4,1,5,2,1,...,1,1,1,1,344,1,1,1,2,2
43529,STUD64071,1,1,3,9,3,1,5,1,1,...,1,1,1,1,2,1,1,1,2,2
43530,STUD82916,3,4,3,5,13,4,12,3,19,...,2,1,2,1,20,1,2,1,2,2
43531,STUD63130,2,1,2,2,5,1,5,1,57,...,1,1,1,1,2,1,1,1,2,2


In [18]:
df_merge_full = pd.merge(df_payments_cleaned, df_test3, how='inner', left_on='user', right_on='user_id')
df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})
df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})
df_merge_full['created_at'] = pd.to_datetime(df_merge_full['created_at'])
df_merge_full['month'] = df_merge_full['created_at'].dt.month
df_merge_full['year'] = df_merge_full['created_at'].dt.year

  df_merge_full['month'] = df_merge_full['created_at'].dt.month
  df_merge_full['year'] = df_merge_full['created_at'].dt.year


In [19]:
df_merge_ml = df_merge_full.drop(columns=['created_at'])

In [20]:
def get_split_values(df_merge_full, value:int):
    col_name = f'credits_{value}+'
    df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user', 'batch', 'state', 'user_id'])
    df_modeling[col_name] = (df_modeling['credits'] > value).astype(int)
    df_modeling = df_modeling.drop(columns=['credits'])

    fig = px.bar(pd.DataFrame(df_modeling[col_name].value_counts().reset_index()), x=col_name, y='count', title='Credits Category Distribution')
    fig.show()
    
    df_modeling = df_modeling[df_modeling[col_name].isnull()==False] # clean data from null values
    return df_modeling, col_name

df_modeling, colname = get_split_values(df_merge_ml, 660)

  df_modeling[col_name] = (df_modeling['credits'] > value).astype(int)


In [22]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

y = df_modeling[colname]
X = df_modeling.drop(columns=[colname])

def split_data(X, y, test_size=0.2, stratify=True, random_state=42):
    """
    Split data into training and testing sets with optional stratification.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target variable.
        test_size (float): Proportion of data to use as test set.
        stratify (bool): Whether to stratify split based on y.
        random_state (int): Seed for reproducibility.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test)s
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y if stratify else None,
        random_state=random_state
    )

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y , train_size=0.8, random_state=42)

def get_smote_train(X_train, y_train):
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    return X_train_smote, y_train_smote

X_train_smote, y_train_smote = get_smote_train(X_train, y_train)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



In [24]:
from sklearn.ensemble import RandomForestClassifier
model_rf_test = RandomForestClassifier(n_estimators=250, max_depth=15, criterion='gini', class_weight='balanced')
model_rf_test.fit(X_train_smote, y_train_smote)

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, model_rf_test.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88      4297
           1       0.53      0.45      0.49      1079

    accuracy                           0.81      5376
   macro avg       0.70      0.68      0.69      5376
weighted avg       0.80      0.81      0.80      5376



# What is core table for merge? - Payments // Can't I just use inner join? Therefore I dont' have to work with nul values - I don't want them in model anyway

In [110]:
atlas_in_payments = df_payments_cleaned.user.isin(df_atlas.user_id)

# Plotting
fig_ap = px.bar(atlas_in_payments.value_counts(), title='How many users from Atlas Cechu are in the payments table', labels={'index': 'User in Atlas (True/False)', 'value': 'Count'})
fig_ap.show()

In [111]:
df_test3.user_id.isin(df_payments_cleaned.user).value_counts()

user_id
False    34824
True      8709
Name: count, dtype: int64

Since we want to find out when who pulls what, it makes sense to use the payments table and connect data to it - even though there are users in payments tables that are not in Atlas Cechu

In [61]:
#df_merge_credits = pd.merge(df_payments_cleaned, df_credits_cleaned, how="inner", left_on='user', right_on='user')
#df_merge_credits[df_merge_credits.user=='STUD92308']

"""
I don't find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions
"""

'\nI don\'t find credits table to be interesting - mainly we cannot join them together due to different "time type" -- credits shows the newest version of creadits without historical changes, on the other hand table payments shows history of all transactions\n'

In [112]:
df_merge_full = pd.merge(df_payments_cleaned, df_test3, how='inner', left_on='user', right_on='user_id')
df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})
df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})

df_merge_full = df_merge_full[df_merge_full['state']!='CANCELLED']

# df_merge_full.to_csv('../data_output/df_merge_full.csv')

In [113]:
##### In the dataset we found out that in payments dataset values that are being cancelled has creadits = 0 and NaN user having credit 430 and still being PAID. However, that user doesn't have user_id

df_payments[df_payments['credits']<500].groupby('state').size()

state
CANCELLED    2000
PAID            1
dtype: int64

In [114]:
df_merge_full['created_at'] = pd.to_datetime(df_merge_full['created_at'])

df_merge_full['day'] = df_merge_full['created_at'].dt.day
df_merge_full['month'] = df_merge_full['created_at'].dt.month
df_merge_full['year'] = df_merge_full['created_at'].dt.year
df_merge_full['hour'] = df_merge_full['created_at'].dt.hour
df_merge_full['weekday'] = df_merge_full['created_at'].dt.weekday


df_merge_full = df_merge_full.drop(columns=['created_at'])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [116]:
df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user','batch','state','user_id'])
df_modeling['credits_category'] = pd.cut(df_modeling['credits'], bins=[500,525,550,575,600, 1000, max(df_modeling['credits'])], labels=['500-525','526-550', '551-575', '576-600', '601-1000', '1001+'])


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [117]:
df_modeling = df_modeling[df_modeling['credits_category'].notnull()]

In [118]:
df_modeling['credits_category'].unique()

['500-525', '526-550', '601-1000', '551-575', '1001+', '576-600']
Categories (6, object): ['500-525' < '526-550' < '551-575' < '576-600' < '601-1000' < '1001+']

In [119]:
df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user','batch','state','user_id'])
df_modeling['credits_category'] = pd.cut(df_modeling['credits'], bins=[500,525,550,575,600, 1000, max(df_modeling['credits'])], labels=['500-525','526-550', '551-575', '576-600', '601-1000', '1001+'])
df_modeling = df_modeling.drop(columns=['credits'])

fig = px.bar(pd.DataFrame(df_modeling['credits_category'].value_counts().reset_index()), x='credits_category', y='count', title='Credits Category Distribution')
fig.show()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



# Modeling

In [120]:
df_cleaned = df_modeling[df_modeling['credits_category'].notnull()]

y = df_cleaned['credits_category']
X = df_cleaned.drop(columns=['credits_category'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [141]:
for col in df_cleaned.columns:
    if df_cleaned[col].apply(lambda x: isinstance(x, list)).any():
        df_cleaned[col] = df_cleaned[col].apply(lambda x: ' '.join(map(str, x)) if isinstance(x, list) else x)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [144]:
y = df_cleaned['credits_category']
X = df_cleaned.drop(columns=['credits_category'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [147]:
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    verbose=1
)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="2022 trendaro_panel_all": Cannot convert '2022 trendaro_panel_all' to float

In [73]:
# Get importance values
importances = model.get_feature_importance()
feature_names = X_train.columns

In [76]:
# Combine into DataFrame
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

In [80]:
feature_importance_df.head(100)['Feature'].to_list()

['insurance',
 'atlas_vzorek',
 'hour',
 'leisure_activity',
 'day',
 'co_fre_online',
 'weekday',
 'year',
 'news_server',
 'music',
 'cigarettes',
 'internet_tools',
 'obchody_online',
 'tv_station',
 'tv_shows',
 'investments',
 'limo',
 'shopping_monthly',
 'hobby_halfyear',
 'information_source',
 'grocery_quality',
 'plan',
 'month',
 'nuts2',
 'job',
 'shopping_weekly',
 'friends',
 'sportshops_halfyear',
 'foreign_language',
 'supermarket_monthly',
 'theatre',
 'internet',
 'soc_kapital',
 'beer',
 'immigrants_new',
 'notebook',
 'drugstore',
 'repairing',
 'brand',
 'leisure_activity_monthly_1',
 'pharmacy',
 'pride_2',
 'mobile',
 'tv_station_winners',
 'education_level',
 'kraj',
 'elected_2021',
 'pride_1',
 'kvalifikace',
 'freq_tv_hours',
 'journal',
 'acounts',
 'eu_vyhody',
 'shopping_quarterly',
 'driving_licence',
 'news_server_winners',
 'majetek',
 'chocolate',
 'charity',
 'holiday_abroad',
 'members_household',
 'company_size',
 'income_situation_new',
 'kids_fulf

In [104]:
df_adsasd= df_modeling[df_modeling['credits_category'].notnull()]
cols = feature_importance_df.head(25)['Feature'].to_list()
cols.append('credits_category')
df_cleaned2 = df_adsasd[cols]

y = df_cleaned2['credits_category']
X = df_cleaned2.drop(columns=['credits_category'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [106]:
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClassOneVsAll',
    eval_metric='Accuracy',
    verbose=100
)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 0.7332771	test: 0.7302743	best: 0.7302743 (0)	total: 13.2ms	remaining: 13.2s
100:	learn: 0.7328550	test: 0.7319620	best: 0.7319620 (1)	total: 996ms	remaining: 8.87s
200:	learn: 0.7385524	test: 0.7324367	best: 0.7324895 (194)	total: 1.84s	remaining: 7.32s
300:	learn: 0.7476261	test: 0.7333861	best: 0.7334916 (299)	total: 2.67s	remaining: 6.21s
400:	learn: 0.7636632	test: 0.7333333	best: 0.7340717 (345)	total: 3.52s	remaining: 5.25s
500:	learn: 0.7773792	test: 0.7335443	best: 0.7340717 (345)	total: 4.35s	remaining: 4.33s
600:	learn: 0.7936273	test: 0.7335443	best: 0.7340717 (345)	total: 5.15s	remaining: 3.42s
700:	learn: 0.8069213	test: 0.7345992	best: 0.7347574 (690)	total: 5.96s	remaining: 2.54s
800:	learn: 0.8216923	test: 0.7340190	best: 0.7347574 (690)	total: 6.75s	remaining: 1.68s
900:	learn: 0.8400506	test: 0.7344409	best: 0.7347574 (690)	total: 7.57s	remaining: 832ms
999:	learn: 0.8522895	test: 0.7339662	best: 0.7347574 (690)	total: 8.64s	remaining: 0us

bestTest = 0.734

<catboost.core.CatBoostClassifier at 0x2b8efdc1150>

In [103]:
X_train.nunique()

insurance            846
atlas_vzorek           4
hour                  24
leisure_activity    2717
day                   31
co_fre_online       1900
weekday                7
year                   9
news_server         1222
music                753
dtype: int64

In [72]:
# Get importance values
importances = model.get_feature_importance()
feature_names = X_train.columns

# Combine into DataFrame
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

fig = px.bar(feature_importance_df.head(20), x='Importance', y='Feature', title='Top 20 Feature Importances')
fig.show()

## PCA to reduce attributes - After using PCA it won't be interpretable as leaving as it is

In [19]:
# This doesn't work anymore - from the consultation with VS it is bad idea to do PCA as it lose the interpretability

"""from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

X = df_merge_full.drop(columns=['id', 'user', 'batch', 'state', 'user_id', 'changed_at', 'created_at'])

# PCA is sensitive to scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=0.95) # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

print(f"Original shape: {X.shape}")
print(f"Reduced shape: {X_pca.shape}")"""

'from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\n\nX = df_merge_full.drop(columns=[\'id\', \'user\', \'batch\', \'state\', \'user_id\', \'changed_at\', \'created_at\'])\n\n# PCA is sensitive to scaling\nscaler = StandardScaler()\nX_scaled = scaler.fit_transform(X)\n\npca = PCA(n_components=0.95) # Retain 95% of variance\nX_pca = pca.fit_transform(X_scaled)\n\nprint(f"Original shape: {X.shape}")\nprint(f"Reduced shape: {X_pca.shape}")'

In [20]:
df_atlas

Unnamed: 0,user_id,all_intro-2022,atlas_vzorek-2023,atlas_vzorek-2024,all_intro-trendaro_panel_all,sex-woman,sex-man,age-15_17,age-18_24,age-25_34,...,check-tv,check-radio,check-household,check-overdraft,check-naramek,isic2_age-21_26,isic2_age-15_20,isic_age-15_17,isic_age-18_22,isic_age-23_26
0,STUD29866,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
1,STUD35256,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,STUD29684,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,STUD11967,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,STUD57644,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43528,STUD33520,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43529,STUD64071,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43530,STUD82916,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43531,STUD63130,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
y_train.value_counts(normalize=True)

credits_category
500-600     0.732855
1001+       0.150243
601-1000    0.116902
Name: proportion, dtype: float64

In [22]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,year,2.239560
1,day,1.690253
2,hour,1.690020
3,weekday,1.639032
4,month,1.310483
...,...,...
1124,isic2_age-21_26,0.000000
1125,isic2_age-15_20,0.000000
1126,isic_age-15_17,0.000000
1127,isic_age-18_22,0.000000


In [23]:
print("NaNs in X:", X.isnull().sum().sum())
print("NaNs in y:", y.isnull().sum())

NaNs in X: 0
NaNs in y: 0


In [24]:
df_modeling[df_modeling['credits_category'].isna()]['credits']

KeyError: 'credits'

In [None]:
y = df_modeling['credits_category']
X = df_modeling.drop(columns=['credits_category'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, stratify=y, random_state=42
)

ValueError: Input contains NaN