In [3]:
# Standard libraries
from datetime import datetime as dt

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import plotly.express as px

# Machine learning – scikit-learn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Class balancing
from imblearn.over_sampling import SMOTE


In [4]:
def read_data():
    df_credits = pd.read_csv('../data/User Credits Student Access.csv', encoding='utf-8')
    df_atlas = pd.read_csv('../data/Atlas Cechu Student Access.csv', encoding='utf-8')
    df_payments = pd.read_csv('../data/Payments Student Access.csv', encoding='utf-8')
    return df_credits, df_atlas, df_payments

def data_cleaning(df_credits, df_payments):
    df_c_negative = df_credits[df_credits['credits']<0]
    df_credits[df_credits.user.isin(df_c_negative.user) == True]

    # IDK what I'm doing with payments
    df_payments[df_payments.user.isin(df_c_negative.user)==True].sort_values(['user','created_at'])
    df_payments.state.unique()
    df_payments[df_payments.user=='STUD54678']
    df_credits[df_credits.user=='STUD54678']

    df_credits_cleaned = df_credits[df_credits['credits']>0]
    df_payments_cleaned = df_payments[df_payments['user'].notna()] # we threw out from payments 2345 observations

    return df_credits_cleaned, df_payments_cleaned

def get_merged_table():
    df_credits, df_atlas, df_payments = read_data()
    _, df_payments_cleaned = data_cleaning(df_credits, df_payments)
    df_merge_full = pd.merge(df_payments_cleaned, df_atlas, how='inner', left_on='user', right_on='user_id')
    df_merge_full.rename(columns={'credits_x':'credits_payments', 'credits_y':'credits_credits'})
    df_merge_full = pd.get_dummies(df_merge_full, columns=['type'], drop_first=True)
    df_merge_full = df_merge_full.astype({col: int for col in df_merge_full.select_dtypes(include='bool').columns})

    df_merge_full['created_at'] = pd.to_datetime(df_merge_full['created_at'])

    df_merge_full['day'] = df_merge_full['created_at'].dt.day
    df_merge_full['month'] = df_merge_full['created_at'].dt.month
    df_merge_full['year'] = df_merge_full['created_at'].dt.year
    df_merge_full['weekday'] = df_merge_full['created_at'].dt.weekday

    print(df_merge_full[df_merge_full.year.isin([2025])].shape[0])
    print(df_merge_full[df_merge_full.year.isin([2017])].shape[0])

    df_merge_full = df_merge_full[(df_merge_full['year'] != 2017) & (df_merge_full['year'] != 2025)]
    df_merge_full = df_merge_full.drop(columns=['year'])
    df_merge_full = df_merge_full.drop(columns=['created_at'])
    return df_merge_full

# split dataset into three categories
def get_split_values(df_merge_full, value:int):
    col_name = f'credits_{value}+'
    df_modeling = df_merge_full.drop(columns=['id', 'changed_at', 'user', 'batch', 'state', 'user_id'])
    df_modeling[col_name] = (df_modeling['credits'] > value).astype(int)
    df_modeling = df_modeling.drop(columns=['credits'])

    fig = px.bar(pd.DataFrame(df_modeling[col_name].value_counts().reset_index()), x=col_name, y='count', title='Credits Category Distribution')
    fig.show()
    
    df_modeling = df_modeling[df_modeling[col_name].isnull()==False] # clean data from null values
    return df_modeling, col_name

def split_data(X, y, test_size=0.2, stratify=True, random_state=42):
    """
    Split data into training and testing sets with optional stratification.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target variable.
        test_size (float): Proportion of data to use as test set.
        stratify (bool): Whether to stratify split based on y.
        random_state (int): Seed for reproducibility.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test)
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y if stratify else None,
        random_state=random_state
    )

def get_smote_train(X_train, y_train):
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    return X_train_smote, y_train_smote


In [5]:
df_merge_full = get_merged_table()
df_modeling, colname = get_split_values(df_merge_full, 550)

y = df_modeling[colname]
X = df_modeling.drop(columns=[colname])

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y , train_size=0.8, random_state=42)

X_train_smote, y_train_smote = get_smote_train(X_train, y_train)

  df_merge_full['day'] = df_merge_full['created_at'].dt.day
  df_merge_full['month'] = df_merge_full['created_at'].dt.month
  df_merge_full['year'] = df_merge_full['created_at'].dt.year
  df_merge_full['weekday'] = df_merge_full['created_at'].dt.weekday


3956
34


  df_modeling[col_name] = (df_modeling['credits'] > value).astype(int)



`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



In [6]:
##### Random Forest
param_grid = {
    'n_estimators': np.array(range(100, 1000, 300)),
    'max_depth': np.array(range(5, 15, 3)),
    'criterion': ['gini', 'entropy']
}

model_rf = RandomForestClassifier(random_state=42)

grid_rf = RandomizedSearchCV(model_rf, param_grid, cv=None, scoring='accuracy', n_iter=30, n_jobs=-1)
grid_rf.fit(X_train_smote, y_train_smote)


The total space of parameters 24 is smaller than n_iter=30. Running 24 iterations. For exhaustive searches, use GridSearchCV.


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [7]:
best_rf = grid_rf.best_estimator_
features_importances_rf = best_rf.feature_importances_

y_pred = best_rf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(features_importances_rf)

[[2714  511]
 [ 640  713]]
              precision    recall  f1-score   support

           0       0.81      0.84      0.83      3225
           1       0.58      0.53      0.55      1353

    accuracy                           0.75      4578
   macro avg       0.70      0.68      0.69      4578
weighted avg       0.74      0.75      0.74      4578

[0.00013362 0.00095289 0.00101249 ... 0.02567921 0.02270006 0.01928533]
