In [1]:
import sys

IN_COLAB = "google.colab" in sys.modules
IN_COLAB

True

In [2]:
if IN_COLAB:
    from google.colab import drive

    drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
if IN_COLAB:
    # Put here the full path to the folder having your Sprint project code
    # e.g. "/content/drive/MyDrive/assignment"
    ROOT_DIR = "/content/drive/MyDrive/'Colab Notebooks'/AnyoneAI/final_project/main"
    %cd $ROOT_DIR

/content/drive/MyDrive/Colab Notebooks/AnyoneAI/final_project/main


In [4]:
pip install boto3 python_dotenv

Collecting boto3
  Downloading boto3-1.28.13-py3-none-any.whl (135 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m81.9/135.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python_dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting botocore<1.32.0,>=1.31.13 (from boto3)
  Downloading botocore-1.31.13-py3-none-any.whl (11.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0 (from boto3)
  Downloading s3transfer-0.6.1-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from scipy.stats import randint

from src import config, data_utils, evaluation, plot

# Ignore warnings
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

In [6]:
dataset = pd.read_csv('clean_data.csv')

In [7]:
# perform random downsampling
sample_size = dataset['TARGET_LABEL_BAD=1'].value_counts().min()
dataset_resampled = pd.concat([dataset[dataset['TARGET_LABEL_BAD=1'] == c].sample(sample_size) for c in dataset['TARGET_LABEL_BAD=1'].unique()])
print('Resampled dataset shape %s' % dataset_resampled['TARGET_LABEL_BAD=1'].value_counts())

Resampled dataset shape 1    13018
0    13018
Name: TARGET_LABEL_BAD=1, dtype: int64


In [8]:
# first split in sets
app_train_set, app_val_set, app_test_set = data_utils.get_feature_in_set(dataset_resampled)

In [9]:
# second split features and target
train_prep, y_train, val_prep, y_val, test_prep, y_test = data_utils.get_feature_target(app_train_set,app_val_set,app_test_set)

In [10]:
# third apply preprocess in sets
X_train, X_val, X_test, columns = data_utils.preprocess_data(train_prep,val_prep,test_prep)

Input train data shape:  (18745, 20)
Input val data shape:  (2083, 20)
Input test data shape:  (5208, 20) 

cat_cols:  Index(['PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE', 'SEX', 'MARITAL_STATUS',
       'RESIDENCIAL_STATE', 'FLAG_RESIDENCIAL_PHONE', 'MONTHS_IN_RESIDENCE',
       'COMPANY', 'FLAG_PROFESSIONAL_PHONE', 'AGE', 'HAS_DEPENDANTS',
       'HAS_RESIDENCE', 'MONTHLY_INCOMES_TOT', 'HAS_CARDS',
       'HAS_BANKING_ACCOUNTS', 'HAS_PERSONAL_ASSETS', 'HAS_CARS'],
      dtype='object')
['QUANT_DEPENDANTS', 'FLAG_EMAIL', 'PRODUCT']
Index(['PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE', 'SEX',
       'FLAG_RESIDENCIAL_PHONE', 'COMPANY', 'FLAG_PROFESSIONAL_PHONE',
       'HAS_DEPENDANTS', 'HAS_RESIDENCE', 'HAS_CARDS', 'HAS_BANKING_ACCOUNTS',
       'HAS_PERSONAL_ASSETS', 'HAS_CARS'],
      dtype='object')
Index(['MARITAL_STATUS', 'RESIDENCIAL_STATE', 'MONTHS_IN_RESIDENCE', 'AGE',
       'MONTHLY_INCOMES_TOT'],
      dtype='object')


In [11]:
model = RandomForestClassifier()

In [12]:
param_dist = {
    "n_estimators": randint(100, 500),
    "max_depth": [3, 5, 10, None],
    "max_features": randint(1, 11),
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

In [13]:
baseline_model = RandomizedSearchCV(model,param_dist,scoring='accuracy',n_iter=15)

In [14]:
baseline_model.fit(X_train, y_train)

In [15]:
y_hat = baseline_model.predict(X_test)

In [16]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.60      0.53      0.56      2629
           1       0.57      0.64      0.60      2579

    accuracy                           0.58      5208
   macro avg       0.59      0.58      0.58      5208
weighted avg       0.59      0.58      0.58      5208



In [17]:
# Best param_num_leaves:90, param_min_child_samples:20, param_max_depth:5, param_learning_rate:0.1

In [18]:
results = pd.DataFrame(baseline_model.cv_results_)
results.sort_values(by='rank_test_score', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,1.833418,0.364103,0.156417,0.013641,False,entropy,,2,9,4,236,"{'bootstrap': False, 'criterion': 'entropy', '...",0.591091,0.591358,0.574553,0.590291,0.57642,0.584743,0.007588,1
6,1.377871,0.012282,0.116876,0.003005,True,entropy,,4,5,4,162,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.585756,0.593225,0.574286,0.585756,0.576154,0.583035,0.006965,2
3,2.739134,0.52027,0.170722,0.027536,False,gini,5.0,5,6,2,412,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.592158,0.586556,0.576954,0.585223,0.57402,0.582982,0.006613,3
5,1.980312,0.37641,0.124914,0.008748,True,gini,,6,5,3,153,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.583622,0.594025,0.567618,0.583889,0.576687,0.581168,0.008745,4
13,2.420599,0.515718,0.135903,0.022639,False,gini,5.0,7,1,7,310,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.587623,0.579621,0.576954,0.589757,0.571352,0.581062,0.00681,5
4,2.068528,0.419255,0.103405,0.005576,False,entropy,,6,6,5,121,"{'bootstrap': False, 'criterion': 'entropy', '...",0.584956,0.592691,0.568685,0.58629,0.568418,0.580208,0.009871,6
8,2.119426,0.435828,0.214295,0.027934,False,entropy,10.0,1,10,8,417,"{'bootstrap': False, 'criterion': 'entropy', '...",0.588157,0.579088,0.570552,0.58629,0.57562,0.579941,0.006563,7
12,0.724564,0.148668,0.054151,0.008527,True,entropy,3.0,2,9,9,146,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.585756,0.582822,0.57322,0.588957,0.564684,0.579088,0.008919,8
1,2.818713,1.377412,0.216029,0.080985,False,gini,10.0,1,6,3,260,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.588424,0.582022,0.568152,0.58629,0.569485,0.578874,0.008476,9
11,2.372432,0.240572,0.169385,0.031793,False,gini,3.0,5,10,10,486,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.592691,0.579088,0.572153,0.581755,0.56415,0.577967,0.009562,10
