In [1]:
import sys

# Detect Google Colab: the flag is True when the "google.colab" module has
# already been loaded into this interpreter.
_COLAB_MODULE = "google.colab"
IN_COLAB = _COLAB_MODULE in sys.modules
IN_COLAB  # bare last expression so the notebook displays the flag

True

In [2]:
if IN_COLAB:
    # google.colab is imported only inside this branch, i.e. only when the
    # IN_COLAB flag is true.
    from google.colab import drive

    # Mount Google Drive under /content/drive.
    drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
if IN_COLAB:
    # Set this to the full path of the folder that contains your Sprint
    # project code, e.g. "/content/drive/MyDrive/assignment".
    ROOT_DIR = "/content/drive/MyDrive/AnyoneAI/main"
    # IPython expands $ROOT_DIR to the Python variable above and changes the
    # working directory to it; presumably this is what lets the later relative
    # references ('dataset.csv', `from src import ...`) resolve -- TODO confirm.
    %cd $ROOT_DIR

/content/drive/MyDrive/AnyoneAI/main


In [4]:
pip install boto3 python_dotenv

Collecting boto3
  Downloading boto3-1.28.13-py3-none-any.whl (135 kB)
Collecting python_dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting botocore<1.32.0,>=1.31.13 (from boto3)
  Downloading botocore-1.31.13-py3-none-any.whl (11.0 MB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0 (from boto3)
  Downloading s3transfer-0.6.1-py3-none-any.whl (79 kB)

In [5]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, KFold

from src import config, data_utils, evaluation, plot

# Suppress warnings of category FutureWarning only; warnings of other
# categories are not affected by this filter.
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

In [6]:
# Load the dataset from the CSV file; the path is relative, so it is resolved
# against the current working directory.
dataset = pd.read_csv("dataset.csv")

In [7]:
# Perform random downsampling: every class of the target column is reduced to
# the size of the least frequent class, so the result is balanced.
TARGET_COL = 'TARGET_LABEL_BAD=1'
# Fixed seed so that a fresh "Restart & Run All" draws the same rows every
# time; without it each run trains and evaluates on a different subset.
# The value matches the random_state already used by the KFold splitter.
DOWNSAMPLE_SEED = 2

sample_size = dataset[TARGET_COL].value_counts().min()
dataset_resampled = pd.concat(
    [
        dataset[dataset[TARGET_COL] == label].sample(
            n=sample_size, random_state=DOWNSAMPLE_SEED
        )
        for label in dataset[TARGET_COL].unique()
    ]
)
print('Resampled dataset shape %s' % dataset_resampled[TARGET_COL].value_counts())

Resampled dataset shape 1    13018
0    13018
Name: TARGET_LABEL_BAD=1, dtype: int64


In [8]:
# Step 1: hand the resampled frame to data_utils.get_feature_in_set and unpack
# the three sets it returns (bound here as train, val and test).
(
    app_train_set,
    app_val_set,
    app_test_set,
) = data_utils.get_feature_in_set(dataset_resampled)

In [9]:
# Step 2: for each of the three sets, separate the features from the target.
# The helper returns six objects, unpacked as (features, target) pairs in
# train / val / test order.
(
    train_prep,
    y_train,
    val_prep,
    y_val,
    test_prep,
    y_test,
) = data_utils.get_feature_target(
    app_train_set,
    app_val_set,
    app_test_set,
)

In [10]:
# Step 3: run the preprocessing helper on the three feature sets. It returns
# the processed train / val / test inputs plus a fourth value bound to `columns`.
(
    X_train,
    X_val,
    X_test,
    columns,
) = data_utils.preprocess_data(
    train_prep,
    val_prep,
    test_prep,
)

Input train data shape:  (18745, 17)
Input val data shape:  (2083, 17)
Input test data shape:  (5208, 17) 

cat_cols:  Index(['APPLICATION_SUBMISSION_TYPE', 'SEX', 'RESIDENCIAL_STATE',
       'FLAG_RESIDENCIAL_PHONE', 'COMPANY', 'FLAG_PROFESSIONAL_PHONE',
       'PAYMENT_DAY', 'MARITAL_STATUS', 'NACIONALITY', 'RESIDENCE_TYPE',
       'MONTHLY_INCOMES_TOT', 'FLAG_CARDS', 'QUANT_BANKING_ACCOUNTS_TOT',
       'PERSONAL_ASSETS_VALUE', 'QUANT_CARS'],
      dtype='object')
['QUANT_DEPENDANTS', 'MONTHS_IN_RESIDENCE']
Index(['SEX', 'FLAG_RESIDENCIAL_PHONE', 'COMPANY', 'FLAG_PROFESSIONAL_PHONE',
       'PAYMENT_DAY', 'NACIONALITY', 'FLAG_CARDS',
       'QUANT_BANKING_ACCOUNTS_TOT', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS'],
      dtype='object')
Index(['APPLICATION_SUBMISSION_TYPE', 'RESIDENCIAL_STATE', 'MARITAL_STATUS',
       'RESIDENCE_TYPE', 'MONTHLY_INCOMES_TOT'],
      dtype='object')


In [11]:
# Base SVC estimator created without arguments, so every hyperparameter keeps
# the library default at this point.
model = SVC()

In [19]:
# Hyperparameter grid for the first search: polynomial kernel only, low
# degrees (1 and 2) and three values of the regularisation parameter C.
param_grid = {
    "kernel": ["poly"],
    "degree": [1, 2],
    "C": [1, 3, 5],
}

In [13]:
# 20-fold cross-validation splitter; shuffling is enabled with a fixed
# random_state so the fold assignment is repeatable.
cv = KFold(
    n_splits=20,
    shuffle=True,
    random_state=2,
)

In [14]:
# Grid search over param_grid using the splitter defined in `cv`.
# No `scoring` is passed, so the estimator's default scorer is used.
grid_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-2,
)

In [15]:
# Run the search on the training data. The call is left as the last expression
# of the cell, so the notebook displays its return value.
grid_model.fit(X=X_train, y=y_train)

In [16]:
# Report the best cross-validated score and the parameter combination that
# achieved it.
print(
    "The best score is: %f with the following hyperparameters %s"
    % (grid_model.best_score_, grid_model.best_params_)
)

The best score is: 0.569804 with the following hyperparameters {'C': 3, 'degree': 2, 'kernel': 'poly'}


In [17]:
# Predict on the test inputs with the fitted search object.
y_hat = grid_model.predict(X_test)

In [18]:
# Per-class precision / recall / F1 of the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_hat))

              precision    recall  f1-score   support

           0       0.56      0.56      0.56      2629
           1       0.55      0.55      0.55      2579

    accuracy                           0.55      5208
   macro avg       0.55      0.55      0.55      5208
weighted avg       0.55      0.55      0.55      5208



In [20]:
# Second search: same polynomial-kernel SVC setup as the earlier cells, but
# over higher degrees (3 and 5).
# NOTE: this rebinds `model`, `param_grid`, `cv` and `grid_model`, so the
# objects from the first search are no longer reachable after this cell runs.
model = SVC()
param_grid = {
    "kernel": ["poly"],
    "degree": [3, 5],
    "C": [1, 3, 5],
}
cv = KFold(
    n_splits=20,
    shuffle=True,
    random_state=2,
)
grid_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-2,
)

In [21]:
# Run the current grid search on the training data. The call is left as the
# last expression of the cell, so the notebook displays its return value.
grid_model.fit(X=X_train, y=y_train)

In [22]:
# Report the best cross-validated score of the current search and the
# parameter combination that achieved it.
print(
    "The best score is: %f with the following hyperparameters %s"
    % (grid_model.best_score_, grid_model.best_params_)
)

The best score is: 0.562069 with the following hyperparameters {'C': 1, 'degree': 3, 'kernel': 'poly'}


In [None]:
# Predict on the test inputs with the current fitted search object; this
# rebinds `y_hat`, replacing the predictions of the earlier search.
y_hat = grid_model.predict(X_test)