In [1]:
import numpy as np
import pandas as pd

df_original = pd.read_csv("original_dataset_credit_score_corrected.csv")

In [2]:
X2 = df_original.drop('Loan Status',1)
Y2 = df_original['Loan Status']

In [3]:
###########################################Creating the new csv with Feature Tools created features ##############
import sklearn.feature_selection 

#Such a large set of features may cause over fitting  and also slow computing
#Use feature selection  to select the most importatnt features

select = sklearn.feature_selection.SelectKBest(k=11)
selected_features = select.fit(X2, Y2)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [X2.columns[i] for i in indices_selected]

X2 = X2[colnames_selected]

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X2, Y2,
                                                    stratify=Y2, 
                                                    test_size=0.25)

In [5]:
############################### Down Sampling of majority class #####################################

from sklearn.utils import resample

df_original = X_train.reset_index(drop=True).merge(y_train.reset_index(drop=True), left_index=True, right_index=True)

# Separate majority and minority classes
df_majority = df_original[df_original['Loan Status']==0]
df_minority = df_original[df_original['Loan Status']==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=10000,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [6]:
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE

X2 = df_downsampled.drop('Loan Status',1)
Y2 = df_downsampled['Loan Status']

sm = SMOTE(random_state=12, ratio = 1.0)
X_train, y_train = sm.fit_sample(X2, Y2)



In [7]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn import svm as SVM
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer

cv = StratifiedKFold(n_splits=10, random_state=42)

In [10]:
################################### SVC Linear ##############################################
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy
#param values
penal=["l2"]
losses=["hinge","squared_hinge"]
a=numpy.arange(.00000001, .0001, .00005)
tolerance=a.tolist()
c_regularization=range(10,5000,50)
intercept_scale=range(1,100,4)
max_iters=range(500,4000,100)
#all param-val dictionary
#grid_params_lr = dict('C':[C_regularization], 'penalty':["l1","l2"], 'intercept_scaling':[ntercept_scal_vals], 'max_iter':[max_iter_vals], 'solver' :["newton-cg", "llbfgs", "sag"])
grid_params_svm = dict(penalty=penal, loss=losses, tol=tolerance, C=c_regularization, max_iter=max_iters)
#creating  grid instance  
#KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2)
svm = LinearSVC(fit_intercept=False)
#neigh_grid=GridSearchCV(knn,grid_params_nn,cv=10)
svm_ins=RandomizedSearchCV(svm, grid_params_svm,cv=10,scoring="recall", verbose=10, n_jobs=-1, n_iter=1000)
svm_ins.fit(X_train, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3

[Parallel(n_jobs=-1)]: Done 9232 tasks      | elapsed: 139.2min
[Parallel(n_jobs=-1)]: Done 9369 tasks      | elapsed: 141.6min
[Parallel(n_jobs=-1)]: Done 9506 tasks      | elapsed: 143.6min
[Parallel(n_jobs=-1)]: Done 9645 tasks      | elapsed: 145.3min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 147.5min
[Parallel(n_jobs=-1)]: Done 9925 tasks      | elapsed: 149.7min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed: 150.6min finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                       fit_intercept=False, intercept_scaling=1,
                                       loss='squared_hinge', max_iter=1000,
                                       multi_class='ovr', penalty='l2',
                                       random_state=None, tol=0.0001,
                                       verbose=0),
                   iid='warn', n_iter=1000, n_jobs=-1,
                   param_distributions={'C': range(10, 5000, 50),
                                        'loss': ['hinge', 'squared_hinge'],
                                        'max_iter': range(500, 4000, 100),
                                        'penalty': ['l2'],
                                        'tol': [1e-08, 5.0010000000000004e-05]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_tr

In [11]:
svm = svm_ins.best_estimator_
svm_ins.best_params_

{'C': 10,
 'loss': 'squared_hinge',
 'max_iter': 3400,
 'penalty': 'l2',
 'tol': 1e-08}

In [12]:
svm_ins.best_score_

0.9019

In [13]:
from sklearn.metrics import classification_report

####### i want to use svm_ins.best_estimator_ ################

y_pred = svm.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.32      0.47      7243
           1       0.20      0.90      0.33      1374

    accuracy                           0.41      8617
   macro avg       0.57      0.61      0.40      8617
weighted avg       0.82      0.41      0.45      8617

