In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Reading data without outliers
df = pd.read_csv('credit_data_process-2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Month,Occupation,Type_of_Loan,Credit_Mix,Payment_of_Min_Amount,Payment_Behaviour,Credit_Score,Age,Annual_Income,...,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,Credit_History_Months
0,1,2,12,128,1,1,4,0,23.0,19114.12,...,0.0,31.033317,11.27,4.0,809.98,31.94496,49.574949,118.280222,284.629163,220.840847
1,2,6,12,128,1,1,5,0,0.0,19114.12,...,3.0,7.0,10.389016,4.0,809.98,28.609352,49.574949,81.699521,331.209863,267.0
2,3,0,12,128,1,1,6,0,23.0,19114.12,...,5.0,4.0,6.27,4.0,809.98,31.377862,49.574949,199.458074,223.45131,268.0
3,4,7,12,128,1,1,2,0,23.0,19114.12,...,6.0,31.033317,11.27,4.0,809.98,24.797347,49.574949,41.420153,341.489231,269.0
4,5,5,12,128,1,1,0,0,23.0,19114.12,...,8.0,4.0,9.27,4.0,809.98,27.262259,49.574949,62.430172,340.479212,270.0


In [3]:
df = df.drop(['Unnamed: 0'],axis=1)

In [4]:
df.columns

Index(['Month', 'Occupation', 'Type_of_Loan', 'Credit_Mix',
       'Payment_of_Min_Amount', 'Payment_Behaviour', 'Credit_Score', 'Age',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio',
       'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance',
       'Credit_History_Months'],
      dtype='object')

In [5]:
# Selecting Top 10 features from the feature selection
# Top 10 features
# 1. Outstanding_Debt  
# 2. Annual_Income
# 3. Total_EMI_per_month 
# 4. Type_of_Loan
# 5. Interest_Rate
# 6. Credit_Mix
# 7. Delay_from_due_date  
# 8. Num_Credit_Inquiries
# 9. Payment_of_Min_Amount 
# 10. Num_Credit_Card 

In [6]:
df_1 = df[['Type_of_Loan', 'Credit_Mix', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Credit_Card', 'Interest_Rate', 'Delay_from_due_date',
       'Num_Credit_Inquiries', 'Outstanding_Debt', 'Total_EMI_per_month',
           'Credit_Score','Num_Bank_Accounts','Changed_Credit_Limit','Credit_History_Months','Num_of_Loan','Num_of_Delayed_Payment']]

In [7]:
X = df_1.drop(['Credit_Score'],axis=1)
y = df_1['Credit_Score']

In [8]:

scaler = MinMaxScaler()
model=scaler.fit(X)
scaled_X=model.transform(X)
scaled_X

array([[0.02045055, 0.33333333, 0.08770105, ..., 0.54663576, 0.42105263,
        0.85022788],
       [0.02045055, 0.33333333, 0.08770105, ..., 0.66089109, 0.42105263,
        0.19178082],
       [0.02045055, 0.33333333, 0.08770105, ..., 0.66336634, 0.42105263,
        0.10958904],
       ...,
       [0.10912286, 0.33333333, 0.23629269, ..., 0.94059406, 0.21052632,
        0.16438356],
       [0.10912286, 0.33333333, 0.23629269, ..., 0.94306931, 0.21052632,
        0.85022788],
       [0.10912286, 0.33333333, 0.22123968, ..., 0.94554455, 0.21052632,
        0.16438356]])

In [9]:
#splitting data into train,validation and test datasets
X_train, X_rem, y_train, y_rem = train_test_split(scaled_X,y, train_size=0.5,random_state=42)
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5,random_state=42)

In [10]:
# shapes of train,validation and  test split
print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(49999, 15)
(49999,)
(25000, 15)
(25000,)
(25000, 15)
(25000,)


(None, None)

# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [12]:
#fitting a random model 
log = LogisticRegression()
log.fit(X_train,y_train)

In [13]:
#random model on the validation dataset
y_valid_pred = log.predict(X_valid)
acc = accuracy_score(y_valid,y_valid_pred)

print(f"The accuracy for the model is {acc}")

The accuracy for the model is 0.6326


# Finding best parameters using Grid Search

In [14]:
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear','saga']
penalty = ['l2','l1','none']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

450 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Student\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Student\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Student\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: S

In [15]:
#best parameters
print(grid_result.best_params_)

{'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}


In [16]:
# fitting Logistic model using above parameters
best_model = LogisticRegression(penalty='l1',solver='saga',C=1.0)
best_model.fit(X_train,y_train)

In [17]:
#best model on the test dataset
y_test_pred = best_model.predict(X_test)
acc = accuracy_score(y_test,y_test_pred)

print(f"The accuracy for the model is {acc}")

The accuracy for the model is 0.6254


In [18]:
# Confusion matrx for the model
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

[[ 1780    71  2566]
 [  288  3739  3210]
 [ 1151  2079 10116]]


In [19]:
#Classification Report for the model
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.55      0.40      0.47      4417
           1       0.63      0.52      0.57      7237
           2       0.64      0.76      0.69     13346

    accuracy                           0.63     25000
   macro avg       0.61      0.56      0.58     25000
weighted avg       0.62      0.63      0.62     25000



# PCA model

In [26]:
#Reading pca data 
df = pd.read_csv('Credit_data_PCA.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,Credit_Score
0,0,-0.136363,-0.370934,0.041678,0.263842,-0.222105,0.292258,0.251458,-0.438477,-0.19056,...,0.547227,-0.097278,-0.341116,-0.124602,-0.107184,-0.040613,-0.066402,0.042967,0.056982,0
1,1,-0.226325,-0.463382,0.151887,0.296798,0.345578,0.304052,0.211837,-0.472029,-0.316839,...,-0.020493,-0.073904,-0.188365,-0.469507,0.04176,-0.076142,-0.225868,-0.041018,0.090555,0
2,2,-0.273298,-0.478896,0.422737,0.29527,-0.517704,0.286965,0.194752,-0.482383,-0.322554,...,-0.117989,0.019139,-0.105781,-0.162512,-0.098846,-0.021087,-0.085814,-0.016029,0.08419,0
3,3,-0.126936,-0.375277,-0.303028,0.312867,0.500363,0.306469,0.272134,-0.428103,-0.231405,...,0.576941,0.048185,-0.21029,-0.179118,0.028326,-0.181783,-0.088332,0.009019,0.133855,0
4,4,-0.337219,-0.249693,-0.537153,0.358871,0.219459,0.308859,0.244402,-0.453667,-0.36867,...,-0.019088,0.321884,-0.02955,-0.192756,-0.054961,-0.077647,-0.08189,-0.034598,0.105237,0


In [27]:
X = df.drop(['Credit_Score','Unnamed: 0'],axis=1)
y = df['Credit_Score']

In [28]:
#splitting data into train,validation and test datasets
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.5,random_state=42)
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5,random_state=42)
# shapes of train,validation and  test split
print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(49999, 19)
(49999,)
(25000, 19)
(25000,)
(25000, 19)
(25000,)


(None, None)

In [29]:
#fitting a random model 
log = LogisticRegression()
log.fit(X_train,y_train)

In [30]:
#random model on the validation dataset
y_valid_pred = log.predict(X_valid)
acc = accuracy_score(y_valid,y_valid_pred)

print(f"The accuracy for the model is {acc}")

The accuracy for the model is 0.63304


# Finding best parameters using grid search

In [89]:
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear','saga']
penalty = ['l2','l1','none']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

450 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Student\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Student\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Student\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: S

In [90]:
grid_result.best_params_

{'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}

In [83]:
# fitting Logistic model using above parameters
best_model = LogisticRegression(penalty='l1',solver='saga',C=1.0)
best_model.fit(X_train,y_train)

In [91]:
# fitting Logistic model using above parameters
best_model = LogisticRegression(penalty='l1',solver='saga',C=1.0)
best_model.fit(X_train,y_train)

In [92]:
#best model on the test dataset
y_test_pred = best_model.predict(X_test)
acc = accuracy_score(y_test,y_test_pred)

print(f"The accuracy for the model is {acc}")

The accuracy for the model is 0.61252


In [93]:
# Confusion matrx for the model
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

[[ 1587    77  2753]
 [  194  3697  3346]
 [  970  2347 10029]]


In [94]:
#Classification Report for the model
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.58      0.36      0.44      4417
           1       0.60      0.51      0.55      7237
           2       0.62      0.75      0.68     13346

    accuracy                           0.61     25000
   macro avg       0.60      0.54      0.56     25000
weighted avg       0.61      0.61      0.60     25000

