In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

## Reading and understanding Dataset

In [2]:
df = pd.read_csv("a_Dataset_CreditScoring.csv")
df.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,92%,21%,2,3,7,21%,4,4,0%
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,80%,0%,0,0,0,100%,12,0,100%
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,36%,65%,0,1,1,73%,1,1,53%
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,91%,25%,1,1,1,75%,7,1,133%
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,125%,0%,0,1,4,14%,3,1,0%


In [3]:
df = df.drop("ID", axis =1)
df.head()

Unnamed: 0,TARGET,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,3,3,0,4,0.0,5,117,27,0,...,3.0,92%,21%,2,3,7,21%,4,4,0%
1,1,15,9,0,3,1.0,3,14,14,0,...,1.0,80%,0%,0,0,0,100%,12,0,100%
2,1,0,0,0,1,5.0,1,354,7,0,...,5.0,36%,65%,0,1,1,73%,1,1,53%
3,1,8,5,0,6,1.0,10,16,4,0,...,3.0,91%,25%,1,1,1,75%,7,1,133%
4,1,3,1,0,9,0.0,8,130,52,0,...,1.0,125%,0%,0,1,4,14%,3,1,0%


In [4]:
df.shape

(3000, 29)

In [5]:
df.isna().sum()

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

## Preprocessing Data

In [6]:
df=df.fillna(0)

In [7]:
df.head()

Unnamed: 0,TARGET,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,3,3,0,4,0.0,5,117,27,0,...,3.0,92%,21%,2,3,7,21%,4,4,0%
1,1,15,9,0,3,1.0,3,14,14,0,...,1.0,80%,0%,0,0,0,100%,12,0,100%
2,1,0,0,0,1,5.0,1,354,7,0,...,5.0,36%,65%,0,1,1,73%,1,1,53%
3,1,8,5,0,6,1.0,10,16,4,0,...,3.0,91%,25%,1,1,1,75%,7,1,133%
4,1,3,1,0,9,0.0,8,130,52,0,...,1.0,125%,0%,0,1,4,14%,3,1,0%


In [8]:
df.isna().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

In [9]:
df["TARGET"].value_counts()

TARGET
0    2500
1     500
Name: count, dtype: int64

In [10]:
def remove_first_letter(df, column_name):
    df[column_name] = df[column_name].str[1:]
    return df

df = remove_first_letter(df, 'TLSum')
df = remove_first_letter(df, 'TLMaxSum')

In [11]:
def remove_commas(df, column_name):
    df[column_name] = df[column_name].str.replace(',', '')
    return df

# Applying the function
df = remove_commas(df, 'TLSum')
df = remove_commas(df, 'TLMaxSum')

In [12]:
df['TLSum'] = df['TLSum'].replace("", np.nan, regex=True)
df['TLMaxSum'] = df['TLMaxSum'].replace("", np.nan, regex=True)

In [13]:
def convert_percentage(column):
    return column.str.replace('%', '').astype(float)

# Applying the function
df['TLBalHCPct'] = convert_percentage(df['TLBalHCPct'])
df['TLSatPct'] = convert_percentage(df['TLSatPct'])
df['TLOpenPct'] = convert_percentage(df['TLOpenPct'])
df['TLOpen24Pct'] = convert_percentage(df['TLOpen24Pct'])

In [14]:
df.isna().sum()

TARGET              0
DerogCnt            0
CollectCnt          0
BanruptcyInd        0
InqCnt06            0
InqTimeLast         0
InqFinanceCnt24     0
TLTimeFirst         0
TLTimeLast          0
TLCnt03             0
TLCnt12             0
TLCnt24             0
TLCnt               0
TLSum              40
TLMaxSum           40
TLSatCnt            0
TLDel60Cnt          0
TLBadCnt24          0
TL75UtilCnt         0
TL50UtilCnt         0
TLBalHCPct         41
TLSatPct            4
TLDel3060Cnt24      0
TLDel90Cnt24        0
TLDel60CntAll       0
TLOpenPct           3
TLBadDerogCnt       0
TLDel60Cnt24        0
TLOpen24Pct         3
dtype: int64

In [15]:
df = df.fillna(1)
df.isna().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

## Creating Training set and testing set

In [16]:
X = df.drop("TARGET", axis = 1)
y = df[['TARGET']]
X.shape

(3000, 28)

In [18]:
X = X.fillna(0)

X = X.apply(pd.to_numeric, errors='coerce')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
X_train = X_train.fillna(0)
X_train = X_train.fillna(0)


In [19]:
sc = StandardScaler()
X_test = sc.fit_transform(X_test)
X_train = sc.fit_transform(X_train)

## Creating ML model

In [20]:
reg = LogisticRegression()
reg.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


In [22]:
y_pred = reg.predict(X_test)

In [23]:
confusion_matrix(y_test, y_pred)

array([[465,  17],
       [ 97,  21]])

In [24]:
accuracy_score(y_test, y_pred)

0.81

In [26]:
prediction = reg.predict_proba(X_test)
prediction

array([[0.80815837, 0.19184163],
       [0.91818409, 0.08181591],
       [0.71731102, 0.28268898],
       ...,
       [0.54587154, 0.45412846],
       [0.97794858, 0.02205142],
       [0.98800271, 0.01199729]])