In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('Dataset 1.csv')

## Feature Engineering

In [3]:
df.drop(['Unnamed: 0', 'MonthlyIncome', 'NumberOfDependents', 'age'], axis = 1, inplace = True)

In [4]:
# renames columns to increase readability
df.columns = ['Risky Borrower',
              'Revolving Utilization',
              'Late 1',
              'Debt Ratio',
              'Small Loans',
              'Late 3',
              'Real Estate Loans',
              'Late 2']

In [5]:
def rem_outliers(val):
    if val > 1 and val < 10:
        return val / 10
    elif val > 10 and val < 100:
        return val / 100
    elif val > 100 and val < 1000:
        return val / 1000
    elif val > 1000 and val < 10000:
        return val / 10000
    elif val > 10000:
        return val / 100000
    else:
        return val

In [6]:
df['Revolving Utilization'] = df['Revolving Utilization'].apply(lambda x: rem_outliers(x))

In [7]:
df['Debt Ratio'] = df['Debt Ratio'].apply(lambda x: rem_outliers(x))

In [8]:
df.drop(df[(df['Late 1'] > 20) | (df['Late 2'] > 20) | (df['Late 3'] > 20)].index, inplace = True)

In [9]:
df['Light Delinquencies'] = df['Late 1'] + df['Late 2']
df['Serious Delinquencies'] = df['Late 3']

In [10]:
df.drop(index = df[(df['Small Loans'] > 20) | (df['Real Estate Loans'] > 6)].index, inplace = True)

In [11]:
lst = []
for x in df.index:
    if df['Small Loans'][x] + df['Real Estate Loans'][x] == 0:
        lst.append(0)
    else:
        lst.append((df['Light Delinquencies'][x] + df['Serious Delinquencies'][x]) / (df['Small Loans'][x] + df['Real Estate Loans'][x])*100)

In [12]:
df['Percent Ever Delinquent'] = lst

In [13]:
df['Percent Ever Delinquent'] = df['Percent Ever Delinquent'].apply(lambda x: 100 if x > 100 else x)

In [14]:
df.drop(['Late 1', 'Late 2', 'Late 3', 'Small Loans', 'Real Estate Loans'], axis = 1, inplace = True)

In [15]:
# dataset is skewed; too many 0 vals relative to 1 vals

sub1 = df[df['Risky Borrower'] == 0].sample(n = 9000)
sub2 = df[df['Risky Borrower'] == 1].sample(n = 9000)
sample = pd.concat([sub1, sub2])
sample.reset_index(inplace = True)
sample.drop('index', axis = 1, inplace = True)

In [16]:
sample.to_csv('Cleaned Data')

## Model

In [17]:
scaler = MinMaxScaler()

In [18]:
scaled_features = scaler.fit_transform(sample.drop('Risky Borrower', axis = 1))

In [19]:
X = scaled_features #sample.drop('Risky Borrower', axis = 1)
y = sample['Risky Borrower']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [21]:
model = LogisticRegression()

In [22]:
model.fit(X_train, y_train)

LogisticRegression()

In [23]:
model.coef_ = np.array([[ 1.2161838, 132.85124146, 2.82177439, 4.45186543,  9.03341318]])

In [24]:
pred = model.predict(X_test)

In [25]:
print(confusion_matrix(y_test, pred))

[[1689  523]
 [ 571 1717]]


In [26]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.75      0.76      0.76      2212
           1       0.77      0.75      0.76      2288

    accuracy                           0.76      4500
   macro avg       0.76      0.76      0.76      4500
weighted avg       0.76      0.76      0.76      4500

