In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('Dataset 5.csv')

## Feature Engineering

In [3]:
df.drop(['Unnamed: 0', 'MonthlyIncome', 'NumberOfDependents', 'age'], axis = 1, inplace = True)

In [4]:
# renames columns to increase readability
df.columns = ['Risky Borrower',
              'Revolving Utilization',
              'Late 1',
              'Debt Ratio',
              'Small Loans',
              'Late 3',
              'Real Estate Loans',
              'Late 2']

In [5]:
def rem_outliers(val):
    if val > 1 and val < 10:
        return val / 10
    elif val > 10 and val < 100:
        return val / 100
    elif val > 100 and val < 1000:
        return val / 1000
    elif val > 1000 and val < 10000:
        return val / 10000
    elif val > 10000:
        return val / 100000
    else:
        return val

In [6]:
df['Revolving Utilization'] = df['Revolving Utilization'].apply(lambda x: rem_outliers(x))

In [7]:
df['Debt Ratio'] = df['Debt Ratio'].apply(lambda x: rem_outliers(x))

In [8]:
df.drop(df[(df['Late 1'] > 20) | (df['Late 2'] > 20) | (df['Late 3'] > 20)].index, inplace = True)

In [9]:
df['Light Delinquencies'] = df['Late 1'] + df['Late 2']
df['Serious Delinquencies'] = df['Late 3']

In [10]:
df.drop(index = df[(df['Small Loans'] > 20) | (df['Real Estate Loans'] > 6)].index, inplace = True)

In [11]:
lst = []
for x in df.index:
    if df['Small Loans'][x] + df['Real Estate Loans'][x] == 0:
        lst.append(0)
    else:
        lst.append((df['Light Delinquencies'][x] + df['Serious Delinquencies'][x]) / (df['Small Loans'][x] + df['Real Estate Loans'][x]))

In [12]:
for x in lst:
    if x > 1:
        x = 1

In [13]:
df['Percent Ever Delinquent'] = lst

In [14]:
df.drop(['Late 1', 'Late 2', 'Late 3', 'Small Loans', 'Real Estate Loans'], axis = 1, inplace = True)

In [15]:
# dataset is skewed; too many 0 vals relative to 1 vals

sub1 = df[df['Risky Borrower'] == 0].sample(n = 9000)
sub2 = df[df['Risky Borrower'] == 1].sample(n = 9000)
sample = pd.concat([sub1, sub2])
sample.reset_index(inplace = True)
sample.drop('index', axis = 1, inplace = True)

In [16]:
sample.to_csv('Cleaned Data')

# Test

In [17]:
X = sample.drop('Risky Borrower', axis = 1)
y = sample['Risky Borrower']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [19]:
model = LogisticRegression()

In [20]:
model.fit(X_train, y_train)

LogisticRegression()

In [264]:
model.coef_ = np.array([[1.73453, 1.3275, 0.191847, 0.39283, 1.94902]])

In [265]:
pred = model.predict(X_test)

In [266]:
print(confusion_matrix(y_test, pred))

[[1698  514]
 [ 567 1721]]


In [267]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2212
           1       0.77      0.75      0.76      2288

    accuracy                           0.76      4500
   macro avg       0.76      0.76      0.76      4500
weighted avg       0.76      0.76      0.76      4500



In [289]:
test = pd.DataFrame(data = [[0, 0, 0, 0, 0]], columns = sample.drop('Risky Borrower', axis = 1).columns)

In [290]:
print(model.predict_proba(test)[0][1])
if model.predict_proba(test)[0][1] > 0.6:
    print('You will likely be classified by lending insitutions as a risky borrower.')
    if model.predict_proba(test)[0][1] > 0.8:
        print('Model Conviction: HIGH')
elif model.predict_proba(test)[0][1]:
    print('You will likely be classified by lending institutions as a safe borrower.')
    if model.predict_proba(test)[0][1] < 0.2:
        print('Model Conviction: HIGH')
else:
    print('Based on the feature you have inputted, it is unclear whether you will be classified as safe or risky. Your classification may depend on other factors.')

0.17593625618354905
You will likely be classified by lending institutions as a safe borrower.
Model Conviction: HIGH
