In [None]:
"""
Created on Mon Jul  8 12:07:07 2019

@author: vieth
"""
import os
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression

## Dataset overview

In [None]:
# Load the approved-loan export. `skipfooter` is only implemented by the
# Python parser, so request that engine explicitly instead of relying on
# pandas' warn-and-fall-back behaviour.
loan_df = pd.read_csv('data/LoanStats_2018Q2.csv', skiprows=1, skipfooter=2,
                      engine='python')
# Fill missing cells by carrying the previous row's value forward
# (`.ffill()` is the non-deprecated spelling of `fillna(method='ffill')`).
# NOTE(review): forward-filling copies values between unrelated loans --
# confirm this is the intended imputation.
loan_df = loan_df.ffill()

# Same loading and imputation for the rejected applications.
rj_loan_df = pd.read_csv('data/RejectStats_2018Q2.csv', skiprows=1, skipfooter=2,
                         engine='python')
rj_loan_df = rj_loan_df.ffill()

# The summary cell further down reads a numeric `term_months` column, so
# derive it from "term" before that column is dropped.
# Assumes "term" holds values such as " 36 months" -- TODO confirm against the CSV.
if 'term_months' not in loan_df.columns:
    loan_df['term_months'] = pd.to_numeric(
        loan_df['term'].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )

loan_df = loan_df.drop(columns="term")
# Preview only the first rows rather than rendering the whole frame.
loan_df.head()

## Data cleansing and preparation

In [None]:
# Summary statistics for the approved loans.
print ("\n")
print ("==========Approve loan=========")

avg_loan_amt = loan_df["loan_amnt"].mean()
print("Average loan amount:", avg_loan_amt)

avg_length_amt = loan_df["term_months"].mean()
print("Average length:", avg_length_amt, "months")

avg_annual_inc_amt = loan_df["annual_inc"].mean()
print("Average annual incom amount:", avg_annual_inc_amt)

# Convert employment length to a whole number of years.
# "< 1 year" counts as 0, and anything without a number (n/a, missing) is 0.
# The full leading number is extracted: taking only the first character
# would turn "10+ years" into 1. Casting to str first keeps the cell
# re-runnable once the column already holds integers.
emp_length_text = loan_df['emp_length'].replace("< 1 year", '0 year').astype(str)
loan_df['emp_length'] = (
    pd.to_numeric(emp_length_text.str.extract(r'(\d+)', expand=False), errors='coerce')
    .fillna(0)
    .astype(int)
)

avg_emp_length_amt = loan_df["emp_length"].mean()
print("Average employment length:", avg_emp_length_amt, "years")

# NOTE(review): this is the average loan amount divided by the average annual
# income (a ratio of means), not a mean of per-loan debt-to-income values.
avg_debt_inc_ratio = avg_loan_amt/avg_annual_inc_amt
print("Average debt-to-income ratio:", avg_debt_inc_ratio)

loan_approval = len(loan_df)
print ("Approved loans:",loan_approval)

In [None]:
# Same summary statistics for the rejected applications.
print ("==========Reject loan=========")
avg_rj_loan_amt = rj_loan_df["Amount Requested"].mean()
print("Average reject loan amount:", avg_rj_loan_amt)

# Convert employment length to a whole number of years.
# "< 1 year" counts as 0, and anything without a number (n/a, missing) is 0.
# The full leading number is extracted: taking only the first character
# would turn "10+ years" into 1. Casting to str first keeps the cell
# re-runnable once the column already holds integers.
rj_emp_length_text = (
    rj_loan_df['Employment Length'].replace("< 1 year", '0 year').astype(str)
)
rj_loan_df['Employment Length'] = (
    pd.to_numeric(rj_emp_length_text.str.extract(r'(\d+)', expand=False), errors='coerce')
    .fillna(0)
    .astype(int)
)

# The column is already integer-typed here, so no further casting is needed.
avg_rj_emp_length_amt = rj_loan_df["Employment Length"].mean()
print("Average reject employment length:", avg_rj_emp_length_amt, "years")

# NOTE(review): assumes a numeric "debt-to-inc ratio" column exists in the
# reject file -- confirm the exact column name and dtype against the CSV.
avg_rj_debt_inc_ratio = rj_loan_df["debt-to-inc ratio"].mean()
print("Average reject debt-to-income ratio:", avg_rj_debt_inc_ratio)

loan_denied = len(rj_loan_df)
print ("Denied loans:",loan_denied)

print ("\n")
# Number of distinct loan ids per grade among the approved loans.
over_view_df = loan_df.groupby("grade").id.nunique()
print ("Approved loanee")
print (over_view_df)

In [None]:
# Bar chart of the per-grade counts computed above. By the original author's
# estimate, grades A, B and C account for over 70% of the approved loans.
over_view_df.plot(kind="bar")

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# Keep the four predictor columns plus the `grade` label, discarding any row
# that still has a missing value in one of them.
loan_df_shaped = loan_df[
    ['loan_amnt', 'grade', 'home_ownership', 'annual_inc', 'verification_status']
].dropna()

# Separate the label from the predictors.
loan_df_target = loan_df_shaped['grade']
loan_df_data = loan_df_shaped.drop(columns=['grade'])

# Hold out 25% of the rows for evaluation; the seed is fixed so the split is repeatable.
(loan_df_data_train,
 loan_df_data_test,
 loan_df_target_train,
 loan_df_target_test) = train_test_split(
    loan_df_data,
    loan_df_target,
    test_size=0.25,
    random_state=42,
)

loan_df_data_train

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

# Turn each training row into a {column: value} record, then let
# DictVectorizer produce a dense integer feature matrix from those records.
loan_dict = loan_df_data_train.to_dict(orient='records')
vec = DictVectorizer(sparse=False, dtype=int)
vec_train = vec.fit_transform(loan_dict)

In [None]:
# Train a Gaussian Naive Bayes classifier: encoded training features -> loan grade.
model = GaussianNB()
model.fit(X=vec_train, y=loan_df_target_train.array)

In [None]:
# Encode the held-out rows with the vectorizer that was fitted on the
# training rows. Use `transform`, not `fit_transform`: refitting on the test
# set would rebuild the feature vocabulary, so the test matrix could end up
# with different columns than the ones the model was trained on.
loan_dict_test = loan_df_data_test.to_dict('records')
vec_test = vec.transform(loan_dict_test)

# Predicted grade for every held-out loan.
labels = model.predict(vec_test)

In [None]:
# Disabled confusion-matrix heatmap of true vs. predicted grade; only the
# seaborn import below is live.
# NOTE(review): the disabled call passes the full training label array as tick
# labels -- it likely needs the distinct class labels instead before being
# re-enabled.
#from sklearn.metrics import confusion_matrix
import seaborn as sns
#mat = confusion_matrix(loan_df_target_test.array, labels)
#sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
#            xticklabels=loan_df_target_train.array, yticklabels=loan_df_target_train.array)
#plt.xlabel('true label')
#plt.ylabel('predicted label');

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out loans whose predicted grade equals the true grade.
accuracy_score(y_true=loan_df_target_test.array, y_pred=labels)

## K-means clustering

In [None]:
from sklearn.cluster import KMeans

# Seven clusters, one per loan grade (A-G).
# Cluster indices depend on the random initialisation, so the seed is fixed:
# the grade -> cluster mapping used further down is hard-coded and only makes
# sense if the indices are the same on every run. `n_init` is pinned as well
# because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)

In [None]:
# Encode every row (train and test together) for unsupervised clustering.
# A dedicated vectorizer is used so the shared `vec`, which the Naive Bayes
# cells rely on, keeps the vocabulary it learned from the training rows.
loan_dict_data = loan_df_data.to_dict('records')
kmeans_vec = DictVectorizer(sparse=False, dtype=int)
vec_kmean = kmeans_vec.fit_transform(loan_dict_data)

# Fit the clusters, then read back the cluster index assigned to each row.
kmeans.fit(vec_kmean)
y_kmeans = kmeans.predict(vec_kmean)

In [None]:
# Display the cluster index k-means assigned to each encoded row.
y_kmeans

In [None]:
centers = kmeans.cluster_centers_

# Rows plotted on the first two encoded feature columns, coloured by cluster.
# NOTE(review): which features land in columns 0 and 1 depends on the
# vectorizer's column ordering -- confirm these are the intended axes.
plt.scatter(x=vec_kmean[:, 0], y=vec_kmean[:, 1], c=y_kmeans, s=50, cmap='viridis')

# Cluster centres drawn on top as large translucent black markers.
plt.scatter(x=centers[:, 0], y=centers[:, 1], c='black', s=200, alpha=0.5);

In [None]:
# Hand-assigned cluster index for each grade letter.
# NOTE(review): presumably chosen by inspecting one particular k-means run --
# confirm it still matches the clusters being scored.
grade_to_cluster = {'A': 1, 'B': 5, 'C': 2, 'D': 0, 'E': 3, 'F': 4, 'G': 6}

# Translate every grade into its cluster index; values outside A-G are skipped.
vec_kmean_target = [
    grade_to_cluster[grade]
    for grade in loan_df_target
    if grade in grade_to_cluster
]

In [None]:
# Compare the hand-mapped grade codes with the k-means cluster assignments.
# NOTE(review): cluster indices are arbitrary labels, so this score is only
# meaningful while the grade -> cluster mapping matches the fitted clusters.
accuracy_score(vec_kmean_target,y_kmeans)

In [None]:
# Second feature set: loan amount and job title, with `grade` as the label.
# Rows with a missing value in any of the three columns are discarded.
loan_df_shaped2 = loan_df[['loan_amnt', 'grade', 'emp_title']].dropna()

loan_df_target2 = loan_df_shaped2['grade']
loan_df_data2 = loan_df_shaped2.drop(columns=['grade'])

In [None]:
loan_dict_data2 = loan_df_data2.to_dict('records')

# `emp_title` is a string column, so one-hot encoding adds one feature per
# distinct title. Keep the matrix sparse: a dense array of that width can
# exhaust memory, and KMeans accepts sparse input directly.
# A dedicated vectorizer also leaves the shared `vec` fitted on the training
# rows used by the Naive Bayes cells.
title_vec = DictVectorizer(sparse=True, dtype=int)
vec_kmean2 = title_vec.fit_transform(loan_dict_data2)

# NOTE: this refits the shared `kmeans` estimator, replacing the centres
# learned from the first feature set.
kmeans.fit(vec_kmean2)
y_kmeans2 = kmeans.predict(vec_kmean2)