# In [None]:

import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
#%matplotlib inline



# =============================================================================
# Load Data
# =============================================================================

# Training data; the line below fetches it when run from a notebook shell.
#!wget -O loan_train.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv
df = pd.read_csv('loan_train.csv')
# First look at the raw frame: leading rows, then (rows, columns).
print(df.head())
print(df.shape)

# Convert both date columns from text to datetime, then show the frame again.
for date_col in ('due_date', 'effective_date'):
    df[date_col] = pd.to_datetime(df[date_col])
print(df.head())
print(df.columns)
# Column listing recorded from a previous run:
#Index(['Unnamed: 0', 'Unnamed: 0.1', 'loan_status', 'Principal', 'terms',
#       'effective_date', 'due_date', 'age', 'education', 'Gender'],
#      dtype='object')




# =============================================================================
# Data visualization and pre-processing
# =============================================================================
# Class counts of the target; the value is discarded when run as a script.
df['loan_status'].value_counts()


import seaborn as sns


# Distribution of loan amount and of age, split by gender (one facet each)
# and coloured by repayment status. The second tuple item is the number of
# bin edges spanning the column's min..max range.
for hist_col, n_edges in (('Principal', 10), ('age', 30)):
    bins = np.linspace(df[hist_col].min(), df[hist_col].max(), n_edges)
    g = sns.FacetGrid(df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2)
    g.map(plt.hist, hist_col, bins=bins, ec="k")
    # Attach the legend to the last facet only.
    g.axes[-1].legend()
    plt.show()

# Optional tabular views of the same distributions:
#df.groupby('Gender').count()
#df.groupby(['Gender', 'Principal', 'loan_status']).count()
#df.groupby(['Gender', 'age', 'loan_status']).count()





# =============================================================================
# Pre-processing: Feature selection/extraction
# =============================================================================
# Day of the week on which the loan became effective.
df['dayofweek'] = df['effective_date'].dt.dayofweek
first_day, last_day = df['dayofweek'].min(), df['dayofweek'].max()
bins = np.linspace(first_day, last_day, 8)
g = sns.FacetGrid(df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2)
g.map(plt.hist, 'dayofweek', bins=bins, ec="k")
g.axes[-1].legend()
plt.show()

#df.groupby(['Gender', 'dayofweek', 'loan_status']).count()


# weekend = Fri, Sat, Sun, i.e. every dayofweek value above 3 is flagged 1.
df['weekend'] = (df['dayofweek'] > 3).astype('int64')
df.head()


# Is late repayment related to gender?
df['loan_status'].groupby(df['Gender']).value_counts(normalize=True)
# Output recorded from a previous run:
#Gender  loan_status
#female  PAIDOFF        0.865385
#        COLLECTION     0.134615
#male    PAIDOFF        0.731293
#        COLLECTION     0.268707
#Name: loan_status, dtype: float64
# 86 % of females pay off their loans while only 73 % of males do.


# Is late repayment related to education level?
df['loan_status'].groupby(df['education']).value_counts(normalize=True)
# Output recorded from a previous run:
#education             loan_status
#Bechalor              PAIDOFF        0.750000
#                      COLLECTION     0.250000
#High School or Below  PAIDOFF        0.741722
#                      COLLECTION     0.258278
#Master or Above       COLLECTION     0.500000
#                      PAIDOFF        0.500000
#college               PAIDOFF        0.765101
#                      COLLECTION     0.234899
#Name: loan_status, dtype: float64
# The 'Master or Above' group appears to default far more often ...
#(df['education']=='Master or Above').value_counts()
#False    344
#True       2
# ... but with only two samples that proves nothing.



# Encode the two categorical columns as integers:
#   Gender      : male -> 0, female -> 1
#   loan_status : PAIDOFF -> 0, COLLECTION -> 1
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series,
# is deprecated in pandas 2.x and leaves `df` untouched under Copy-on-Write.
# `infer_objects()` keeps an integer dtype on pandas versions where `replace`
# no longer downcasts an object result; it is a no-op otherwise.
df['Gender'] = df['Gender'].replace(to_replace=['male','female'], value=[0,1]).infer_objects()
df['loan_status'] = df['loan_status'].replace(to_replace=['PAIDOFF','COLLECTION'], value=[0,1]).infer_objects()
df.head()





# =============================================================================
# Feature selection
# =============================================================================
# One-hot encoding of the education level.
# Preview of the candidate columns; the value is discarded in a script run.
df[['Principal','terms','age','Gender','education']].head()

# Indicator columns for every education level except 'Master or Above',
# appended to the numeric / already-encoded columns.
education_dummies = pd.get_dummies(df['education']).drop(columns=['Master or Above'])
Feature = pd.concat(
    [df[['Principal','terms','age','Gender','weekend']], education_dummies],
    axis=1,
)
Feature.head()

# X is the feature frame, y the encoded target as a NumPy array.
X = Feature
X[0:5]

y = df['loan_status'].to_numpy()
y[0:5]


# =============================================================================
# Normalize Data
# =============================================================================
# Standardise every feature column with a scaler fitted on X itself;
# X is rebound to the transformed array.
X = preprocessing.StandardScaler().fit_transform(X)
X[0:5]





# =============================================================================
# Classification
# =============================================================================

# =============================================================================
# K Nearest Neighbor(KNN)
# =============================================================================

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Hold out 20 % of the training data to pick the number of neighbours.
knn_X_train, knn_X_test, knn_y_train, knn_y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', knn_X_train.shape,  knn_y_train.shape)
print ('Test set:', knn_X_test.shape,  knn_y_test.shape)


# Try k = 1 .. Ks-1 and record, for each k, the hold-out accuracy and the
# standard error of the per-sample hit/miss indicator.
Ks = 100
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
# Never filled in this script; kept so the name stays defined.
ConfustionMx = []
for n in range(1,Ks):
    #Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors = n).fit(knn_X_train, knn_y_train)
    knn_yhat=neigh.predict(knn_X_test)
    mean_acc[n-1] = metrics.accuracy_score(knn_y_test, knn_yhat)
    std_acc[n-1]=np.std(knn_yhat==knn_y_test)/np.sqrt(knn_yhat.shape[0])

print(mean_acc)


#Plot model accuracy for Different number of Neighbors
plt.plot(range(1,Ks),mean_acc,'g')
plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)
# The shaded band above is one std_acc wide on each side, so the legend must
# say 1x (it previously claimed 3x).
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 


# Refit on the full training data with the best k (argmax is 0-based, k is 1-based).
k = mean_acc.argmax()+1
loadKNN = KNeighborsClassifier(n_neighbors = k).fit(X, y)




# =============================================================================
# Decision Tree
# =============================================================================
from sklearn.tree import DecisionTreeClassifier
# Entropy-criterion tree with no depth limit, trained on the full training data.
loanTree = DecisionTreeClassifier(criterion="entropy")
loanTree.fit(X, y)
# Depth-limited alternative:
#loanTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4).fit(X, y)






# =============================================================================
# Support Vector Machine
# =============================================================================
from sklearn import svm
# RBF-kernel support vector classifier trained on the full training data.
loanSVM = svm.SVC(kernel='rbf')
loanSVM.fit(X, y)
# Linear-kernel alternative:
#loanSVM = svm.SVC(kernel='linear').fit(X, y)







# =============================================================================
# Logistic Regression
# =============================================================================
from sklearn.linear_model import LogisticRegression
# Logistic regression with C=0.01 and the 'sag' solver, trained on the full
# training data.
loanLR = LogisticRegression(C=0.01, solver='sag')
loanLR.fit(X, y)
# 'liblinear' solver alternative:
#loanLR = LogisticRegression(C=0.01, solver='liblinear').fit(X, y)







# =============================================================================
# Model Evaluation using Test set
# =============================================================================
try:
    from sklearn.metrics import jaccard_similarity_score
except ImportError:
    # jaccard_similarity_score was removed in scikit-learn 0.23; jaccard_score
    # is its replacement. Note the values differ: the old function equalled
    # plain accuracy for binary labels, whereas jaccard_score reports the
    # Jaccard index of the positive class (label 1 = COLLECTION here).
    from sklearn.metrics import jaccard_score as jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss


test_df = pd.read_csv('loan_test.csv')
test_df.head()

# Parse the test frame's OWN date columns. (Reading them from the training
# frame `df` would silently give every test row a training row's dates and
# therefore a wrong dayofweek / weekend feature.)
test_df['due_date'] = pd.to_datetime(test_df['due_date'])
test_df['effective_date'] = pd.to_datetime(test_df['effective_date'])
test_df['dayofweek'] = test_df['effective_date'].dt.dayofweek
# weekend = Fri, Sat, Sun, same rule as for the training data.
test_df['weekend'] = test_df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
test_df.head()

# Same integer encoding as the training data. The result is assigned back
# because `test_df[col].replace(..., inplace=True)` mutates a temporary Series
# and does not update the frame under pandas Copy-on-Write.
test_df['Gender'] = test_df['Gender'].replace(to_replace=['male','female'], value=[0,1]).infer_objects()
test_df['loan_status'] = test_df['loan_status'].replace(to_replace=['PAIDOFF','COLLECTION'], value=[0,1]).infer_objects()
test_df.head()

X_test = test_df[['Principal','terms','age','Gender','weekend']]
X_test = pd.concat([X_test,pd.get_dummies(test_df['education'])], axis=1)
# Use exactly the training feature columns, in the training order: this drops
# 'Master or Above' and adds an all-zero column for any education level that
# does not occur in the test file.
X_test = X_test.reindex(columns=Feature.columns, fill_value=0)
# Scale with the TRAINING mean/std (the same statistics the models were
# trained with) instead of re-fitting a scaler on the test set.
X_test = preprocessing.StandardScaler().fit(Feature).transform(X_test)
X_test[0:5]

y_test = test_df['loan_status'].values
y_test[0:5]





# =============================================================================
# K Nearest Neighbor(KNN)
# =============================================================================

# Score the KNN model on the test set: Jaccard score and weighted F1.
yhat = loadKNN.predict(X_test)

jaccardKNN = jaccard_similarity_score(y_true=y_test, y_pred=yhat)
f1scoreKNN = f1_score(y_true=y_test, y_pred=yhat, average='weighted')

print("KNN", jaccardKNN, f1scoreKNN)



# =============================================================================
# Decision Tree
# =============================================================================
# Score the decision tree on the test set: Jaccard score and weighted F1.
yhat = loanTree.predict(X_test)

jaccardTree = jaccard_similarity_score(y_true=y_test, y_pred=yhat)
f1scoreTree = f1_score(y_true=y_test, y_pred=yhat, average='weighted')

print("Tree", jaccardTree, f1scoreTree)



# =============================================================================
# Support Vector Machine
# =============================================================================
# Score the SVM on the test set: Jaccard score and weighted F1.
yhat = loanSVM.predict(X_test)

jaccardSVM = jaccard_similarity_score(y_true=y_test, y_pred=yhat)
f1scoreSVM = f1_score(y_true=y_test, y_pred=yhat, average='weighted')

print("SVM", jaccardSVM, f1scoreSVM)




# =============================================================================
# Logistic Regression
# =============================================================================
# Hard class predictions, plus class probabilities for the log-loss metric.
yhat = loanLR.predict(X_test)
yhat_prob = loanLR.predict_proba(X_test)

jaccardLR = jaccard_similarity_score(y_true=y_test, y_pred=yhat)
# The weighted F1 only covers labels that actually occur in yhat.
# NOTE(review): presumably done to avoid an undefined-metric warning when a
# class is never predicted; the other models are scored over all labels, so
# this F1 is not strictly comparable with theirs. Confirm intent.
predicted_labels = np.unique(yhat)
f1scoreLR = f1_score(y_true=y_test, y_pred=yhat, average='weighted', labels=predicted_labels)
loglossLR = log_loss(y_true=y_test, y_pred=yhat_prob)

print("LR", jaccardLR, f1scoreLR, loglossLR)




# =============================================================================
# Report
# =============================================================================
# One row per algorithm; log loss is only computed for logistic regression,
# the other rows carry the placeholder string 'NA'.
report = {
    'Algorithm': ['KNN', 'Decision Tree', 'SVM', 'Logistic Regression'],
    'Jaccard': [jaccardKNN, jaccardTree, jaccardSVM, jaccardLR],
    'F1-score': [f1scoreKNN, f1scoreTree, f1scoreSVM, f1scoreLR],
    'LogLoss': ['NA', 'NA', 'NA', loglossLR],
}
df_report = pd.DataFrame(report).set_index('Algorithm')
print(df_report)
