In [143]:
import turicreate

In [144]:
loans = turicreate.SFrame('../data/lending-club-data.sframe/')

In [145]:
loans.column_names()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec

In [146]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')

In [147]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'annual_inc',
           ]

target = 'safe_loans'                   # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

In [148]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print(f"Number of safe loans  : {len(safe_loans_raw)}")
print(f"Number of risky loans : {len(risky_loans_raw)}")

Number of safe loans  : 99457
Number of risky loans : 23150


In [149]:
totalLoans = len(safe_loans_raw) + len(risky_loans_raw)
safe_loans_raw_percentage = len(safe_loans_raw)/totalLoans * 100
risky_loans_raw_percentage = len(risky_loans_raw)/totalLoans * 100
# print(total_loans)
print(f"Percentage of safe loans  : {safe_loans_raw_percentage} %",) 
print(f"Percentage of risky loans : {risky_loans_raw_percentage} %",)

Percentage of safe loans  : 81.11853319957262 %
Percentage of risky loans : 18.881466800427383 %


In [150]:
# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))

risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)

In [151]:
print(f"Percentage of safe loans                 : {len(safe_loans) / float(len(loans_data)) *100} %")
print(f"Percentage of risky loans                : {len(risky_loans) / float(len(loans_data)) * 100} %")
print(f"Total number of loans in our new dataset : {len(loans_data)}")

Percentage of safe loans                 : 50.22361744216048 %
Percentage of risky loans                : 49.77638255783951 %
Total number of loans in our new dataset : 46508


In [152]:
#spilt 80% train_data and 20% validation_data
train_data, validation_data = loans_data.random_split(.8, seed=1)

In [153]:
def printModelEvaluation(model):
    
    print("-------------------------ACCURACY-------------------------")
    accuracytTrainScore = model.evaluate(train_data)['accuracy']
    accuracyValidationScore = model.evaluate(validation_data)['accuracy']

    print(f'{model._name()} Train Accuracy Score Score is: {accuracytTrainScore * 100} %')
    print(f'{model._name()} Validation Accuracy Score is: {accuracyValidationScore * 100} %')
    
    print("-------------------------PRECISION-------------------------")
    precisionTrainScore = model.evaluate(train_data)["precision"]
    precisionValidationScore = model.evaluate(validation_data)["precision"]

    print(f'{model._name()} Train Precision Score is: {precisionTrainScore * 100} %')
    print(f'{model._name()} Validation Precision Score is: {precisionValidationScore * 100} %')
    
    print("-------------------------RECALL-------------------------")
    recallTrainScore = model.evaluate(train_data)["recall"]
    recallValidationScore = model.evaluate(validation_data)["recall"]

    print(f'{model._name()} Train Recall Score Score is: {recallTrainScore * 100} %')
    print(f'{model._name()} Validation Recall Score is: {recallValidationScore * 100} %')
    
    print("-------------------------F1SCORE-------------------------")
    f1TrainPredict = decisionTree.predict(train_data)
    f1ValidationPredict = decisionTree.predict(validation_data)
    
    f1TrainScoreMacro = turicreate.evaluation.f1_score(targets=train_data['safe_loans'], predictions=f1TrainPredict, average="macro")
    f1ValidationScoreMacro = turicreate.evaluation.f1_score(targets=validation_data['safe_loans'], predictions=f1ValidationPredict, average="macro")
    
    f1TrainScoreMicro = turicreate.evaluation.f1_score(targets=train_data['safe_loans'], predictions=f1TrainPredict, average="micro")
    f1ValidationScoreMicro = turicreate.evaluation.f1_score(targets=validation_data['safe_loans'], predictions=f1ValidationPredict, average="micro")
    
    f1TrainScoreNone = turicreate.evaluation.f1_score(targets=train_data['safe_loans'], predictions=f1TrainPredict, average=None)
    f1ValidationScoreNone = turicreate.evaluation.f1_score(targets=validation_data['safe_loans'], predictions=f1ValidationPredict, average=None)
    
    # OR F1 = 2 * (precision * recall) / (precision + recall)
    
    
    print(f'{model._name()} Train F1 Score Macro is: {f1TrainScoreMacro}')
    print(f'{model._name()} Validation F1 Score Marco is: {f1ValidationScoreMacro}')
    
    print(f'{model._name()} Train F1 Score Micro is: {f1TrainScoreMicro}')
    print(f'{model._name()} Validation F1 Score Mirco is: {f1ValidationScoreMicro}')
    
    print(f'{model._name()} Train F1 Score None is: {f1TrainScoreNone}')
    print(f'{model._name()} Validation F1 Score None is: {f1ValidationScoreNone}')
        
        

In [154]:
def compareModels(decisionTree, logistic, svm, randomForest):
    
    import pandas as pd
    # pd.set_option('display.expand_frame_repr', False)
    
    
    accuracyDecisionTree = decisionTree.evaluate(validation_data)['accuracy']
    accuracyLogistic = logistic.evaluate(validation_data)['accuracy']
    accuracySVM = svm.evaluate(validation_data)['accuracy']
    accuracyRandomForest = randomForest.evaluate(validation_data)['accuracy']
    
    precisionDecisionTree = decisionTree.evaluate(validation_data)["precision"]
    precisionLogistic = logistic.evaluate(validation_data)["precision"]
    precisionSVM = svm.evaluate(validation_data)["precision"]
    precisionRandomForest = randomForest.evaluate(validation_data)["precision"]
    
    recallDecisionTree = decisionTree.evaluate(validation_data)["recall"]
    recallLogistic = logistic.evaluate(validation_data)["recall"]
    recallSVM = svm.evaluate(validation_data)["recall"]
    recallRandomForest = randomForest.evaluate(validation_data)["recall"]
    
    decisionTreePredictions = decisionTree.predict(validation_data)
    logisticPredictions = logistic.predict(validation_data)
    svmPredictions = svm.predict(validation_data)
    randomForestPredictions = randomForest.predict(validation_data)
    
    decisionTreeF1Score = turicreate.evaluation.f1_score(targets=validation_data['safe_loans'], predictions=decisionTreePredictions, average=None)
    logisticF1Score = turicreate.evaluation.f1_score(targets=validation_data['safe_loans'], predictions=logisticPredictions, average=None)
    svmF1Score = turicreate.evaluation.f1_score(targets=validation_data['safe_loans'], predictions=svmPredictions, average=None)
    randomForestF1Score = turicreate.evaluation.f1_score(targets=validation_data['safe_loans'], predictions=randomForestPredictions, average=None)
    
    modelsAccuracy = {
        'Decision Tree': [round(accuracyDecisionTree * 100, 2)],
        'Logistic Regression': [round(accuracyLogistic * 100, 2)],
        'Support Vector Machine': [round(accuracySVM * 100, 2)],
        'Random Forest': [round(accuracyRandomForest * 100, 2)]}
    
    accuracyTable = pd.DataFrame(data = modelsAccuracy, index = ["Accuracy (%)"])
    
    modelsPrecision = {
        'Decision Tree': [round(precisionDecisionTree * 100, 2)],
        'Logistic Regression': [round(precisionLogistic * 100, 2)],
        'Support Vector Machine': [round(precisionSVM * 100, 2)],
        'Random Forest': [round(precisionRandomForest * 100, 2)]}
    
    precisionTable = pd.DataFrame(data = modelsPrecision, index = ["Precision (%)"])

    modelsRecall = {
        'Decision Tree': [round(recallDecisionTree * 100, 2)],
        'Logistic Regression': [round(recallLogistic * 100, 2)],
        'Support Vector Machine': [round(recallSVM * 100, 2)],
        'Random Forest': [round(recallRandomForest * 100, 2)]}
    
    recallTable = pd.DataFrame(data = modelsRecall, index = ["Recall    (%)"])
    
    modelsF1Score = {
        'Decision Tree': [round(decisionTreeF1Score, 2)],
        'Logistic Regression': [round(logisticF1Score, 2)],
        'Support Vector Machine': [round(svmF1Score, 2)],
        'Random Forest': [round(randomForestF1Score, 2)]}
    
    f1ScoreTable = pd.DataFrame(data = modelsF1Score, index = ["F1 Score"])
    
    display(accuracyTable)
    display(precisionTable)
    display(f1ScoreTable)
    display(recallTable)

BEGIN MODEL CREATE AND EVALUATE


In [155]:
decisionTree = turicreate.decision_tree_classifier.create(train_data, target = target, features = features, validation_set=None)

In [156]:
printModelEvaluation(decisionTree)

-------------------------ACCURACY-------------------------
DecisionTreeClassifier Train Accuracy Score Score is: 64.23006662368364 %
DecisionTreeClassifier Validation Accuracy Score is: 63.63636363636363 %
-------------------------PRECISION-------------------------
DecisionTreeClassifier Train Precision Score is: 69.02977232924692 %
DecisionTreeClassifier Validation Precision Score is: 67.78097982708934 %
-------------------------RECALL-------------------------
DecisionTreeClassifier Train Recall Score Score is: 52.56027309579688 %
DecisionTreeClassifier Validation Recall Score is: 51.01952277657267 %
-------------------------F1SCORE-------------------------
DecisionTreeClassifier Train F1 Score Macro is: 0.5967961723647155
DecisionTreeClassifier Validation F1 Score Marco is: 0.5821782178217823
DecisionTreeClassifier Train F1 Score Micro is: 0.5967961723647155
DecisionTreeClassifier Validation F1 Score Mirco is: 0.5821782178217823
DecisionTreeClassifier Train F1 Score None is: 0.596796

In [157]:
#create and train according to logisticRegression Algorithm with train data and specify which is the target and features
logisticRegression = turicreate.logistic_classifier.create(train_data, target=target, features=features, validation_set=None)

In [158]:
#print out metric score for each data set
printModelEvaluation(logisticRegression)

-------------------------ACCURACY-------------------------
LogisticClassifier Train Accuracy Score Score is: 65.42553191489363 %
LogisticClassifier Validation Accuracy Score is: 64.72425678586816 %
-------------------------PRECISION-------------------------
LogisticClassifier Train Precision Score is: 65.72330408731007 %
LogisticClassifier Validation Precision Score is: 64.58378850775617 %
-------------------------RECALL-------------------------
LogisticClassifier Train Recall Score Score is: 65.52698954555153 %
LogisticClassifier Validation Recall Score is: 64.12147505422993 %
-------------------------F1SCORE-------------------------
LogisticClassifier Train F1 Score Macro is: 0.5967961723647155
LogisticClassifier Validation F1 Score Marco is: 0.5821782178217823
LogisticClassifier Train F1 Score Micro is: 0.5967961723647155
LogisticClassifier Validation F1 Score Mirco is: 0.5821782178217823
LogisticClassifier Train F1 Score None is: 0.5967961723647155
LogisticClassifier Validation F1 

In [159]:
#create and train according to Support Vector Machine Algorithm with train data and specify which is the target and features
supportVectorMachine = turicreate.svm_classifier.create(train_data, target=target, features=features, validation_set=None)

In [160]:
#printout metric score for each data set
printModelEvaluation(supportVectorMachine)

-------------------------ACCURACY-------------------------
SVMClassifier Train Accuracy Score Score is: 63.27638082957232 %
SVMClassifier Validation Accuracy Score is: 63.06548901335631 %
-------------------------PRECISION-------------------------
SVMClassifier Train Precision Score is: 63.506756037876364 %
SVMClassifier Validation Precision Score is: 62.924053403370536 %
-------------------------RECALL-------------------------
SVMClassifier Train Recall Score Score is: 63.676125453381694 %
SVMClassifier Validation Recall Score is: 62.364425162689805 %
-------------------------F1SCORE-------------------------
SVMClassifier Train F1 Score Macro is: 0.5967961723647155
SVMClassifier Validation F1 Score Marco is: 0.5821782178217823
SVMClassifier Train F1 Score Micro is: 0.5967961723647155
SVMClassifier Validation F1 Score Mirco is: 0.5821782178217823
SVMClassifier Train F1 Score None is: 0.5967961723647155
SVMClassifier Validation F1 Score None is: 0.5821782178217823


In [161]:
#create and train according to Random Forest Algorithm with train data and specify which is the target and features
randomForest = turicreate.random_forest_classifier.create(train_data, target=target, features=features, validation_set=None)

In [162]:
#printout metric score for each data set
printModelEvaluation(randomForest)

-------------------------ACCURACY-------------------------
RandomForestClassifier Train Accuracy Score Score is: 65.27777777777779 %
RandomForestClassifier Validation Accuracy Score is: 64.45497630331754 %
-------------------------PRECISION-------------------------
RandomForestClassifier Train Precision Score is: 66.51540076011118 %
RandomForestClassifier Validation Precision Score is: 65.4554034922133 %
-------------------------RECALL-------------------------
RandomForestClassifier Train Recall Score Score is: 62.545338169404744 %
RandomForestClassifier Validation Recall Score is: 60.17353579175705 %
-------------------------F1SCORE-------------------------
RandomForestClassifier Train F1 Score Macro is: 0.5967961723647155
RandomForestClassifier Validation F1 Score Marco is: 0.5821782178217823
RandomForestClassifier Train F1 Score Micro is: 0.5967961723647155
RandomForestClassifier Validation F1 Score Mirco is: 0.5821782178217823
RandomForestClassifier Train F1 Score None is: 0.596796

In [163]:
compareModels(decisionTree, logisticRegression, supportVectorMachine, randomForest)

Unnamed: 0,Decision Tree,Logistic Regression,Support Vector Machine,Random Forest
Accuracy (%),63.64,64.72,63.07,64.45


Unnamed: 0,Decision Tree,Logistic Regression,Support Vector Machine,Random Forest
Precision (%),67.78,64.58,62.92,65.46


Unnamed: 0,Decision Tree,Logistic Regression,Support Vector Machine,Random Forest
F1 Score,0.58,0.64,0.63,0.63


Unnamed: 0,Decision Tree,Logistic Regression,Support Vector Machine,Random Forest
Recall (%),51.02,64.12,62.36,60.17


References:
+ https://github.com/aKhfagy/machine-learning-specialization
+ https://github.com/anjali-2504/Classification_University_of_washington_Course3_Machine_Learning_All_Weeks_folders_turicreate_
+ https://www.section.io/engineering-education/how-to-create-a-machine-learning-app-using-turicreate/
+ https://apple.github.io/turicreate/docs/api/turicreate.toolkits.html
+ https://youtube.com/playlist?list=PL7yh-TELLS1EZGz1-VDltwdwZvPV-jliQ