In [1]:
import turicreate

In [2]:
# Read the LendingClub loan records from the SFrame directory on disk.
loans_path = '../data/lending-club-data.sframe/'
loans = turicreate.SFrame(loans_path)

In [3]:
# Display the names of all columns in the raw loans SFrame.
loans.column_names()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec

In [4]:
# Derive the prediction target from `bad_loans`:
#   bad_loans == 0  ->  safe_loans = +1 (safe)
#   anything else   ->  safe_loans = -1 (risky)
safe_label = loans['bad_loans'].apply(lambda bad_flag: +1 if bad_flag == 0 else -1)
loans['safe_loans'] = safe_label

# The original flag is no longer needed once the target column exists.
loans = loans.remove_column('bad_loans')

In [5]:
# Columns used as model inputs.
features = [
    'grade',                  # loan grade
    'sub_grade',              # loan sub-grade
    'short_emp',              # employed for one year or less
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    'dti',                    # debt-to-income ratio
    'purpose',                # purpose of the loan
    'term',                   # term of the loan
    'last_delinq_none',       # whether the borrower has had a delinquency
    'last_major_derog_none',  # whether the borrower has had a 90-day-or-worse rating
    'revol_util',             # percentage of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Keep only the feature columns and the target column.
model_columns = features + [target]
loans = loans[model_columns]

In [6]:
# Partition the loans by label: +1 rows are safe, -1 rows are risky.
is_safe = loans[target] == +1
is_risky = loans[target] == -1
safe_loans_raw = loans[is_safe]
risky_loans_raw = loans[is_risky]

# Report the size of each class before any rebalancing.
safe_count = len(safe_loans_raw)
risky_count = len(risky_loans_raw)
print(f"Number of safe loans  : {safe_count}")
print(f"Number of risky loans : {risky_count}")

Number of safe loans  : 99457
Number of risky loans : 23150


In [7]:
# Share of each class in the raw data, expressed as a percentage.
n_safe_raw, n_risky_raw = len(safe_loans_raw), len(risky_loans_raw)
totalLoans = n_safe_raw + n_risky_raw

# Divide first, then scale by 100, so the printed floats are unchanged.
safe_loans_raw_percentage = n_safe_raw / totalLoans * 100
risky_loans_raw_percentage = n_risky_raw / totalLoans * 100

print(f"Percentage of safe loans  : {safe_loans_raw_percentage} %")
print(f"Percentage of risky loans : {risky_loans_raw_percentage} %")

Percentage of safe loans  : 81.11853319957262 %
Percentage of risky loans : 18.881466800427383 %


In [8]:
# Risky loans are the smaller class, so every risky loan is kept and the safe
# loans are undersampled by the risky-to-safe size ratio.
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))

safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw

# Balanced dataset: all risky loans followed by the sampled safe loans.
loans_data = risky_loans.append(safe_loans)

In [9]:
# Check the class balance after undersampling.
balanced_total = float(len(loans_data))
print(f"Percentage of safe loans                 : {len(safe_loans) / balanced_total *100} %")
print(f"Percentage of risky loans                : {len(risky_loans) / balanced_total * 100} %")
print(f"Total number of loans in our new dataset : {len(loans_data)}")

Percentage of safe loans                 : 50.22361744216048 %
Percentage of risky loans                : 49.77638255783951 %
Total number of loans in our new dataset : 46508


In [10]:
# Split the balanced data 80/20 into training and validation sets (fixed seed).
train_data, validation_data = loans_data.random_split(fraction=.8, seed=1)

In [11]:
def printModelEvaluation(model, train=None, validation=None):
    """Print accuracy, precision and recall of ``model`` on two datasets.

    Each dataset is evaluated exactly once and all three metrics are read from
    that single result, rather than re-running ``model.evaluate`` per metric.

    Parameters
    ----------
    model
        Trained classifier exposing ``evaluate(dataset)``, which must return a
        mapping containing ``'accuracy'``, ``'precision'`` and ``'recall'``,
        and ``_name()``, whose result is used as the label prefix.
    train, validation : optional
        Datasets to score. When left as ``None`` the notebook-level
        ``train_data`` and ``validation_data`` are used.

    Returns
    -------
    None
        The scores are written to standard output as percentages.
    """
    if train is None:
        train = train_data
    if validation is None:
        validation = validation_data

    # One evaluation pass per dataset; the result already holds every metric.
    trainMetrics = model.evaluate(train)
    validationMetrics = model.evaluate(validation)
    modelName = model._name()

    # (banner, metric key, train label, validation label).
    # Label wording is kept exactly as before (including the doubled "Score")
    # so that previously recorded notebook output still matches.
    sections = (
        ("ACCURACY", "accuracy", "Train Accuracy Score Score", "Validation Accuracy Score"),
        ("PRECISION", "precision", "Train Precision Score", "Validation Precision Score"),
        ("RECALL", "recall", "Train Recall Score Score", "Validation Recall Score"),
    )

    for banner, metricKey, trainLabel, validationLabel in sections:
        print(f"-------------------------{banner}-------------------------")
        print(f'{modelName} {trainLabel} is: {trainMetrics[metricKey] * 100} %')
        print(f'{modelName} {validationLabel} is: {validationMetrics[metricKey] * 100} %')
        
        

BEGIN MODEL CREATION AND EVALUATION


In [12]:
# Train a logistic regression classifier on train_data with the chosen
# features and target; no validation set is passed to the trainer.
logisticRegression = turicreate.logistic_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [13]:
# Print accuracy, precision and recall for the logistic regression model
# on both the training and the validation data.
printModelEvaluation(logisticRegression)

-------------------------ACCURACY-------------------------
LogisticClassifier Train Accuracy Score Score is: 64.59542230818826 %
LogisticClassifier Validation Accuracy Score is: 64.27186557518311 %
-------------------------PRECISION-------------------------
LogisticClassifier Train Precision Score is: 64.74450622186922 %
LogisticClassifier Validation Precision Score is: 63.960267760742816 %
-------------------------RECALL-------------------------
LogisticClassifier Train Recall Score Score is: 65.21762321314274 %
LogisticClassifier Validation Recall Score is: 64.25162689804772 %


In [14]:
# Train a support vector machine classifier on train_data with the chosen
# features and target; no validation set is passed to the trainer.
supportVectorMachine = turicreate.svm_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [15]:
# Print accuracy, precision and recall for the SVM model
# on both the training and the validation data.
printModelEvaluation(supportVectorMachine)

-------------------------ACCURACY-------------------------
SVMClassifier Train Accuracy Score Score is: 62.79550827423168 %
SVMClassifier Validation Accuracy Score is: 62.63464024127531 %
-------------------------PRECISION-------------------------
SVMClassifier Train Precision Score is: 63.58493705285342 %
SVMClassifier Validation Precision Score is: 63.010262257696695 %
-------------------------RECALL-------------------------
SVMClassifier Train Recall Score Score is: 61.15318967356518 %
SVMClassifier Validation Recall Score is: 59.9349240780911 %


In [16]:
# Train a boosted trees classifier on train_data with the chosen
# features and target; no validation set is passed to the trainer.
boostedTree = turicreate.boosted_trees_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [17]:
# Print accuracy, precision and recall for the boosted trees model
# on both the training and the validation data.
printModelEvaluation(boostedTree)

-------------------------ACCURACY-------------------------
BoostedTreesClassifier Train Accuracy Score Score is: 65.85267569310122 %
BoostedTreesClassifier Validation Accuracy Score is: 64.57345971563981 %
-------------------------PRECISION-------------------------
BoostedTreesClassifier Train Precision Score is: 65.72381101213732 %
BoostedTreesClassifier Validation Precision Score is: 64.21959095801938 %
-------------------------RECALL-------------------------
BoostedTreesClassifier Train Recall Score Score is: 67.2978451034777 %
BoostedTreesClassifier Validation Recall Score is: 64.70715835140997 %


In [18]:
# Train a random forest classifier on train_data with the chosen
# features and target; no validation set is passed to the trainer.
# NOTE(review): no random_seed is passed here, so scores may differ between
# runs; confirm against the turicreate docs before pinning one.
randomForest = turicreate.random_forest_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [19]:
# Print accuracy, precision and recall for the random forest model
# on both the training and the validation data.
printModelEvaluation(randomForest)

-------------------------ACCURACY-------------------------
RandomForestClassifier Train Accuracy Score Score is: 64.43423597678917 %
RandomForestClassifier Validation Accuracy Score is: 63.93795777682033 %
-------------------------PRECISION-------------------------
RandomForestClassifier Train Precision Score is: 64.78608620967309 %
RandomForestClassifier Validation Precision Score is: 64.07225691347011 %
-------------------------RECALL-------------------------
RandomForestClassifier Train Recall Score Score is: 64.37486665244293 %
RandomForestClassifier Validation Recall Score is: 62.32104121475054 %


References:
+ https://github.com/aKhfagy/machine-learning-specialization
+ https://github.com/anjali-2504/Classification_University_of_washington_Course3_Machine_Learning_All_Weeks_folders_turicreate_
+ https://www.section.io/engineering-education/how-to-create-a-machine-learning-app-using-turicreate/
+ https://apple.github.io/turicreate/docs/api/turicreate.toolkits.html
+ https://youtube.com/playlist?list=PL7yh-TELLS1EZGz1-VDltwdwZvPV-jliQ