In [1]:
import turicreate

In [2]:
# Read the LendingClub loan records from the SFrame directory stored next to the notebook.
loans = turicreate.SFrame(data='../data/lending-club-data.sframe/')

In [3]:
# Show the name of every column in the loaded SFrame (the cell's displayed value).
loans.column_names()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec

In [4]:
# Build the prediction label from the existing 'bad_loans' column:
#   bad_loans == 0      -> safe_loans = +1 (safe)
#   any other value     -> safe_loans = -1 (risky)
loans['safe_loans'] = loans['bad_loans'].apply(
    lambda flag: -1 if flag != 0 else +1
)

# Drop the source column now that the label has been derived from it.
# NOTE(review): because 'bad_loans' is removed here, this cell cannot be re-run
# on its own without reloading `loans` first.
loans = loans.remove_column('bad_loans')

In [5]:
# Input columns handed to every classifier in this notebook.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home_ownership status: own, mortgage or rent
    'dti',                    # debt to income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 is risky.
target = 'safe_loans'

# Narrow the frame down to the feature columns followed by the target column.
loans = loans[[*features, target]]

In [6]:
# Partition the loans by label: -1 rows are risky, +1 rows are safe.
risky_loans_raw = loans[loans[target] == -1]
safe_loans_raw = loans[loans[target] == +1]

# Report how many rows landed in each class.
print(
    f"Number of safe loans  : {len(safe_loans_raw)}",
    f"Number of risky loans : {len(risky_loans_raw)}",
    sep="\n",
)

Number of safe loans  : 99457
Number of risky loans : 23150


In [7]:
# Share of each class in the raw (unbalanced) data, expressed in percent.
totalLoans = len(safe_loans_raw) + len(risky_loans_raw)
safe_loans_raw_percentage = (len(safe_loans_raw) / totalLoans) * 100
risky_loans_raw_percentage = (len(risky_loans_raw) / totalLoans) * 100

print(
    f"Percentage of safe loans  : {safe_loans_raw_percentage} %",
    f"Percentage of risky loans : {risky_loans_raw_percentage} %",
    sep="\n",
)

Percentage of safe loans  : 81.11853319957262 %
Percentage of risky loans : 18.881466800427383 %


In [8]:
# The risky class is the smaller one, so the safe class is undersampled to match it.
# `percentage` is the risky/safe size ratio (a fraction between 0 and 1, despite its
# name) and is used as the sampling fraction for the safe loans.
percentage = len(risky_loans_raw) / len(safe_loans_raw)

# Keep every risky loan; draw a seeded random subset of the safe loans.
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(fraction=percentage, seed=1)

# Balanced dataset: all risky loans followed by the downsampled safe loans.
loans_data = risky_loans.append(safe_loans)

In [9]:
# Class balance and size of the dataset after undersampling the safe loans.
print(
    f"Percentage of safe loans                 : {len(safe_loans) / len(loans_data) * 100} %",
    f"Percentage of risky loans                : {len(risky_loans) / len(loans_data) * 100} %",
    f"Total number of loans in our new dataset : {len(loans_data)}",
    sep="\n",
)

Percentage of safe loans                 : 50.22361744216048 %
Percentage of risky loans                : 49.77638255783951 %
Total number of loans in our new dataset : 46508


In [10]:
# Seeded random split of the balanced data: roughly 80% goes to train_data and the
# remaining rows go to validation_data.
train_data, validation_data = loans_data.random_split(fraction=0.8, seed=1)

In [11]:
# Fit a logistic regression classifier on train_data, predicting `target` from `features`.
# validation_set=None: no validation set is passed to the trainer itself.
logisticRegression = turicreate.logistic_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [12]:
# Accuracy of the logistic model on the training split and on the validation split,
# printed as percentages.
logisticTrainScore = logisticRegression.evaluate(train_data)['accuracy']
logisticValidationScore = logisticRegression.evaluate(validation_data)['accuracy']
print(
    f'Logistic Train Score is: {logisticTrainScore * 100} %',
    f'Logistic Validation Score is: {logisticValidationScore * 100} %',
    sep='\n',
)

Logistic Train Score is: 64.59542230818826 %
Logistic Validation Score is: 64.27186557518311 %


In [13]:
# Build a nearest-neighbour classifier from train_data, predicting `target` from `features`.
KNearest = turicreate.nearest_neighbor_classifier.create(
    train_data,
    target=target,
    features=features,
)

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [14]:
# Accuracy of the nearest-neighbour model on both splits, using at most 3 neighbours
# per prediction, printed as percentages.
# NOTE(review): the training score is measured on the same rows the model was built
# from, so each row can presumably find itself among its nearest neighbours. That
# likely inflates the train score relative to validation; confirm before comparing.
KNearestTrainScore = KNearest.evaluate(train_data, max_neighbors=3)['accuracy']
KNearestValidationScore = KNearest.evaluate(validation_data, max_neighbors=3)['accuracy']
print(
    f'KNearest Train Score is: {KNearestTrainScore * 100} %',
    f'KNearest Validation Score is: {KNearestValidationScore * 100} %',
    sep='\n',
)

KNearest Train Score is: 77.36406619385343 %
KNearest Validation Score is: 56.171908660060325 %


In [15]:
# Fit a support vector machine classifier on train_data, predicting `target` from `features`.
# validation_set=None: no validation set is passed to the trainer itself.
supportVectorMachine = turicreate.svm_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [16]:
# Accuracy of the SVM on the training split and on the validation split,
# printed as percentages.
SVMTrainScore = supportVectorMachine.evaluate(train_data)['accuracy']
SVMValidationScore = supportVectorMachine.evaluate(validation_data)['accuracy']
# NOTE(review): the first label repeats the word "Score"; it is kept verbatim so the
# printed text stays unchanged.
print(
    f'SVM Train Score Score is: {SVMTrainScore * 100} %',
    f'SVM Validation Score is: {SVMValidationScore * 100} %',
    sep='\n',
)

SVM Train Score Score is: 62.79550827423168 %
SVM Validation Score is: 62.63464024127531 %


In [17]:
# Fit a boosted trees classifier on train_data, predicting `target` from `features`.
# validation_set=None: no validation set is passed to the trainer itself.
boostedTree = turicreate.boosted_trees_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [18]:
# Accuracy of the boosted trees model on the training split and on the validation split,
# printed as percentages.
boostedTreeTrainScore = boostedTree.evaluate(train_data)['accuracy']
boostedTreeValidationScore = boostedTree.evaluate(validation_data)['accuracy']
# NOTE(review): the first label repeats the word "Score"; it is kept verbatim so the
# printed text stays unchanged.
print(
    f'Boosted Tree Train Score Score is: {boostedTreeTrainScore * 100} %',
    f'Boosted Tree Validation Score is: {boostedTreeValidationScore * 100} %',
    sep='\n',
)

Boosted Tree Train Score Score is: 65.85267569310122 %
Boosted Tree Validation Score is: 64.57345971563981 %


In [19]:
# Fit a random forest classifier on train_data, predicting `target` from `features`.
# validation_set=None: no validation set is passed to the trainer itself.
# random_seed=1: pins the forest's random row/column subsampling so that a fresh
# Restart & Run All rebuilds the same model, in line with the seed=1 already used
# for the undersampling and the train/validation split.
randomForest = turicreate.random_forest_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
    random_seed=1,
)

In [20]:
# Accuracy of the random forest on the training split and on the validation split,
# printed as percentages.
randomForestTrainScore = randomForest.evaluate(train_data)['accuracy']
randomForestValidationScore = randomForest.evaluate(validation_data)['accuracy']
# NOTE(review): the first label repeats the word "Score"; it is kept verbatim so the
# printed text stays unchanged.
print(
    f'Random Forest Train Score Score is: {randomForestTrainScore * 100} %',
    f'Random Forest Validation Score is: {randomForestValidationScore * 100} %',
    sep='\n',
)

Random Forest Train Score Score is: 64.43423597678917 %
Random Forest Validation Score is: 63.93795777682033 %


References:
+ https://github.com/aKhfagy/machine-learning-specialization
+ https://github.com/anjali-2504/Classification_University_of_washington_Course3_Machine_Learning_All_Weeks_folders_turicreate_
+ https://www.section.io/engineering-education/how-to-create-a-machine-learning-app-using-turicreate/
+ https://apple.github.io/turicreate/docs/api/turicreate.toolkits.html
+ https://youtube.com/playlist?list=PL7yh-TELLS1EZGz1-VDltwdwZvPV-jliQ