In [463]:
import os, pandas as pd, numpy as np

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
import matplotlib.pyplot as plt


In [None]:
from scipy import stats

In [None]:
# Load the raw loan export. skiprows=1 skips the first line of the file so the column
# header is taken from the second line; low_memory=False makes pandas parse the file in a
# single pass rather than in chunks, which avoids per-chunk mixed dtype inference.
loans = pd.read_csv('LoanStats3a.csv', skiprows=1, low_memory=False)
loans.head()


Data Cleaning 

In [468]:
loans.shape

(42540, 145)

In [469]:
# explore missing values
loans.isnull().sum()

id                                            42537
member_id                                     42540
loan_amnt                                         5
funded_amnt                                       5
funded_amnt_inv                                   5
term                                              5
int_rate                                          5
installment                                       5
grade                                             5
sub_grade                                         5
emp_title                                      2631
emp_length                                     1117
home_ownership                                    5
annual_inc                                        9
verification_status                               5
issue_d                                           5
loan_status                                       5
pymnt_plan                                        5
url                                           42540
desc        

In [470]:
# keep only the columns in which at least half of the rows are populated
half = len(loans) / 2
loans = loans.dropna(axis='columns', thresh=half)
loans.isnull().sum(axis=0)

loan_amnt                         5
funded_amnt                       5
funded_amnt_inv                   5
term                              5
int_rate                          5
installment                       5
grade                             5
sub_grade                         5
emp_title                      2631
emp_length                     1117
home_ownership                    5
annual_inc                        9
verification_status               5
issue_d                           5
loan_status                       5
pymnt_plan                        5
desc                          13298
purpose                           5
title                            18
zip_code                          5
addr_state                        5
dti                               5
delinq_2yrs                      34
earliest_cr_line                 34
inq_last_6mths                   34
open_acc                         34
pub_rec                          34
revol_bal                   

In [471]:
# These columns carry information from after the loan has started to be funded, so keeping
# them would leak the future into the features.
leaky_cols = [
    'funded_amnt', 'funded_amnt_inv', 'issue_d', 'out_prncp', 'out_prncp_inv',
    'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
    'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
    'last_pymnt_d', 'last_pymnt_amnt', 'disbursement_method',
]
loans = loans.drop(leaky_cols, axis=1)

# debt_settlement_flag has no entry in the data dictionary, so it is removed as well
loans = loans.drop(['debt_settlement_flag'], axis=1)

In [472]:
# Remove columns that duplicate information held elsewhere:
#   - grade / sub_grade are assigned from the borrower's interest rate (int_rate is kept)
#   - zip_code only shows the first 3 digits and overlaps with addr_state
redundant_cols = ['grade', 'sub_grade', 'zip_code']
loans = loans.drop(redundant_cols, axis=1)
loans.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,...,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,hardship_flag
0,5000.0,36 months,10.65%,162.87,,10+ years,RENT,24000.0,Verified,Fully Paid,...,Sep-18,0.0,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N
1,2500.0,60 months,15.27%,59.83,Ryder,< 1 year,RENT,30000.0,Source Verified,Charged Off,...,Oct-16,0.0,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N
2,2400.0,36 months,15.96%,84.33,,10+ years,RENT,12252.0,Not Verified,Fully Paid,...,Jun-17,0.0,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N
3,10000.0,36 months,13.49%,339.31,AIR RESOURCES BOARD,10+ years,RENT,49200.0,Source Verified,Fully Paid,...,Apr-16,0.0,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N
4,3000.0,60 months,12.69%,67.79,University Medical Group,1 year,RENT,80000.0,Source Verified,Fully Paid,...,Apr-18,0.0,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N


In [473]:
# count the distinct (non-null) values held by every column
unique_values = loans.nunique()
unique_values

loan_amnt                       898
term                              2
int_rate                        394
installment                   16459
emp_title                     30658
emp_length                       11
home_ownership                    5
annual_inc                     5597
verification_status               3
loan_status                       4
pymnt_plan                        1
desc                          28963
purpose                          14
title                         21256
addr_state                       50
dti                            2894
delinq_2yrs                      12
earliest_cr_line                530
inq_last_6mths                   28
open_acc                         44
pub_rec                           6
revol_bal                     22709
revol_util                     1119
total_acc                        83
initial_list_status               1
last_credit_pull_d              134
collections_12_mths_ex_med        1
policy_code                 

In [474]:
# a column with a single distinct value cannot help separate the classes, so remove it
cols_with_one_value = unique_values.loc[unique_values == 1]
loans = loans.drop(list(cols_with_one_value.index), axis=1)
loans.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,last_credit_pull_d,acc_now_delinq,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,5000.0,36 months,10.65%,162.87,,10+ years,RENT,24000.0,Verified,Fully Paid,...,3.0,0.0,13648.0,83.70%,9.0,Sep-18,0.0,0.0,0.0,0.0
1,2500.0,60 months,15.27%,59.83,Ryder,< 1 year,RENT,30000.0,Source Verified,Charged Off,...,3.0,0.0,1687.0,9.40%,4.0,Oct-16,0.0,0.0,0.0,0.0
2,2400.0,36 months,15.96%,84.33,,10+ years,RENT,12252.0,Not Verified,Fully Paid,...,2.0,0.0,2956.0,98.50%,10.0,Jun-17,0.0,0.0,0.0,0.0
3,10000.0,36 months,13.49%,339.31,AIR RESOURCES BOARD,10+ years,RENT,49200.0,Source Verified,Fully Paid,...,10.0,0.0,5598.0,21%,37.0,Apr-16,0.0,0.0,0.0,0.0
4,3000.0,60 months,12.69%,67.79,University Medical Group,1 year,RENT,80000.0,Source Verified,Fully Paid,...,15.0,0.0,27783.0,53.90%,38.0,Apr-18,0.0,0.0,0.0,0.0


In [475]:
# explore text columns
loans.select_dtypes(include='object').apply(pd.Series.nunique,axis=0)

term                       2
int_rate                 394
emp_title              30658
emp_length                11
home_ownership             5
verification_status        3
loan_status                4
desc                   28963
purpose                   14
title                  21256
addr_state                50
earliest_cr_line         530
revol_util              1119
last_credit_pull_d       134
dtype: int64

In [476]:
# drop the free-text columns emp_title, desc and title (too many unique values, too hard to process);
# addr_state, earliest_cr_line and last_credit_pull_d are removed from the feature set in the same step
loans = loans.drop(['emp_title','desc','title', 'addr_state','earliest_cr_line','last_credit_pull_d'],axis=1)
loans.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,...,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,acc_now_delinq,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,5000.0,36 months,10.65%,162.87,10+ years,RENT,24000.0,Verified,Fully Paid,credit_card,...,1.0,3.0,0.0,13648.0,83.70%,9.0,0.0,0.0,0.0,0.0
1,2500.0,60 months,15.27%,59.83,< 1 year,RENT,30000.0,Source Verified,Charged Off,car,...,5.0,3.0,0.0,1687.0,9.40%,4.0,0.0,0.0,0.0,0.0
2,2400.0,36 months,15.96%,84.33,10+ years,RENT,12252.0,Not Verified,Fully Paid,small_business,...,2.0,2.0,0.0,2956.0,98.50%,10.0,0.0,0.0,0.0,0.0
3,10000.0,36 months,13.49%,339.31,10+ years,RENT,49200.0,Source Verified,Fully Paid,other,...,1.0,10.0,0.0,5598.0,21%,37.0,0.0,0.0,0.0,0.0
4,3000.0,60 months,12.69%,67.79,1 year,RENT,80000.0,Source Verified,Fully Paid,other,...,0.0,15.0,0.0,27783.0,53.90%,38.0,0.0,0.0,0.0,0.0


 EDA & Feature Preparation

In [477]:
# explore target column
loans['loan_status'].value_counts()

Fully Paid                                             34116
Charged Off                                             5670
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Name: loan_status, dtype: int64

In [478]:
# keep only the loans whose status is Fully Paid or Charged Off, since those are the two
# final outcomes we are interested in
final_statuses = ['Fully Paid', 'Charged Off']
loans = loans[loans['loan_status'].isin(final_statuses)]

In [479]:
# Encode the target as 1 = Fully Paid, 0 = Charged Off.
# Only the loan_status column is mapped: a frame-wide replace() would also rewrite any
# other cell that happens to contain one of these strings.
status_codes = {'Fully Paid': 1, 'Charged Off': 0}
loans = loans.assign(loan_status=loans['loan_status'].map(status_codes))
loans['loan_status'].value_counts()

1    34116
0     5670
Name: loan_status, dtype: int64

This is clearly an imbalanced-class problem (roughly 86% of the loans are Fully Paid), so we need an approach that accounts for the imbalance so that the model is not biased toward the majority class when making predictions.

In [480]:
# handle missing values with the remaining columns
loans.isnull().sum()

loan_amnt                  0
term                       0
int_rate                   0
installment                0
emp_length              1078
home_ownership             0
annual_inc                 0
verification_status        0
loan_status                0
purpose                    0
dti                        0
delinq_2yrs                0
inq_last_6mths             0
open_acc                   0
pub_rec                    0
revol_bal                  0
revol_util                50
total_acc                  0
acc_now_delinq             0
delinq_amnt                0
pub_rec_bankruptcies     697
tax_liens                 39
dtype: int64

In [481]:
loans['emp_length'].value_counts(dropna=False)

10+ years    8899
< 1 year     4590
2 years      4394
3 years      4098
4 years      3444
5 years      3286
1 year       3247
6 years      2231
7 years      1775
8 years      1485
9 years      1259
NaN          1078
Name: emp_length, dtype: int64

In [482]:
# convert emp_length to a numeric column by mapping each label to a number of years
# NOTE(review): missing values in this column are real NaN (see the value_counts output
# above), so the "n/a" entry below never matches them. They stay NaN after replace() and
# those rows are removed by the later dropna(axis=0) instead of being treated as 0 years
# of experience. Fill NaN explicitly if the 0-year assumption is what is wanted.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
# nested dict form: only the emp_length column is searched for these labels
loans = loans.replace(mapping_dict)


In [483]:
# pub_rec_bankruptcies is removed outright; every row that still has a null in any of the
# remaining columns is then discarded
loans = loans.drop(['pub_rec_bankruptcies'], axis=1).dropna(axis=0, how='any')
loans.isnull().sum()

loan_amnt              0
term                   0
int_rate               0
installment            0
emp_length             0
home_ownership         0
annual_inc             0
verification_status    0
loan_status            0
purpose                0
dti                    0
delinq_2yrs            0
inq_last_6mths         0
open_acc               0
pub_rec                0
revol_bal              0
revol_util             0
total_acc              0
acc_now_delinq         0
delinq_amnt            0
tax_liens              0
dtype: int64

In [484]:
# explore remaining text columns
loans.select_dtypes(include='object').apply(pd.Series.nunique,axis=0)

term                      2
int_rate                371
home_ownership            5
verification_status       3
purpose                  14
revol_util             1087
dtype: int64

In [485]:
# int_rate and revol_util are stored as strings with a trailing '%'; strip it and cast to float
for pct_col in ['int_rate', 'revol_util']:
    loans[pct_col] = loans[pct_col].str.rstrip('%').astype('float')

Feature Selection

In [486]:
# Correlation of every numeric column with the target, sorted ascending.
# The numeric columns are selected explicitly because the frame still holds text columns
# (term, home_ownership, ...): older pandas skipped them silently, but pandas >= 2.0
# raises when DataFrame.corr() meets non-numeric data.
corr = loans.select_dtypes(include=[np.number]).corr()['loan_status'].sort_values()
corr

int_rate         -0.199483
revol_util       -0.096502
inq_last_6mths   -0.070063
loan_amnt        -0.053438
pub_rec          -0.050145
dti              -0.039732
installment      -0.026415
delinq_2yrs      -0.019114
emp_length       -0.013178
revol_bal        -0.005531
open_acc          0.006588
total_acc         0.020481
annual_inc        0.038507
loan_status       1.000000
acc_now_delinq         NaN
delinq_amnt            NaN
tax_liens              NaN
Name: loan_status, dtype: float64

In [487]:
unique_values = loans.apply(pd.Series.nunique,axis=0)
unique_values

loan_amnt                879
term                       2
int_rate                 371
installment            15130
emp_length                11
home_ownership             5
annual_inc              5099
verification_status        3
loan_status                2
purpose                   14
dti                     2863
delinq_2yrs               11
inq_last_6mths             9
open_acc                  40
pub_rec                    5
revol_bal              21448
revol_util              1087
total_acc                 82
acc_now_delinq             1
delinq_amnt                1
tax_liens                  1
dtype: int64

In [488]:
loans = loans.drop(['acc_now_delinq','delinq_amnt','tax_liens'],axis=1)

In [489]:
# t-test on numerical variables
numeric_cols = loans.select_dtypes(include=['float','int']).columns.values
# exclude the target by name rather than by a hard-coded position, so the list stays
# correct if columns are added, removed or reordered upstream
new_numeric_cols = numeric_cols[numeric_cols != 'loan_status']
new_numeric_cols

array(['loan_amnt', 'int_rate', 'installment', 'emp_length', 'annual_inc',
       'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
       'revol_bal', 'revol_util', 'total_acc'], dtype=object)

In [490]:
# split the loans by outcome and run an independent two-sample t-test per numeric feature
is_paid = loans['loan_status'] == 1
is_charged_off = loans['loan_status'] == 0
safe = loans[is_paid]
bad = loans[is_charged_off]
diff_dic = {var: stats.ttest_ind(safe[var], bad[var]) for var in new_numeric_cols}
diff_dic

{'annual_inc': Ttest_indResult(statistic=7.572916143606543, pvalue=3.7305751422035e-14),
 'delinq_2yrs': Ttest_indResult(statistic=-3.7570587309868895, pvalue=0.00017217474741857504),
 'dti': Ttest_indResult(statistic=-7.814285847154388, pvalue=5.666893232754493e-15),
 'emp_length': Ttest_indResult(statistic=-2.5899012617360357, pvalue=0.00960395024882877),
 'inq_last_6mths': Ttest_indResult(statistic=-13.802646104816883, pvalue=3.112546107756176e-43),
 'installment': Ttest_indResult(statistic=-5.192955638749842, pvalue=2.080292709934631e-07),
 'int_rate': Ttest_indResult(statistic=-40.006493721984114, pvalue=0.0),
 'loan_amnt': Ttest_indResult(statistic=-10.516571049218262, pvalue=7.853381361037408e-26),
 'open_acc': Ttest_indResult(statistic=1.2947171818497778, pvalue=0.19542555107453233),
 'pub_rec': Ttest_indResult(statistic=-9.86696528698005, pvalue=6.162348518445173e-23),
 'revol_bal': Ttest_indResult(statistic=-1.0869862874410325, pvalue=0.27704965387036734),
 'revol_util': Ttes

In [491]:
loans = loans.drop(['revol_bal','open_acc','emp_length'],axis=1)

In [492]:
# anova analysis on loan_status across different categorical groups

# one-way ANOVA of loan_status across the home_ownership categories
home_grps = pd.unique(loans.home_ownership.values)
h_loans = {grp: loans.loc[loans.home_ownership == grp, 'loan_status'] for grp in home_grps}
# feed every observed category to the test instead of hard-coding the names, so a level
# that was filtered out upstream cannot raise KeyError and a new level is not ignored
F, p = stats.f_oneway(*[h_loans[grp] for grp in home_grps])
F, p

(5.875857155251695, 0.00010073325836398587)

In [493]:
# one-way ANOVA of loan_status across the loan terms
term_grps = pd.unique(loans.term.values)
t_loans = {grp: loans.loc[loans.term == grp, 'loan_status'] for grp in term_grps}
# iterate over the observed terms rather than hard-coding the labels (which carry a
# leading space in the raw data and are easy to mistype)
F, p = stats.f_oneway(*[t_loans[grp] for grp in term_grps])
F, p

(904.434413050644, 1.9826592453961934e-196)

In [494]:
# one-way ANOVA of loan_status across the verification_status categories
verif_grps = pd.unique(loans.verification_status.values)
v_loans = {grp: loans.loc[loans.verification_status == grp, 'loan_status'] for grp in verif_grps}
# use every observed category instead of a hard-coded list of names
F, p = stats.f_oneway(*[v_loans[grp] for grp in verif_grps])
F, p

(36.605876859953845, 1.310145934803636e-16)

In [495]:
# one-way ANOVA of loan_status across the loan purposes
purp_grps = pd.unique(loans.purpose.values)
p_loans = {grp: loans.loc[loans.purpose == grp, 'loan_status'] for grp in purp_grps}
# use every observed purpose instead of spelling out all fourteen names by hand, which
# raises KeyError if one is absent and silently ignores any purpose not listed
F, p = stats.f_oneway(*[p_loans[grp] for grp in purp_grps])
F, p

(28.91506078986062, 6.428887105144201e-72)

In [496]:
loans

Unnamed: 0,loan_amnt,term,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,delinq_2yrs,inq_last_6mths,pub_rec,revol_util,total_acc
0,5000.0,36 months,10.65,162.87,RENT,24000.00,Verified,1,credit_card,27.65,0.0,1.0,0.0,83.70,9.0
1,2500.0,60 months,15.27,59.83,RENT,30000.00,Source Verified,0,car,1.00,0.0,5.0,0.0,9.40,4.0
2,2400.0,36 months,15.96,84.33,RENT,12252.00,Not Verified,1,small_business,8.72,0.0,2.0,0.0,98.50,10.0
3,10000.0,36 months,13.49,339.31,RENT,49200.00,Source Verified,1,other,20.00,0.0,1.0,0.0,21.00,37.0
4,3000.0,60 months,12.69,67.79,RENT,80000.00,Source Verified,1,other,17.94,0.0,0.0,0.0,53.90,38.0
5,5000.0,36 months,7.90,156.46,RENT,36000.00,Source Verified,1,wedding,11.20,0.0,3.0,0.0,28.30,12.0
6,7000.0,60 months,15.96,170.08,RENT,47004.00,Not Verified,1,debt_consolidation,23.51,0.0,1.0,0.0,85.60,11.0
7,3000.0,36 months,18.64,109.43,RENT,48000.00,Source Verified,1,car,5.35,0.0,2.0,0.0,87.50,4.0
8,5600.0,60 months,21.28,152.39,OWN,40000.00,Source Verified,0,small_business,5.55,0.0,2.0,0.0,32.60,13.0
9,5375.0,60 months,12.69,121.45,RENT,15000.00,Verified,0,other,18.08,0.0,0.0,0.0,36.50,3.0


In [497]:
# one-hot encode the remaining categorical columns; get_dummies replaces each listed
# column with its indicator columns (prefixed with the column name) at the end of the frame
categorical_cols = ['home_ownership','verification_status','purpose','term']
loans = pd.get_dummies(loans, columns=categorical_cols)

Modeling 

In [518]:
# Positional split (no shuffling): the first 75% of the rows train the model,
# the remaining rows are held out for testing
split_at = int(loans.shape[0] * 0.75)
train = loans.iloc[:split_at]
test = loans.iloc[split_at:]

Set up error metric

Because the classes are imbalanced, we use precision and recall to measure performance instead of accuracy. From the investor's standpoint, we want to accurately identify borrowers who cannot pay off their loans (the negative class), so we want to keep the false positive rate as low as possible while also optimizing for recall.


In [553]:
# set up error metric 
def error_metric(predictions, actual=None):
    """Return the false positive rate of `predictions`, rounded to 4 decimals.

    A "positive" is label 1 (Fully Paid) and a "negative" is label 0 (Charged Off),
    so the false positive rate is the share of actual charged-off loans that were
    predicted as fully paid: fp / (fp + tn).

    Parameters
    ----------
    predictions : array-like of 0/1
        Predicted labels, compared position by position with `actual`.
    actual : array-like of 0/1, optional
        True labels. Defaults to the module-level ``test['loan_status']`` so that
        existing one-argument calls keep working.

    Returns
    -------
    float
        The false positive rate, or NaN when `actual` contains no negatives
        (the rate is undefined in that case).
    """
    if actual is None:
        actual = test['loan_status']

    # compare as plain arrays so the match is positional, not index-aligned
    predicted = np.asarray(predictions)
    observed = np.asarray(actual)

    is_negative = observed == 0
    fp = float(np.sum((predicted == 1) & is_negative))
    tn = float(np.sum((predicted == 0) & is_negative))

    if fp + tn == 0:
        # no actual negatives: avoid ZeroDivisionError and signal "undefined"
        return float('nan')

    return round(fp / (fp + tn), 4)

In [561]:
# a function that print accuracy score, confusion matrix, classification report and fpr
def result(model):
    """Evaluate a fitted `model` on the module-level `test` frame and print its scores.

    Uses the module-level `features` and `target` names to pick the columns, then
    prints the accuracy, the confusion matrix, the classification report and the
    false positive rate returned by `error_metric`.
    """
    X_test = test[features]
    y_test = test[target]

    # compute everything first, then print, so a failure leaves no partial report
    predictions = model.predict(X_test)
    fpr = error_metric(predictions)
    confusion_mat = confusion_matrix(y_test, predictions)

    print("accuracy", round(model.score(X_test, y_test),4))
    print(confusion_mat)
    print(classification_report(y_test, predictions))
    print("fpr", fpr)
   

Logistic Regression

In [562]:
# Logistic Regression: every column except the label is used as a feature
target = 'loan_status'
features = [col for col in loans.columns.tolist() if col != target]

logit_model = LogisticRegression()
logit_model.fit(train[features], train[target])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [563]:
result(logit_model)

('accuracy', 0.8746)
[[   1 1207]
 [   4 8444]]
              precision    recall  f1-score   support

           0       0.20      0.00      0.00      1208
           1       0.87      1.00      0.93      8448

   micro avg       0.87      0.87      0.87      9656
   macro avg       0.54      0.50      0.47      9656
weighted avg       0.79      0.87      0.82      9656

('fpr', 0.9992)


In [564]:
predictions = logit_model.predict(test[features])
predictions

array([1, 1, 1, ..., 1, 1, 1])

In [565]:
# penalize the classifier
logit_model = LogisticRegression(class_weight='balanced')
logit_model.fit(train[features], train[target])
predictions = logit_model.predict(test[features])

result(logit_model)

('accuracy', 0.7551)
[[ 384  824]
 [1541 6907]]
              precision    recall  f1-score   support

           0       0.20      0.32      0.25      1208
           1       0.89      0.82      0.85      8448

   micro avg       0.76      0.76      0.76      9656
   macro avg       0.55      0.57      0.55      9656
weighted avg       0.81      0.76      0.78      9656

('fpr', 0.6821)


In [566]:
# manually penalize the classifier
penalty = {0:10, 1:1}
logit_model = LogisticRegression(class_weight=penalty)
logit_model.fit(train[features], train[target])
predictions = logit_model.predict(test[features])

result(logit_model)

('accuracy', 0.3404)
[[1012  196]
 [6173 2275]]
              precision    recall  f1-score   support

           0       0.14      0.84      0.24      1208
           1       0.92      0.27      0.42      8448

   micro avg       0.34      0.34      0.34      9656
   macro avg       0.53      0.55      0.33      9656
weighted avg       0.82      0.34      0.39      9656

('fpr', 0.1623)


In [None]:
# Second pass: rebuild the feature set from the raw file. Unlike the first pass, grade,
# sub_grade, addr_state and debt_settlement_flag are kept and one-hot encoded below.
# This cell depends on `half` and `mapping_dict` defined in earlier cells.
loans = pd.read_csv('LoanStats3a.csv', skiprows=1, low_memory=False)

# drop sparse columns, then the columns that leak post-funding information
loans = loans.dropna(thresh=half, axis=1)
loans = loans.drop(['funded_amnt','funded_amnt_inv','issue_d','out_prncp','out_prncp_inv','total_pymnt',
                    'total_pymnt_inv','total_rec_prncp','total_rec_int','total_rec_late_fee','recoveries',
                    'collection_recovery_fee','last_pymnt_d','last_pymnt_amnt','disbursement_method'], axis=1)
# drop single-valued columns
unique_values = loans.apply(pd.Series.nunique,axis=0)
cols_with_one_value = unique_values[unique_values == 1]
loans = loans.drop(cols_with_one_value.index, axis=1)
# drop free-text / date columns; zip_code is dropped here while addr_state is kept
loans = loans.drop(['emp_title','desc','title','zip_code','earliest_cr_line','last_credit_pull_d'],axis=1)
# keep the two final outcomes and encode them as 1 (Fully Paid) / 0 (Charged Off)
loans = loans[(loans.loan_status == 'Fully Paid') | (loans.loan_status == 'Charged Off')]
loans = loans.replace('Fully Paid', 1)
loans = loans.replace('Charged Off', 0)
# map emp_length labels to years, then drop pub_rec_bankruptcies and every row with a null
loans = loans.replace(mapping_dict)
loans = loans.drop('pub_rec_bankruptcies', axis=1)
loans = loans.dropna(axis=0)
# strip the trailing '%' and cast the percentage columns to float
loans['int_rate'] = loans['int_rate'].str.rstrip('%').astype('float')
loans['revol_util'] = loans['revol_util'].str.rstrip('%').astype('float')
loans = loans.drop(['acc_now_delinq','delinq_amnt','tax_liens'], axis=1)

# one-hot encode the categorical columns, including the four re-introduced in this pass
# NOTE(review): debt_settlement_flag was dropped in the first pass because it is not
# described in the data dictionary; confirm it is known at origination (i.e. does not
# leak the outcome) before relying on it as a feature - TODO confirm
dummy_df = pd.get_dummies(loans[['home_ownership','verification_status','purpose','term','grade','sub_grade',
                                 'addr_state','debt_settlement_flag']])
loans = pd.concat([loans,dummy_df],axis=1)
loans = loans.drop(['home_ownership','verification_status','purpose','term', 'grade','sub_grade',
                                 'addr_state','debt_settlement_flag'],axis=1)

# positional 75/25 split and feature list (every column except the target)
train = loans.iloc[0:int(loans.shape[0] * 0.75)]
test = loans.iloc[int(loans.shape[0] * 0.75):]
features = loans.columns.tolist()
features.remove('loan_status')
target = 'loan_status'

In [None]:
# random forest
rf = RandomForestClassifier(random_state=1, class_weight='balanced')

rf.fit(train[features], train[target])
predictions = rf.predict(test[features])

result(rf)

In [None]:
# SVM
svm = SVC(gamma='auto')

svm.fit(train[features], train[target])
predictions = svm.predict(test[features])

# report the metrics of the SVM fitted above (not the random forest)
result(svm)