In [1]:
## 3. Reading in to Pandas ##

import pandas as pd
loans_2007 = pd.read_csv("loans_2007.csv")
print(loans_2007.head(1))
print(len(loans_2007.columns))

## 5. First group of columns ##

loans_2007.drop(["id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"], axis =1, inplace = True)

## 7. Second group of features ##

loans_2007.drop(["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"], axis =1, inplace = True)

## 9. Third group of features ##

loans_2007.drop(["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"], axis = 1, inplace=True)
print(loans_2007.head(1))
print(len(loans_2007.columns))

## 10. Target column ##

loans_2007["loan_status"].value_counts()

## 12. Binary classification ##

loans_2007 = loans_2007[(loans_2007["loan_status"] == "Fully Paid") | (loans_2007["loan_status"] == "Charged Off")]

mapping_dict = { "loan_status": { "Fully Paid" :  1, "Charged Off" : 0}}
loans_2007.replace(mapping_dict, inplace = True)

## 13. Removing single value columns ##

drop_columns = []
for column in loans_2007.columns:
    non_null = loans_2007[column].dropna()
    unique_non_null = non_null.unique()
    if len(unique_non_null) == 1:
        drop_columns.append(column)

loans_2007.drop(drop_columns, axis=1, inplace = True)
print(drop_columns)

  interactivity=interactivity, compiler=compiler, result=result)


        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  1077501  1296599.0     5000.0       5000.0           4975.0   36 months   

  int_rate  installment grade sub_grade    ...    last_pymnt_amnt  \
0   10.65%       162.87     B        B2    ...             171.62   

  last_credit_pull_d collections_12_mths_ex_med  policy_code application_type  \
0           Jun-2016                        0.0          1.0       INDIVIDUAL   

  acc_now_delinq chargeoff_within_12_mths delinq_amnt pub_rec_bankruptcies  \
0            0.0                      0.0         0.0                  0.0   

  tax_liens  
0       0.0  

[1 rows x 52 columns]
52
   loan_amnt        term int_rate  installment emp_length home_ownership  \
0     5000.0   36 months   10.65%       162.87  10+ years           RENT   

   annual_inc verification_status loan_status pymnt_plan    ...      \
0     24000.0            Verified  Fully Paid          n    ...       

  initial_list_status last_credit_

In [3]:
## 1. Recap ##

import pandas as pd
loans = loans_2007
null_counts = loans.isnull().sum()

## 2. Handling missing values ##

loans.drop(["pub_rec_bankruptcies"], axis =1, inplace = True)
loans.dropna(inplace = True)
'''
for col in loans.columns:
    print(loans[col].dtypes)
    print(loans[col].value_counts)
'''
print(loans.dtypes.value_counts())

## 3. Text columns ##

object_columns_df = loans.select_dtypes(include = ["object"])
object_columns_df.head(1)

## 5. First 5 categorical columns ##

cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']
for col in cols:
    print("Col: " + col)
    print(object_columns_df[col].value_counts())

## 6. The reason for the loan ##

print(object_columns_df["purpose"].value_counts())
print(object_columns_df["title"].value_counts())

## 7. Categorical columns ##

mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}

loans.drop(["last_credit_pull_d", "addr_state", "title", "earliest_cr_line"], axis =1, inplace = True)
loans['int_rate'] = loans['int_rate'].str.rstrip('%').astype(float)
loans['revol_util'] = loans['revol_util'].str.rstrip('%').astype(float)

loans = loans.replace(mapping_dict)
print(loans.head(1))

## 8. Dummy variables ##

cat_columns = ["home_ownership", "verification_status", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns, axis=1)

object     11
float64    10
int64       1
dtype: int64
Col: home_ownership
RENT        18513
MORTGAGE    17112
OWN          2984
OTHER          96
NONE            3
Name: home_ownership, dtype: int64
Col: verification_status
Not Verified       16696
Verified           12290
Source Verified     9722
Name: verification_status, dtype: int64
Col: emp_length
10+ years    8545
< 1 year     4513
2 years      4303
3 years      4022
4 years      3353
5 years      3202
1 year       3176
6 years      2177
7 years      1714
8 years      1442
9 years      1229
n/a          1032
Name: emp_length, dtype: int64
Col: term
 36 months    29041
 60 months     9667
Name: term, dtype: int64
Col: addr_state
CA    6958
NY    3713
FL    2791
TX    2667
NJ    1798
IL    1483
PA    1473
VA    1376
GA    1364
MA    1301
OH    1179
MD    1026
AZ     850
WA     822
CO     770
NC     753
CT     730
MI     712
MO     671
MN     603
NV     481
SC     462
WI     441
AL     437
OR     436
LA     430
KY     315
OK     29

In [7]:

# tn = (predictions == 0) & (loans["loan_status"] == 0)
# tp = (predictions == 1) & (loans["loan_status"] == 1)
# fn = (predictions == 0) & (loans["loan_status"] == 1)
# fp = (predictions == 1) & (loans["loan_status"] == 0)

## 5. Class imbalance ##

import pandas as pd
import numpy

# Predict that all loans will be paid off on time.
predictions = pd.Series(numpy.ones(loans.shape[0]))
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

## 6. Logistic Regression ##

from sklearn.linear_model import LogisticRegression
import numpy as np
lr = LogisticRegression()
features = list(loans.columns)
features.remove("loan_status")
target = loans["loan_status"]
lr.fit(loans[features], target)
predictions = lr.predict(loans[features])

## 7. Cross Validation ##

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict, KFold
lr = LogisticRegression()
kf = KFold(np.shape(features)[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv = kf)
predictions = pd.Series(predictions)
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

## 9. Penalizing the classifier ##

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict
lr = LogisticRegression(class_weight = "balanced")
kf = KFold(features.shape[0], random_state = 1)
predictions = cross_val_predict(lr, features, target, cv = kf)
predictions = pd.Series(predictions)
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

## 10. Manual penalties ##

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict
penalty = {0:10, 1:1}
lr = LogisticRegression(class_weight = penalty)
kf = KFold(features.shape[0], random_state = 1)
predictions = cross_val_predict(lr, features, target, cv = kf)
predictions = pd.Series(predictions)
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

## 11. Random forests ##

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_predict
lr = RandomForestClassifier(class_weight = "balanced", random_state = 1)
kf = KFold(features.shape[0], random_state = 1)
predictions = cross_val_predict(lr, features, target, cv = kf)
predictions = pd.Series(predictions)
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
'''
We can tweak the penalties further.
We can try models other than a random forest and logistic regression.
We can use some of the columns we discarded to generate better features.
We can ensemble multiple models to get more accurate predictions.
We can tune the parameters of the algorithm to achieve higher performance.
'''

ValueError: Found input variables with inconsistent numbers of samples: [37, 38708]