In [30]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import util
%matplotlib inline

In [34]:
def import_data(year, nrows=None):
    fname = '../LoanStats3c.csv'
    
    if nrows:
        df = pd.read_csv(fname, skiprows=1, nrows=nrows)
    else:
        df = pd.read_csv(fname, skiprows=1)
    return df

In [32]:
df = import_data('XXXX') # not sure what year this is

TypeError: not all arguments converted during string formatting

In [33]:
df.replace('n/a', np.nan,inplace=True)

In [5]:
def conv_loan_status(val):
    if pd.isnull(val):
        return val
    else:
        return (val.find('Fully Paid') > -1)
    
def binarized_loan_status(category, df):
    return df['loan_status'].apply(conv_loan_status)

def cleaned_df(features, df):
    feature_df = df[features].dropna(axis=0)
    return feature_df    

In [6]:
df['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
df['emp_length'].fillna(0, axis=0, inplace=True)
df['emp_length'] = df['emp_length'].astype(int)

In [7]:
df['dti'].head()

0    14.45
1     6.79
2    23.45
3     9.54
4    21.75
Name: dti, dtype: float64

In [8]:
features = [
    'loan_amnt',
    'int_rate',
    'dti',
    'annual_inc',
    'avg_cur_bal',
    'installment', 
    'emp_length', 
    'purpose', #binarize
    'pub_rec_bankruptcies',
    'delinq_2yrs', # fill na, to int
    'grade', #binarize
    'home_ownership', # binarize
    'loan_status'
]

In [9]:
# Grab df with only the features we want. Drop na
df_features = df[features]
df_features_noncurr = df_features[df_features['loan_status'] != 'Current']
df_features_noncurr = df_features_noncurr.dropna(axis=0)

In [10]:
for i in features:
    s = set(df_features_noncurr[i])
    if len(s) < 20:
        print s, i

set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) emp_length
set(['debt_consolidation', 'renewable_energy', 'house', 'medical', 'wedding', 'vacation', 'credit_card', 'educational', 'other', 'moving', 'car', 'small_business', 'major_purchase', 'home_improvement']) purpose
set([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 7.0]) pub_rec_bankruptcies
set(['A', 'C', 'B', 'E', 'D', 'G', 'F']) grade
set(['OWN', 'RENT', 'MORTGAGE']) home_ownership
set(['Late (31-120 days)', 'Default', 'Fully Paid', 'Late (16-30 days)', 'Charged Off', 'In Grace Period']) loan_status


In [11]:
util.binarize_category('purpose', df_features_noncurr)

In [12]:
util.binarize_category('home_ownership', df_features_noncurr)

In [13]:
def map_grade(g):
    grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    return grades.index(g)

df_features_noncurr['grade'] = df_features_noncurr['grade'].apply(map_grade)

In [14]:
df_features_noncurr.drop(['purpose', 'home_ownership'], axis=1, inplace=True)

In [15]:
ys = df_features_noncurr['loan_status'].apply(conv_loan_status)
df_features_noncurr.drop(['loan_status'], axis=1, inplace=True)

In [16]:
df_features_noncurr['int_rate'] = df_features_noncurr['int_rate'].apply(util.clean_percent)

In [17]:
y = ys
X = df_features_noncurr

In [18]:
to_norm_features = [
    'loan_amnt',
    'int_rate',
    'dti',
    'annual_inc',
    'avg_cur_bal',
    'installment',
    'emp_length',
    'pub_rec_bankruptcies',
    'delinq_2yrs',
    'grade'
]
for f in to_norm_features:
    util.normalize_column(f, X, inplace=True)

In [19]:
def normalize_features(feats, df):
    for f in feats:
        util.normalize_column(f, df, inplace=True)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=37)
weights = {True: 0.3, False: 0.7}
svc = SVC(class_weight=weights, random_state=1)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight={False: 0.7, True: 0.3}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
print "Train score: %.2f" %svc.score(X_train, y_train)
cm_train = confusion_matrix(y_train, svc.predict(X_train), labels=[True, False])

print "Testing score:  %.2f" %svc.score(X_test, y_test)
cm_test = confusion_matrix(y_test, svc.predict(X_test), labels=[True, False])


print "Training confusion matrix:"
print cm_train / np.sum(cm_train, axis=0)
print "Test confusion matrix:"
print cm_test / np.sum(cm_test, axis=0)

Train score: 0.57
Testing score:  0.57
Training confusion matrix:
[[ 0.80989632  0.56937484]
 [ 0.19010368  0.43062516]]
Test confusion matrix:
[[ 0.80590717  0.56947391]
 [ 0.19409283  0.43052609]]




In [None]:
# Label true = fullpaid, false = late or default
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=37)
rf_regressor = RandomForestClassifier(n_estimators=100, min_samples_split=10, max_depth=20, max_features=20, n_jobs=-1)
rf_regressor.fit(X_train, y_train)
print "Train score: %.2f" %rf_regressor.score(X_train, y_train)
cm_train = confusion_matrix(y_train, rf_regressor.predict(X_train), labels=[True, False])

print "Testing score:  %.2f" %rf_regressor.score(X_test, y_test)
cm_test = confusion_matrix(y_test, rf_regressor.predict(X_test), labels=[True, False])


print "Training confusion matrix:"
print cm_train / np.sum(cm_train, axis=0)
print "Test confusion matrix:"
print cm_test / np.sum(cm_test, axis=0)
'''

In [39]:
df = pd.read_csv('../LoanStats3a.csv', skiprows=1)
g = df[df['loan_status'] != 'Current'].groupby('loan_status')
print g.size() / (np.sum(g.size()))

loan_status
Charged Off                                            0.135517
Default                                                0.000072
Does not meet the credit policy. Status:Charged Off    0.018305
Does not meet the credit policy. Status:Fully Paid     0.047818
Fully Paid                                             0.797037
In Grace Period                                        0.000481
Late (16-30 days)                                      0.000192
Late (31-120 days)                                     0.000577
dtype: float64


loan_status
Charged Off           0.172470
Default               0.000297
Fully Paid            0.810685
In Grace Period       0.006668
Late (16-30 days)     0.001937
Late (31-120 days)    0.007944
dtype: float64


In [40]:
svc

SVC(C=1.0, cache_size=200, class_weight={False: 0.7, True: 0.3}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)

In [41]:
df_train = pd.read_csv('../LoanStats3b.csv', skiprows=1)


In [47]:
df_train.replace('n/a', np.nan,inplace=True)
df_train['int_rate'] = df_train['int_rate'].apply(util.clean_percent)


In [54]:
df_train['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
df_train['emp_length'].fillna(0, axis=0, inplace=True)
df_train['emp_length'] = df_train['emp_length'].astype(int)

In [59]:
df_train['grade'].apply(lambda x: ['A', 'B', 'C', 'D', 'E', 'F', 'G'].index(x))

ValueError: nan is not in list

In [61]:
def transform_df(df, features, normed_features, binarized_features):
    feature_df = df[features]
    feature_df = feature_df.dropna(axis=0)
    print set(feature_df['grade'])
    feature_df['grade'] = feature_df['grade'].apply(lambda x: ['A', 'B', 'C', 'D', 'E', 'F', 'G'].index(x))
    
    for i in normed_features:
        print i
        util.normalize_column(i, feature_df, inplace=True)
    for f in binarized_features:
        util.binarize_category(f, feature_df)
        feature_df.drop(f, axis=1)
    return feature_df

bin_feats = ['purpose', 'home_ownership']
df_feat = transform_df(df_train, features, to_norm_features, bin_feats)

set(['A', 'C', 'B', 'E', 'D', 'G', 'F'])
loan_amnt
int_rate
dti
annual_inc
avg_cur_bal
installment
emp_length
pub_rec_bankruptcies
delinq_2yrs
grade


In [68]:
df_feat.drop(bin_feats, axis=1, inplace=True)

In [71]:
samples = int(len(df_feat) * 0.2)
sampled = df_feat.sample(n=samples)

In [72]:
pred_samps = svc.predict_proba(samples[:5])
print pred_samps

AttributeError: predict_proba is not available when  probability=False

In [192]:
g = sampled.groupby('grade')
g.size()

grade
-0.299476     4457
-0.132810    10688
 0.033857     8822
 0.200524     4829
 0.367190     2131
 0.533857      966
 0.700524      182
dtype: int64

In [149]:
c_loans = g.get_group(0.033856894630950617)

In [150]:
g.groups.keys()

[0.36719022796428397,
 -0.13280977203571606,
 0.70052356129761728,
 0.20052356129761728,
 0.53385689463095065,
 0.033856894630950617,
 -0.29947643870238272]

In [152]:
sample = c_loans.sample(n=1000)

In [154]:
sample_test = sample.drop('loan_status', axis=1)
sample_test.drop(['home_ownership_OTHER', 'home_ownership_NONE'], axis=1, inplace=True)
#X_train.drop(['purpose_educational'], axis=1, inplace=True)
predictions = svc.predict(sample_test)

In [83]:
X_train.columns

Index([u'loan_amnt', u'int_rate', u'dti', u'annual_inc', u'avg_cur_bal',
       u'installment', u'emp_length', u'pub_rec_bankruptcies', u'delinq_2yrs',
       u'grade', u'purpose_debt_consolidation', u'purpose_renewable_energy',
       u'purpose_house', u'purpose_medical', u'purpose_wedding',
       u'purpose_vacation', u'purpose_credit_card', u'purpose_educational',
       u'purpose_other', u'purpose_moving', u'purpose_car',
       u'purpose_small_business', u'purpose_major_purchase',
       u'purpose_home_improvement', u'home_ownership_OWN',
       u'home_ownership_RENT', u'home_ownership_MORTGAGE'],
      dtype='object')

In [86]:
s_test = set(sample_test)
col_train = set(X_train.columns)
print s_test.difference(col_train)
print col_train.difference(s_test)

set(['home_ownership_OTHER', 'home_ownership_NONE'])
set(['purpose_educational'])


In [95]:
#sample_test.drop(['home_ownership_OTHER', 'home_ownership_NONE'], axis=1, inplace=True)
#X_train.drop(['purpose_educational'], axis=1, inplace=True)
#X_test.drop(['purpose_educational'], axis=1, inplace=True)
svc = SVC(class_weight=weights, random_state=1, probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight={False: 0.7, True: 0.3}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False)

In [99]:
print "Train score: %.2f" %svc.score(X_train, y_train)
cm_train = confusion_matrix(y_train, svc.predict(X_train), labels=[True, False])

print "Testing score:  %.2f" %svc.score(X_test, y_test)
cm_test = confusion_matrix(y_test, svc.predict(X_test), labels=[True, False])


print "Training confusion matrix:"
print cm_train / np.sum(cm_train, axis=0)
print "Test confusion matrix:"
print cm_test / np.sum(cm_test, axis=0)

Train score: 0.57
Testing score:  0.57
Training confusion matrix:
[[ 0.80996594  0.57041569]
 [ 0.19003406  0.42958431]]
Test confusion matrix:
[[ 0.80636801  0.57043368]
 [ 0.19363199  0.42956632]]


In [167]:
def predict_num(n, samples):
    x = list(svc.predict_proba(samples[:n])) 
    x = map(lambda z: list(z), x)
    
    return sorted(x, reverse=True)

In [241]:
def get_loans(grade, sampled_df):
    # 0 = A, 1= B etc
    g = sampled_df.groupby('grade')
    grades = sorted(g.groups.keys())
    return g.get_group(grades[grade])
    #return g.get_group(grades[grade]).drop(['loan_status','home_ownership_OTHER', 'home_ownership_NONE'] , axis=1)

In [185]:
loans_dict = {i: get_loans(i, sampled) for i in range(6)}

In [194]:
for i in range(6):
    print "Top 10 for grade %d" %i
    print predict_num(10, loans_dict[i])

In [204]:
def add_predictions_column(df):
    df_copy = df.copy()
    df_copy['predictions'] = [ i[1] for i in svc.predict_proba(df)]
    return df_copy

In [200]:
a_loans = add_predictions_column(loans_dict[0])

In [205]:
loan_predictions = {0: a_loans}
for i in range(1,4):
    loan_predictions[i] = add_predictions_column(loans_dict[i])

In [250]:
selected_indices = {}
for i in range(0, 4):
    loan_predictions[i] = loan_predictions[i].sort(columns='predictions', ascending=True)
    selected_indices[i] = loan_predictions[i].index.values[:300]

  app.launch_new_instance()


In [264]:
avg_payout_prop = 0
for i in range(4):
    _df = df_train.ix[selected_indices[i]]
    avg_payout_prop += np.sum((_df['total_pymnt'] - _df['collection_recovery_fee'])/_df['loan_amnt'] )
avg_payout_prop /= 1200

In [265]:
df_train.ix[loan_predictions[0].index.values[:10]]['grade']

20974     A
12472     A
3661      A
45836     A
27474     A
129008    A
42375     A
7048      A
21004     A
131285    A
Name: grade, dtype: object

In [266]:
avg_payout_prop

1.0517997424833119

In [229]:
model_payout_prop = avg_payout_prop

In [242]:
grouped_grades = df_train.groupby('grade')
print grouped_grades.size()
num_sample = 300
control_payout_prop = 0
for i in range(4):
    control_df = get_loans(i, df_train).sample(n=num_sample)
    control_payout_prop += np.sum((control_df['total_pymnt'] - control_df['collection_recovery_fee'])/control_df['loan_amnt'] )

control_payout_prop/= 1200

grade
A    28576
B    62605
C    49988
D    27881
E    12242
F     5706
G     1125
dtype: int64


In [244]:
control_df = pd.read_csv('../LoanStats3b.csv', skiprows=1)

In [245]:
grouped_grades = control_df.groupby('grade')
print grouped_grades.size()
num_sample = 300
control_payout_prop = 0
for i in ['A', 'B', 'C', 'D']:
    control_df = grouped_grades.get_group(i).sample(n=num_sample)
    control_payout_prop += np.sum((control_df['total_pymnt'] - control_df['collection_recovery_fee'])/control_df['loan_amnt'] )

control_payout_prop/= 1200

grade
A    28576
B    62605
C    49988
D    27881
E    12242
F     5706
G     1125
dtype: int64


In [246]:
control_payout_prop

1.0524686143622175