In [90]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.metrics import log_loss

In [91]:
# Input files: the table of held-out ids (one 'test<i>' column per split is read
# further down) and the loan table, which is indexed by its 'id' column.
test_id_path = 'Project3_test_id.csv'
loans_path = 'loan_stat542.csv'

test_id = pd.read_csv(test_id_path)
data = pd.read_csv(loans_path, index_col='id')

In [92]:
# Binary response: 1 where loan_status equals 'Charged Off', 0 for every other status.
is_charged_off = data['loan_status'] == 'Charged Off'
y = is_charged_off.apply(np.uint8)

# Predictors: every column except the response.
X = data.drop('loan_status', axis='columns')

In [93]:
# --- Feature engineering (adds / rewrites columns of X) ---

# Single FICO feature: the midpoint of the reported low/high range.
X['fico_score'] = 0.5*X['fico_range_low'] + 0.5*X['fico_range_high']

# log(1 + income); the +1 keeps a zero income finite.
X['log_annual_inc'] = np.log(X['annual_inc'] + 1)

# Keep only the last four characters of the string, parsed as an integer
# (presumably the year -- TODO confirm the raw date format).
X['earliest_cr_line'] = X['earliest_cr_line'].apply(lambda x: int(x[-4:]))

# Rewrite the two labels that do not follow the "<N> years" pattern so that
# emp_length_to_int can parse every non-missing value.
# The result is assigned back to the column rather than calling
# X['col'].replace(..., inplace=True): that form mutates a temporary selection,
# which pandas warns about and which stops updating X under Copy-on-Write.
X['emp_length'] = X['emp_length'].replace({'10+ years': '10 years',
                                           '< 1 year': '0 years'})

# Fold the 'NONE' and 'ANY' home-ownership levels into 'OTHER'.
X['home_ownership'] = X['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')
def emp_length_to_int(s):
    """Convert an employment-length label to its leading integer.

    Values flagged by ``pd.isnull`` are handed back unchanged. For anything
    else, the first whitespace-separated token of ``s`` is parsed as ``np.int8``
    (e.g. ``'3 years'`` -> ``3``).
    """
    if not pd.isnull(s):
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s
# Turn the "<N> years" labels into numbers; missing entries are passed through as-is.
X['emp_length'] = X['emp_length'].apply(emp_length_to_int)

# Columns fed to the model, grouped by how they are preprocessed.
# Columns dropped: id, grade, emp_title, title, zip_code, open_acc, revol_bal, total_acc
nominal = [
    'sub_grade',
    'verification_status',
    'home_ownership',
    'purpose',
    'addr_state',
    'initial_list_status',
    'application_type',
    'term',
]
numeric = [
    'loan_amnt',
    'int_rate',
    'installment',
    'dti',
    'earliest_cr_line',
    'revol_util',
    'emp_length',
    'mort_acc',
    'pub_rec_bankruptcies',
    'pub_rec',
    'fico_score',
    'log_annual_inc',
]

In [94]:
# get_dummies for nominal variables
def preprocess_nominal(data):
    """Return ``data`` one-hot encoded with ``pd.get_dummies`` (default settings)."""
    encoded = pd.get_dummies(data)
    return encoded

# Impute missing values, then winsorize
def preprocess_numeric(data):
    """Mean-impute missing values, then winsorize each column at its 5%/95% quantiles.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric columns, possibly containing NaN. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which every NaN has been replaced by its column
        mean and every value has been clipped to the column's
        [5th percentile, 95th percentile] range (computed after imputation).

    Notes
    -----
    Imputation is done with pandas (``fillna`` with the column means), which is
    what the default mean strategy of scikit-learn's legacy ``Imputer`` computed.
    This avoids depending on that class (removed in scikit-learn 0.22) and on
    fitting an imputer to zero columns when nothing is missing.

    NOTE(review): the means and quantiles come from whatever frame is passed in;
    the visible caller passes the full data set before the train/test split.
    """
    result = data.copy()

    # Only columns that actually contain NaN need imputing.
    missing_col = result.columns[result.isnull().any()]
    if len(missing_col) > 0:
        column_means = result[missing_col].mean()
        result[missing_col] = result[missing_col].fillna(column_means)

    # Winsorize column by column.
    result = result.apply(lambda x: x.clip(upper=x.quantile(0.95), lower=x.quantile(0.05)))
    return result

# correlation to response variable
def corr(data, y, threshold = 0.01):
    """Select the columns of ``data`` whose correlation with ``y`` is strong enough.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate feature columns.
    y : pandas.Series
        Response, aligned with ``data`` on the index. Its name is irrelevant.
    threshold : float, default 0.01
        A column is kept when the absolute value of its correlation with ``y``
        is strictly greater than this.

    Returns
    -------
    list
        Column labels of ``data``, in their original order, that pass the
        threshold. Columns whose correlation is NaN (e.g. constant columns)
        are never kept, because a comparison with NaN is False.
    """
    # corrwith computes one coefficient per column directly, instead of building
    # the full column-by-column correlation matrix only to read a single column,
    # and it does not require y to be named 'loan_status'.
    strength = data.corrwith(y).abs()
    return [col for col in data.columns if strength[col] > threshold]

In [95]:
# Preprocess: encode the nominal columns, clean the numeric ones, then join the
# two frames on their shared index.
# NOTE(review): X is rebound to the processed frame here, so this cell is likely
# not re-runnable without first re-running the cells that build the raw X.
nominal_part = preprocess_nominal(X[nominal])
numeric_part = preprocess_numeric(X[numeric])
X = nominal_part.join(numeric_part)

In [96]:
# Evaluate on the three predefined splits: column 'test<i>' of test_id lists the
# ids held out for split i; every other row is used for training.
error = []        # log-loss per split
submission = []   # predicted values per split

# Replacement value for out-of-range predictions. The model is a squared-error
# regressor, so its output is not confined to [0, 1], while log_loss expects
# probabilities.
pred_floor = np.exp(-4)

for i in range(1, 4):
    # Membership mask computed once per split; X and y are both derived from
    # `data`, so the same positional mask is applied to each of them.
    is_test = X.index.isin(test_id['test%i' % i])

    # Feature selection uses the training rows only.
    train_X = X[~is_test]
    train_y = y[~is_test]
    keep_col = corr(train_X, train_y)

    X1 = X[keep_col]
    X_train = X1[~is_test]
    y_train = train_y
    X_test = X1[is_test]
    y_test = y[is_test]

    # NOTE(review): newer xgboost releases call this objective
    # 'reg:squarederror' -- confirm against the installed version.
    xgb1 = xgb.XGBRegressor(objective ='reg:linear', subsample = 0.8, colsample_bytree = 0.8, learning_rate = 0.1,
                           max_depth = 7, n_estimators = 100, min_child_weight = 8)
    xgb1.fit(X_train, y_train)
    y_pred_xgb = xgb1.predict(X_test)

    # Repair predictions that are not valid probabilities. Negative values are
    # treated exactly as before; values above 1 used to be passed to log_loss
    # unchanged and are now mapped to the mirrored bound.
    y_pred_xgb[y_pred_xgb < 0] = pred_floor
    y_pred_xgb[y_pred_xgb > 1] = 1 - pred_floor

    submission.append(y_pred_xgb)
    error.append(log_loss(y_test, y_pred_xgb))

In [52]:
np.mean(error)

0.45038093487610098

In [87]:
np.mean(error)

0.45008157763090689

In [88]:
error

[0.44969613458811419, 0.45078262782122636, 0.44976597048338013]

In [78]:
np.mean(error)

0.45009943177883849

In [79]:
error

[0.44970991580638436, 0.45080421532171339, 0.44978416420841766]

In [89]:
y_pred_xgb

array([ 0.30380875,  0.07445666,  0.24146089, ...,  0.13808429,
        0.0605081 ,  0.26541919], dtype=float32)

In [97]:
submission

[array([ 0.41835696,  0.2997334 ,  0.14832196, ...,  0.34157273,
         0.10189536,  0.24250284], dtype=float32),
 array([ 0.20777592,  0.05084082,  0.07622164, ...,  0.15521353,
         0.2396231 ,  0.15917879], dtype=float32),
 array([ 0.30380875,  0.07445666,  0.24146089, ...,  0.13808429,
         0.0605081 ,  0.26541919], dtype=float32)]

In [98]:
np.mean(error)

0.45008157763090689

In [99]:
error

[0.44969613458811419, 0.45078262782122636, 0.44976597048338013]