In [1]:
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import log_loss
from scipy.stats import boxcox
import xgboost as xgb
import numpy as np
import pandas as pd

In [13]:
test_id = pd.read_csv('Project3_test_id.csv')
data = pd.read_csv('loan_stat542.csv', index_col = 'id')

In [14]:
y = data['loan_status'].apply(lambda x: 0 if x == 'Fully Paid' else 1)
X = data.drop('loan_status', axis = 1)

In [15]:
X['fico_range'] = 0.5*X['fico_range_low'] + 0.5*X['fico_range_high']
X['log_annual_inc'] = np.log(X['annual_inc'] + 1)

drop_col = ['grade', 'emp_title', 'title', 'zip_code', 'open_acc', 'revol_bal', 'total_acc']
nominal = ['sub_grade', 'verification_status', 'term', 'home_ownership', 'purpose', 
           'addr_state', 'initial_list_status', 'application_type']
numeric = ['loan_amnt', 'int_rate', 'installment', 'log_annual_inc', 'dti', 'earliest_cr_line', 
           'fico_range', 'revol_util', 'emp_length', 'mort_acc', 'pub_rec_bankruptcies', 'pub_rec']

In [16]:
def variable_selection(df, y, p = 0.01):
    corr = df.join(y).corr()['loan_status']
    result_col = [x for x in df.columns if abs(corr[x]) > p]
    return result_col

def preprocess_numeric(df1):
    df = df1.copy()
    df['earliest_cr_line'] = df['earliest_cr_line'].apply(lambda x: int(x[-4:]))
    
    df['emp_length'].replace('10+ years', '10 years', inplace=True)
    df['emp_length'].replace('< 1 year', '0 years', inplace=True)
    df['emp_length'] = df['emp_length'].apply(lambda x: int(x.split(' ')[0]) if not pd.isnull(x) else x)
    
    imputer = Imputer()
    missing_col = df.columns[df.isnull().any()]
    df[missing_col] = imputer.fit_transform(df[missing_col].values)
    df = df.apply(lambda x: x.clip(upper = x.quantile(0.95), lower = x.quantile(0.05)))
    #lam = 0.15
    #for col in df.columns:
        #if abs(df[col].skew()) > 1:
            #df[col] = boxcox(df[col] + 0.00001, lam)
    return df

# encode categories for all columns in df
def encode_nominal(df):
    return pd.get_dummies(df)

# combine all functions into preprocessing
def preprocess(df, numeric, nominal):
    result = encode_nominal(df[nominal]).join(preprocess_numeric(df[numeric]))
    return result

In [17]:
X = preprocess(X, numeric, nominal)

In [23]:
error = []

for i in range(1, 4):
    train_X = X[~X.index.isin(test_id['test%i' % i])]
    train_y = y[~X.index.isin(test_id['test%i' % i])]

    keep_col = variable_selection(train_X, train_y)

    X1 = X[keep_col]
    X_train = X1[~X.index.isin(test_id['test%i' % i])]
    y_train = y[~y.index.isin(test_id['test%i' % i])]
    X_test = X1[X.index.isin(test_id['test%i' % i])]
    y_test = y[y.index.isin(test_id['test%i' % i])]

    xgb1 = xgb.XGBRegressor(objective ='reg:linear', subsample = 0.8, colsample_bytree = 0.8, learning_rate = 0.1,
                           max_depth = 7, alpha = 10, n_estimators = 100, min_child_weight = 6)
    xgb1.fit(X_train, y_train)
    y_pred_xgb = xgb1.predict(X_test)
    y_pred_xgb[y_pred_xgb < 0] = np.exp(-5)

    error.append(log_loss(y_test, y_pred_xgb))

In [24]:
np.mean(error)

0.46009503230468723

In [22]:
np.mean(error)

0.45938091523140695