In [1]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor



In [2]:
#Set locations
train_data_location = 'C:/Users/darre/Documents/Algospark/HSBC/loan-default-prediction/train_v2.csv'
test_data_location = 'C:/Users/darre/Documents/Algospark/HSBC/loan-default-prediction/test_v2.csv'
save_data_location = 'C:/Users/darre/Documents/Algospark/HSBC/loan-default-prediction/preds.csv'

In [3]:
## Import data
train_data = pd.read_csv(train_data_location)
test_data = pd.read_csv(test_data_location)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# View data
train_data.head()
### placeholder: data analytics goes here.

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f770,f771,f772,f773,f774,f775,f776,f777,f778,loss
0,1,126,10,0.686842,1100,3,13699,7201.0,4949.0,126.75,...,5,2.14,-1.54,1.18,0.1833,0.7873,1,0,5,0
1,2,121,10,0.782776,1100,3,84645,240.0,1625.0,123.52,...,6,0.54,-0.24,0.13,0.1926,-0.6787,1,0,5,0
2,3,126,10,0.50008,1100,3,83607,1800.0,1527.0,127.76,...,13,2.89,-1.73,1.04,0.2521,0.7258,1,0,5,0
3,4,134,10,0.439874,1100,3,82642,7542.0,1730.0,132.94,...,4,1.29,-0.89,0.66,0.2498,0.7119,1,0,5,0
4,5,109,9,0.502749,2900,4,79124,89.0,491.0,122.72,...,26,6.11,-3.82,2.51,0.2282,-0.5399,0,0,5,0


In [5]:
#Shape of train data
train_data.shape

(105471, 771)

In [6]:
# Shape of test data (note test data does not have "loss" column)
test_data.shape

(210944, 770)

In [7]:
## Data clean. Remove type=object columns and columns with no info..
for i in train_data.select_dtypes(include=['object']).columns:
    train_data.drop(labels=i, axis=1, inplace=True)
    
for i in train_data.columns:
    if len(set(train_data[i]))==1:
        train_data.drop(labels=[i], axis=1, inplace=True)
        
train_data.shape

(105471, 742)

In [8]:
# Align test dataframe to same columns as in training
unique=set(train_data.columns).intersection(set(test_data.columns))
test_data2= test_data[test_data.columns.intersection(unique)]
test_data2.shape
#Should be 1 column less due to missing "loss" column.

(210944, 741)

In [9]:
# Create a default metric, ie loss = 1 or 0 on training set
train_data['default'] = train_data.loss.apply(lambda x: 1 if x > 0 else 0)

In [10]:
# Set gaps to median and remove any NA values.
cleaned_data = train_data.fillna(train_data.median())
cleaned_data.dropna(axis=0)
cleaned_data.shape

(105471, 743)

In [11]:
#Split training data into train & validate sets
## Leave loss value in for later split of dataframe

features = cleaned_data.drop(axis=1, labels=['default','id'])
targets = pd.DataFrame(cleaned_data['default'])
X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size = 0.2, random_state = 73)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(84376, 741)
(21095, 741)
(84376, 1)
(21095, 1)


In [12]:
# Remove loss value, normalize the  data & convert to arrays
X_train_scaled = X_train.drop(axis=1, labels='loss')
X_val_scaled = X_val.drop(axis=1, labels='loss')

X_train_scaled = sc.fit_transform(X_train_scaled)
X_val_scaled = sc.transform(X_val_scaled)
y_train_scaled = np.array(y_train).reshape((-1, ))
y_val_scaled = np.array(y_val).reshape((-1, ))
print(X_train_scaled.shape)
print(X_val_scaled.shape)
print(y_train_scaled.shape)
print(y_val_scaled.shape)

(84376L, 740L)
(21095L, 740L)
(84376L,)
(21095L,)


In [13]:
#Random fprest classifier
rf_classifier = RandomForestClassifier(n_estimators = 20, max_depth=20, min_samples_split=5, random_state=73)

In [14]:
## Find optimal F1 for a grid of cutoffs
def bestF1(obs,pred):
    best = 0
    bestcut = 0
    for cutoff in np.arange(0.01,0.99,0.01):
        tmp = f1_score(obs,pd.Series(pred > cutoff).apply(lambda x: 1 if x else 0))
        if tmp > best:
            best = tmp
            bestcut = cutoff
    return best

In [15]:
#Fit classifier model
model1 =rf_classifier.fit(X_train_scaled,y_train_scaled)
# Placeholder for cross validation <here>

In [16]:
# Evaluation metrics for training data default prediction
y_train_preds = model1.predict_proba(X_train_scaled)[:,1]
y_train_include = np.where(y_train_preds<0.5,0,1)
accuracy_train = accuracy_score(y_train_scaled,y_train_include)
F1_train = bestF1(y_train_scaled,y_train_preds)
AUC_train = roc_auc_score(y_train_scaled,y_train_preds)
print accuracy_train
print F1_train
print AUC_train


  'precision', 'predicted', average, warn_for)


0.927586043425
0.894473684211
0.99029440777


In [17]:
# Evaluation metrics for validation data default prediction 
y_val_preds = model1.predict_proba(X_val_scaled)[:,1]
y_val_include = np.where(y_val_preds<0.5,0,1)
accuracy_val = accuracy_score(y_val_scaled,y_val_include)
F1_val = bestF1(y_val_scaled,y_val_preds)
AUC_val = roc_auc_score(y_val_scaled,y_val_preds)
print accuracy_val
print F1_val
print AUC_val

0.909267598957
0.218712514994
0.645469832862


In [19]:
### Random forest for loss value prediction
rf_predictor = RandomForestRegressor(n_estimators=200,max_depth=20,min_samples_split=2,random_state=73)

In [20]:
### Subset the training set based on instances classified as default
X_train_loss = X_train #take the dataframe with the loss information
X_train_loss['pred_loss']=y_train_include #Add predicted loss for subsetting
X_train_loss_zero = X_train_loss[X_train_loss['pred_loss']==0]

#Subset, transform and calculate based on predicted default category
X_train_loss_positive = X_train_loss[X_train_loss['pred_loss']>0]
y_train_loss_positive = pd.DataFrame(X_train_loss_positive['loss'])
y_train_loss_positive = np.array(y_train_loss_positive).reshape((-1, ))
X_train_default = X_train_loss_positive.drop(axis=1, labels=['pred_loss','loss']) ## Remove the columns for losses and predicted losses
X_train_loss_positive_scaled = sc.fit_transform(X_train_default) #Scale

#Same for the validation set
X_val_loss = X_val #take the dataframe with the loss information
X_val_loss['pred_loss']=y_val_include #Add predicted loss for subsetting
X_val_loss_zero = X_val_loss[X_val_loss['pred_loss']==0]
X_val_loss_positive = X_val_loss[X_val_loss['pred_loss']>0]
y_val_loss_positive = pd.DataFrame(X_val_loss_positive['loss'])
y_val_loss_positive = np.array(y_val_loss_positive).reshape((-1, ))
X_val_default = X_val_loss_positive.drop(axis=1, labels=['pred_loss','loss']) ## Remove the columns for losses and predicted losses
X_val_loss_positive_scaled = sc.fit_transform(X_val_default) #Scale


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [21]:
## Default prediction model
model2 =rf_predictor.fit(X_train_loss_positive_scaled,np.log(y_train_loss_positive))

In [22]:
# Calculate training MAE on predicted defaults for training data
preds_loss_train = np.e**model2.predict(X_train_loss_positive_scaled) ##convert back from logs
MAE_train = np.mean(np.abs(preds_loss_train-y_train_loss_positive))
print MAE_train

1.54376106702


In [23]:
# Calculate MAE of predicted loss across all samples for training data
X_train_loss_positive['pred_loss'] = preds_loss_train ### Convert the pred loss value into the predicted
X_train_with_loss = pd.concat([X_train_loss_positive,X_train_loss_zero])
MAE_train_all = np.mean(np.abs(X_train_with_loss['pred_loss'] - X_train_with_loss['loss'] ))
print MAE_train_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.731210898373


In [24]:
# Calculate validation MAE on predicted defaults for validation data
preds_loss_val = np.e**model2.predict(X_val_loss_positive_scaled) ##convert back from logs
MAE_val = np.mean(np.abs(preds_loss_val - y_val_loss_positive))
print MAE_val

4.58428108768


In [25]:
#Calculate MAE of predicted loss across all samples for validation data
X_val_loss_positive['pred_loss'] = preds_loss_val ### Convert the pred loss value into the predicted
X_val_with_loss = pd.concat([X_val_loss_positive,X_val_loss_zero])
MAE_val_all = np.mean(np.abs(X_val_with_loss['pred_loss'] - X_val_with_loss['loss'] ))
print MAE_val_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.742917412225


In [26]:
### Clean the test data
test_cleaned = test_data2.fillna(cleaned_data.median())
test_cleaned.fillna(0) #captures any NA's from median calc
test_id = test_cleaned['id']
test_cleaned = test_cleaned.drop(axis=1, labels=['id'])
test_cleaned.shape

(210944, 740)

In [27]:
#Transform test data
X_test_scaled = sc.fit_transform(test_cleaned)
print(X_test_scaled.shape)

(210944L, 740L)


In [28]:
## Get default predictions
y_test_preds = model1.predict_proba(X_test_scaled)[:,1]
y_test_include = np.where(y_test_preds<0.5,0,1)

In [29]:
# Prepare test data for loss predictions
X_test_loss = test_cleaned #take the dataframe with the loss information
X_test_loss['pred_loss']=y_test_include #Add predicted loss for subsetting
X_test_loss['id']=test_id
X_test_loss_zero = X_test_loss[X_test_loss['pred_loss']==0]

#Subset, transform and calculate based on predicted loss category
X_test_loss_positive = X_test_loss[X_test_loss['pred_loss']>0]
X_test_loss_positive= X_test_loss_positive.drop(axis=1, labels='pred_loss')
test_id2 =X_test_loss_positive['id']
X_test_loss_positive= X_test_loss_positive.drop(axis=1, labels='id')
X_test_loss_positive_scaled = sc.fit_transform(X_test_loss_positive) #Scale

In [30]:
# Calculate predicted losses on defaults
preds_loss_test = np.e**model2.predict(X_test_loss_positive_scaled) ##convert back from logs


In [31]:
# Calculate values across cases and write to csv
X_test_loss_positive['pred_loss'] = preds_loss_test ### Convert the pred loss value into the predicted
X_test_loss_positive['id'] = test_id2
test_data_with_predictions = pd.concat([X_test_loss_positive,X_test_loss_zero])
test_data_with_predictions.rename(columns={'pred_loss':'loss'}, inplace=True)
test_data_with_predictions[['id','loss']].to_csv(save_data_location,index=False)