In this notebook we take the approach of randomly undersampling the majority class (0 i.e. happy) to create balanced training data sets.

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [4]:
# We import custom utility functions for data processing and random forest training
from process_data import process, create_submission, drop_columns
from train_forests import trainForests, mean_ensemble

In [5]:
data = pd.read_csv('data/train_saldo.csv')

In [4]:
#process(data)

In [6]:
# New feature counting zero entries
original_features = data.columns[:-1]
data.insert(len(original_features),'SumZeros',(data[original_features] == 0).sum(axis=1))

In [7]:
# New feature describing the number of assets
asset_features = [name for name in data.columns if 'ind' in name]
temp = data[asset_features].sum(axis=1)
data.insert(data.shape[1]-1, 'NumAssets', temp)

In [3]:
# New feature counting mean balance of non zero saldo variables
saldo_features = [name for name in data.columns if 'saldo_v' in name]
temp = data[saldo_features]
temp[temp == 0] = np.nan
temp = np.nanmean(temp, axis=1)
data.insert(data.shape[1]-1, 'MeanBalances', temp)       

NameError: name 'data' is not defined

In [8]:
# New feature counting mean balance of non zero saldo variables
saldo_medio_features = [name for name in data.columns if 'saldo_medio' in name]
temp = data[saldo_medio_features]
temp[temp == 0] = np.nan
temp = np.nanmean(temp, axis=1)
data.insert(data.shape[1]-1, 'MeanMeanBalances', temp)    



In [9]:
data = data.fillna(0)

In [10]:
#drop_columns(data, asset_features+saldo_features+saldo_medio_features)

In [7]:
data.head()

Unnamed: 0,mean_saldo_medio_ult3,mean_saldo_medio_ult1,mean_saldo_medio_hace3,mean_saldo_medio_hace2,mean_saldo,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,...,num_venta_var44_hace3,num_venta_var44_ult1,num_var45_hace2,num_var45_hace3,num_var45_ult1,num_var45_ult3,var38,SumZeros,NumAssets,TARGET
0,0.0,0.0,0.0,0.0,0.0,0,23,0.0,0.0,0.0,...,0,0,0,0,0,0,39205.17,262,4,0
1,5.344853,5.916563,3.232779,4.090454,5.789128,0,34,0.0,0.0,0.0,...,0,0,0,0,0,0,49278.03,239,9,0
2,5.223912,5.833154,0.648847,3.403428,5.673626,0,23,0.0,0.0,0.0,...,0,0,0,0,0,0,67333.77,249,6,0
3,5.294992,5.858761,0.0,3.879004,5.705782,0,37,0.0,195.0,195.0,...,0,0,27,3,18,48,64007.97,219,13,0
4,9.218883,9.468189,0.655791,3.403428,9.99384,0,39,0.0,0.0,0.0,...,0,0,0,0,0,0,117310.979016,231,11,0


In [8]:
def feature_importance(n_trees, data):
    rf = RandomForestClassifier(n_trees)
    rf.fit(data.ix[:,:-1], data.ix[:,-1])
    fimp = rf.feature_importances_
    important = {}
    for idx, name in enumerate(data.ix[:,:-1].columns):
        important[name] = fimp[idx]
    return important

In [13]:
# We can use an initial random forest to select important features
if True:
    threshold = 0.01
    rank = feature_importance(500, data)
    print("We have ranked the features as follows:".format(len(selected)))
    flag = 1
    count = 0
    for a in sorted(selected.keys(), key=selected.get)[::-1]:
        aux = selected[a]
        count+= 1
        if aux < threshold and flag:
            flag = 0
            print('---'*5)
            print('There are {} features above {} '.format(count, threshold))
            print('---'*5)
        print(a,'-->',selected[a])

We have ranked the features as follows:
var38 --> 0.384676788967
var15 --> 0.174755697458
mean_saldo_medio_ult3 --> 0.0349544164114
mean_saldo_medio_hace3 --> 0.0295736058921
SumZeros --> 0.0207480309543
mean_saldo_medio_hace2 --> 0.0206811160103
mean_saldo_medio_ult1 --> 0.0206600961194
num_var45_ult3 --> 0.0188494355936
mean_saldo --> 0.0185687931255
num_var45_hace3 --> 0.0156202927445
num_var45_hace2 --> 0.0142572241725
num_var22_ult3 --> 0.0132212321041
num_var45_ult1 --> 0.0108036242197
num_med_var45_ult3 --> 0.0102241993305
-----There are 15 features above 0.01-----
num_var22_hace2 --> 0.00971274607586
num_var22_hace3 --> 0.00957686562
var36 --> 0.00820081785725
num_var22_ult1 --> 0.00730728062059
num_meses_var39_vig_ult3 --> 0.00687222221618
num_med_var22_ult3 --> 0.0053219443016
NumAssets --> 0.0052336787654
num_meses_var5_ult3 --> 0.00512512462368
imp_op_var41_ult1 --> 0.00472699171541
imp_op_var39_ult1 --> 0.00466057233638
imp_op_var39_comer_ult3 --> 0.00445057648392
imp_op_v

### Our intuition is correct as all the created variables are in the top of the importance rank.

In [9]:
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [72]:
train.head()

Unnamed: 0,mean_saldo_medio_ult3,mean_saldo_medio_ult1,mean_saldo_medio_hace3,mean_saldo_medio_hace2,mean_saldo,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,...,num_venta_var44_hace3,num_venta_var44_ult1,num_var45_hace2,num_var45_hace3,num_var45_ult1,num_var45_ult3,var38,SumZeros,NumAssets,TARGET
0,5.224154,5.833154,0.727549,3.403428,5.673626,0,23,0.0,0.0,0.0,...,0,0,0,0,0,0,76041.96,247,6,0
1,9.252964,9.638293,9.120599,9.617788,9.81415,0,51,0.0,0.0,0.0,...,0,0,9,30,9,48,94024.05,234,9,0
2,0.0,0.0,0.0,0.0,0.0,0,44,0.0,0.0,0.0,...,0,0,0,0,0,0,111306.99,262,4,0
3,5.244495,5.849711,0.749843,3.577026,5.696852,0,57,0.0,0.0,0.0,...,0,0,6,3,0,9,117310.979016,243,6,0
4,0.0,0.0,0.0,0.0,0.0,0,61,0.0,0.0,0.0,...,0,0,0,0,0,0,97021.02,262,4,0


According to the cross-validation analysis, the best results are obtained with N=4, w=1, N_forest=60, n_trees=300

In [10]:
a = 0.25 # a can also be >1 to also oversample the minority class (1 ie. unhappy)
w = 1
N_forest = 60
n_trees = 300

In [11]:
rfs = trainForests(train, a, w, N_forest, n_trees)

In [12]:
X_test, Y_test = test.ix[:,:-1], test.ix[:,-1]

In [13]:
Y_prob = mean_ensemble(rfs, X_test)

In [14]:
roc_auc_score(Y_test,Y_prob['arithmetic'])

0.83139857787121751

In [15]:
roc_auc_score(Y_test,Y_prob['geometric'])

0.83336689096482208

Using select_features, we cut down to 19 important features and obtained 0.838 and 0.839.

Without using select_features we ended up with .8348 and .8357 :/

## Meta Ensemble

Now we are going to use the random forest classifiers trained above to create a meta predictor.

In [16]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [95]:
train.head()

Unnamed: 0,mean_saldo_medio_ult3,mean_saldo_medio_ult1,mean_saldo_medio_hace3,mean_saldo_medio_hace2,mean_saldo,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,...,num_venta_var44_hace3,num_venta_var44_ult1,num_var45_hace2,num_var45_hace3,num_var45_ult1,num_var45_ult3,var38,SumZeros,NumAssets,TARGET
0,5.224154,5.833154,0.727549,3.403428,5.673626,0,23,0.0,0.0,0.0,...,0,0,0,0,0,0,76041.96,247,6,0
1,9.252964,9.638293,9.120599,9.617788,9.81415,0,51,0.0,0.0,0.0,...,0,0,9,30,9,48,94024.05,234,9,0
2,0.0,0.0,0.0,0.0,0.0,0,44,0.0,0.0,0.0,...,0,0,0,0,0,0,111306.99,262,4,0
3,5.244495,5.849711,0.749843,3.577026,5.696852,0,57,0.0,0.0,0.0,...,0,0,6,3,0,9,117310.979016,243,6,0
4,0.0,0.0,0.0,0.0,0.0,0,61,0.0,0.0,0.0,...,0,0,0,0,0,0,97021.02,262,4,0


In [17]:
X_train, Y_train = train.ix[:,:-1], train['TARGET']

In [97]:
X_train.head()

Unnamed: 0,mean_saldo_medio_ult3,mean_saldo_medio_ult1,mean_saldo_medio_hace3,mean_saldo_medio_hace2,mean_saldo,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,...,num_trasp_var33_out_ult1,num_venta_var44_hace3,num_venta_var44_ult1,num_var45_hace2,num_var45_hace3,num_var45_ult1,num_var45_ult3,var38,SumZeros,NumAssets
0,5.224154,5.833154,0.727549,3.403428,5.673626,0,23,0.0,0.0,0.0,...,0,0,0,0,0,0,0,76041.96,247,6
1,9.252964,9.638293,9.120599,9.617788,9.81415,0,51,0.0,0.0,0.0,...,0,0,0,9,30,9,48,94024.05,234,9
2,0.0,0.0,0.0,0.0,0.0,0,44,0.0,0.0,0.0,...,0,0,0,0,0,0,0,111306.99,262,4
3,5.244495,5.849711,0.749843,3.577026,5.696852,0,57,0.0,0.0,0.0,...,0,0,0,6,3,0,9,117310.979016,243,6
4,0.0,0.0,0.0,0.0,0.0,0,61,0.0,0.0,0.0,...,0,0,0,0,0,0,0,97021.02,262,4


In [18]:
for rf in rfs:
    temp = rf.predict(train.ix[:,:-1])
    temp = pd.DataFrame(temp)
    X_train = pd.concat([X_train, temp], axis=1)

In [19]:
train_meta = pd.concat([X_train, train['TARGET']], axis=1)

In [27]:
train_meta.head()

Unnamed: 0,mean_saldo_medio_ult3,mean_saldo_medio_ult1,mean_saldo_medio_hace3,mean_saldo_medio_hace2,mean_saldo,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,...,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,TARGET
0,5.224154,5.833154,0.727549,3.403428,5.673626,0,23,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,9.252964,9.638293,9.120599,9.617788,9.81415,0,51,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0,44,0.0,0.0,0.0,...,1,1,1,1,1,1,1,1,1,0
3,5.244495,5.849711,0.749843,3.577026,5.696852,0,57,0.0,0.0,0.0,...,1,0,1,0,1,1,1,1,1,0
4,0.0,0.0,0.0,0.0,0.0,0,61,0.0,0.0,0.0,...,1,1,1,1,1,1,1,1,1,0


In [21]:
X_test, Y_test = test.ix[:,:-1], test['TARGET']

In [22]:
for rf in rfs:
    temp = rf.predict(test.ix[:,:-1])
    temp = pd.DataFrame(temp)
    X_test = pd.concat([X_test, temp], axis=1)

In [23]:
Y_test = test['TARGET']

In [24]:
X_test.head()

Unnamed: 0,mean_saldo_medio_ult3,mean_saldo_medio_ult1,mean_saldo_medio_hace3,mean_saldo_medio_hace2,mean_saldo,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,...,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,5.412605,5.986216,1.881499,4.443568,5.951955,0,45,0.0,709.05,1399.17,...,0,0,0,0,0,0,0,0,0,0
1,9.968666,10.099935,9.753035,10.087061,10.278187,0,60,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,5.252111,5.849711,1.938742,3.577026,5.696852,0,27,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,5.644089,6.013693,4.47396,5.171466,5.805051,0,38,0.0,269.4,754.05,...,0,1,0,1,0,1,1,1,0,1
4,5.224202,5.833154,0.741937,3.403428,5.673626,0,23,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
from sklearn.linear_model import LogisticRegression

In [31]:
# random regularization
cv = {}

In [32]:
for _ in range(20):
    n_trees = np.random.randint(50,300)
    lg = RandomForestClassifier(n_estimators = n_trees)
    lg.fit(X_train, Y_train)
    cv[n_trees] = roc_auc_score(Y_test, lg.predict(X_test))
    print(cv[n_trees], flush=True)

0.526267837978
0.524654645134
0.524757405976
0.525752397273
0.525820904501
0.526336345206
0.52304145229
0.525649636432
0.52571814366
0.527262829276
0.525923665342
0.525444114749
0.525889411728
0.525649636432
0.525581129205
0.525478368363
0.527778269981
0.52571814366
0.524894420431
0.52609656991


In [33]:
for key in sorted(cv.keys(), key=cv.get)[::-1]:
    print(key,cv[key])

75 0.527778269981
227 0.527262829276
137 0.526336345206
209 0.526267837978
65 0.52609656991
229 0.525923665342
199 0.525889411728
177 0.525820904501
283 0.525752397273
231 0.52571814366
121 0.52571814366
217 0.525649636432
225 0.525649636432
109 0.525581129205
205 0.525478368363
158 0.525444114749
299 0.524894420431
127 0.524757405976
69 0.524654645134
97 0.52304145229


### Score analysis

In [None]:
X_test, Y_test = test.ix[:,:-1], test.ix[:,-1]
n = 100
a = 0.25
w = 1
N_forest = 5
n_trees = 5

In [None]:
scores = []
for _ in range(n):
    rfs = trainForests(train, a, w, N_forest,n_trees)
    Y_prob = mean_ensemble(rfs, X_test)
    scores.append(roc_auc_score(Y_test,Y_prob))
scores = pd.DataFrame(scores)

In [None]:
scores.describe()

In [None]:
plt.title("Distribution of scores")
plt.hist(scores)
plt.show()

In [None]:
# If desired, transform probabilities into class labels.
def threshold(Y_prob, threshold = 0.5):
    result = []
    for y in Y_prob:
        if y <= threshold:
            result.append(0)
        else:
            result.append(1)
    return result

In [None]:
# Evaluate class labels
Y_pred = threshold(Y_prob, threshold = 0.5)
_ = eval_classification(test['TARGET'],Y_pred, print_results = True)

In [None]:
# Plot feature importance
def plot_features(forest):  
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    n=len(indices)
    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n), importances[indices],
           color="r", yerr=std[indices], align="center")
    plt.xticks(range(n), indices)
    plt.xlim([-1, n])
    plt.show()

In [None]:
train.head()

# Create Submission

In [None]:
# Retrain forest on the whole 'train.csv' data
rfs = trainForests(data, a, w, N_forest, n_trees)

In [None]:
test = pd.read_csv('data/test.csv')
test_id = test.ix[:,'ID'].values
process(test)

In [None]:
Y_prob = mean_ensemble(rfs,test)

In [None]:
create_submission(test_id, Y_prob['geometric'])

## Ensemble RF and XGBOOST

In [None]:
Y_boost = pd.read_csv('../Kaggle_Santander-master/simplexgbtest.csv')

In [None]:
Y_boost.head()

In [None]:
Y_rf = pd.read_csv('submissions/rforest_ensemble2.csv')

In [None]:
Y_rf.head()

In [None]:
Y_prob = pd.concat([Y_boost,Y_rf.ix[:,'TARGET']], axis=1, ignore_index=True)

In [None]:
Y_prob.rename(columns ={0:'ID', 1:'xgb', 2: 'rfe' }, inplace = True)

In [None]:
# geometric mean ensemble
l = 2 #number of predictors to ensemble
temp = Y_prob.ix[:,1:].product(axis=1)
temp = temp.apply(lambda x: np.power(x, 1./l))
Y_prob['geometric'] = temp

In [None]:
# arithmetic mean ensemble
l = 2 #number of predictors to ensemble
temp = Y_prob[['xgb', 'rfe']].mean(axis=1)
temp = temp.apply(lambda x: np.power(x, 1./l))
Y_prob['arithmetic'] = temp

In [None]:
# difference column
temp = Y_prob['xgb'] - Y_prob['rfe']
Y_prob['xgb - rfe'] = temp

In [None]:
# difference column
temp = Y_prob['geometric'] - Y_prob['arithmetic']
Y_prob['geo - ari'] = temp

In [None]:
Y_prob.head()

In [None]:
plt.title('Differences between XGB and RFE')
plt.hist(Y_prob['xgb - rfe'])
plt.show()

In [None]:
plt.title('Differences between ensembles')
plt.hist(Y_prob['geo - ari'])
plt.show()

In [None]:
create_submission(test_id, Y_prob['arithmetic'])