### Porto Seguro 

Porto Seguro is the 3rd-largest insurance company in Brazil.

---

__Goal__ - Predict the likelihood that a driver will initiate an insurance claim.

__Notes__ 
- Though this is an insurance claim problem, the data is anonymized in such a way that the meaning of the features is unknown. Therefore, 'domain expertise' cannot easily be used to construct features unless a labeled dataset exists somewhere to compare against.
- Classes are highly imbalanced


In [153]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import *
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Read the competition CSVs from the local ./data directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [35]:
# Preview the first rows of the training frame.
train.head()

In [5]:
# (rows, columns) of the training frame.
train.shape

(595212, 59)

In [24]:
# Tag every column by the first type marker found in its name
# ('_cat', then '_bin', then '_reg'); anything else counts as 'oth'.
kinds = [
    'cat' if '_cat' in c else
    'bin' if '_bin' in c else
    'reg' if '_reg' in c else
    'oth'
    for c in train.columns
]
cat, bi, reg, oth = (kinds.count(k) for k in ('cat', 'bin', 'reg', 'oth'))

print('{} categoricals, {} binaries, {} regressors, {} others'.format(cat,bi,reg,oth))

14 categoricals, 17 binaries, 3 regressors, 25 others


---
Porto Seguro encoded missing values as -1; convert them back to NaN so they can be counted.

In [7]:
# Copy of the training frame with the -1 missing-value sentinel turned into NaN.
# DataFrame.replace already returns a new frame, so no explicit .copy() is needed,
# and np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
train_nan = train.replace(-1, np.nan)

In [16]:
# Count nulls once per column, then report only the columns that have any.
null_counts = train_nan.isnull().sum()
for c, n_null in null_counts[null_counts > 0].items():
    print('{}\t\t\t{} nulls'.format(c, n_null))

ps_ind_02_cat			216 nulls
ps_ind_04_cat			83 nulls
ps_ind_05_cat			5809 nulls
ps_reg_03			107772 nulls
ps_car_01_cat			107 nulls
ps_car_02_cat			5 nulls
ps_car_03_cat			411231 nulls
ps_car_05_cat			266551 nulls
ps_car_07_cat			11489 nulls
ps_car_09_cat			569 nulls
ps_car_11			5 nulls
ps_car_12			1 nulls
ps_car_14			42620 nulls


---
Target variable is highly imbalanced: only about 3.6% of the rows are positive.

In [19]:
# Share of each class among the non-null target values.
target_col = train['target']
target_col.value_counts() / target_col.count()

0    0.963552
1    0.036448
Name: target, dtype: float64

---
Create scoring metric for sklearn

In [103]:
def ginic(actual, pred):
    """Un-normalized Gini coefficient of `pred` as a ranking of `actual`.

    Parameters
    ----------
    actual : array-like of shape (n,)
        True labels / values. Lists, ndarrays and pandas Series are accepted.
    pred : array-like of shape (n,)
        Scores used to rank the rows.

    Returns
    -------
    float
        The Gini sum divided by n. Rows are sorted by *ascending* prediction,
        so a ranking that puts the positives last yields a negative value;
        the sign cancels when the result is divided by ``ginic(actual, actual)``.
        The result is NaN when `actual` sums to zero.
    """
    # Coerce to ndarrays so the fancy indexing below is positional. A pandas
    # Series would otherwise be indexed by label, and a list cannot be indexed
    # by an array at all.
    actual = np.asarray(actual)
    pred = np.asarray(pred)
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return giniSum / n
 
def gini_normalizedc(a, p):
    """Normalized Gini: ginic of the predictions divided by ginic of a perfect ranking.

    Parameters
    ----------
    a : array-like of shape (n,)
        True labels.
    p : array-like of shape (n,) or (n, k)
        Predicted scores. When 2-D (e.g. a predict_proba output), column 1 is used.

    Returns
    -------
    float
        ``ginic(a, p) / ginic(a, a)``.
    """
    # Coerce up front: `p.ndim` needs an ndarray, and ginic indexes `a`
    # positionally, which a pandas Series with a shuffled index would break.
    a = np.asarray(a)
    p = np.asarray(p)
    if p.ndim == 2:
        p = p[:, 1]
    return ginic(a, p) / ginic(a, a)

In [134]:
# Raw (un-normalized) ginic on the validation split.
# NOTE(review): y_valid and y_pred are only assigned in later cells (the split and
# the predict_proba cell), so this cell fails on a fresh top-to-bottom run.
ginic(y_valid.values,y_pred)

-0.12655528759931781

In [104]:
# Wrap the normalized Gini as a scikit-learn scorer. The flags are passed by
# keyword so their meaning is explicit (higher is better; score on probabilities)
# and so the call does not rely on positional order.
gini_sklearn = metrics.make_scorer(gini_normalizedc, greater_is_better=True, needs_proba=True)

Split

In [110]:
# Feature matrix (everything except the row id and the label) and target vector.
x, y = train.drop(['id','target'],axis=1), train['target']
# Hold out 30% for validation. Reuse x/y instead of rebuilding them, fix the seed
# so the split is reproducible, and stratify because the positive class is rare.
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    x, y, test_size=.3, random_state=0, stratify=y)

Model

In [115]:
# Random forest configuration: 400 shallow trees, 20% of the features per split.
rf = ensemble.RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    min_samples_leaf=4,
    max_features=0.2,
    n_jobs=-1,
    random_state=0,
)

In [136]:
# Lasso with regularization strength alpha=0.01.
# NOTE(review): `lr` is not used by any cell visible in this notebook.
lr = linear_model.Lasso(alpha=.01)

In [142]:
# L1-penalized logistic regression. The solver is named explicitly because the
# L1 penalty is only supported by some solvers (liblinear among them), and the
# library default solver has changed between scikit-learn releases.
lg = linear_model.LogisticRegression(penalty='l1', C=.01, solver='liblinear')

In [143]:
# 2-fold cross-validated ROC AUC of the logistic regression on the training split.
# random_state only takes effect when shuffle=True, so shuffling is enabled
# to make the seed meaningful.
kfold = model_selection.KFold(n_splits=2, shuffle=True, random_state=4)
score = model_selection.cross_val_score(lg, x_train, y_train, scoring='roc_auc', cv=kfold)
np.mean(score)

0.60873751495164474

In [141]:
# Convert the mean cross-validated ROC AUC into a Gini score: Gini = 2 * AUC - 1.
2 * np.mean(score) - 1

0.23894280031456105

In [118]:
%%time
# Fit the random forest on the training split; %%time reports how long the cell takes.
rf.fit(x_train,y_train)

CPU times: user 10min 26s, sys: 6.13 s, total: 10min 32s
Wall time: 1min 27s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=0.2, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=4,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=400, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [119]:
# Keep only column 1 of predict_proba (the second class) as the validation score.
y_pred = rf.predict_proba(x_valid)[:,1]

---
Maybe multiply by 2 to approximate the normalized Gini score on the leaderboard (LB)?

In [127]:
# Doubled raw Gini on the validation split.
# NOTE(review): gini_xgb in this notebook normalizes with gini(y, y) instead of a
# fixed factor of 2; `gini` itself is defined in a later cell.
2 * gini(y_valid, y_pred)

0.25311057519863561

Grid

In [None]:
# Search space for the logistic regression: inverse regularization strength C
# over five decades, L1 penalty only.
tuned_parameters = {
    'C': [.001, .01, .1, 1, 10],
    'penalty': ['l1'],
}

In [None]:
# Grid search over tuned_parameters with 2-fold CV. Score with ROC AUC (as in the
# cross_val_score cell) rather than the default accuracy, which is dominated by
# the majority class on this heavily imbalanced target.
grid = model_selection.GridSearchCV(lg, tuned_parameters, scoring='roc_auc', cv=2)

In [None]:
# Run the grid search on the training split.
# NOTE(review): GridSearchCV.fit returns the fitted search object itself, so
# grid_scores refers to the same object as grid rather than to a table of scores.
grid_scores = grid.fit(x_train,y_train)

Keras

In [151]:
import keras

In [None]:
# Import the class name only: the original call parentheses after Sequential and
# the trailing comma after BatchNormalization were both syntax errors.
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Train the same forest configuration on the full training set
# (all columns except the row id and the label).
train_features = train.drop(['id', 'target'], axis=1)
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    min_samples_leaf=4,
    max_features=0.2,
    n_jobs=-1,
    random_state=0,
)
rf.fit(train_features, train.target)
features = train_features.columns.values
print("----- Training Done -----")

----- Training Done -----


In [121]:
# Shape of the validation prediction array produced from x_valid.
y_pred.shape

(178564,)

In [123]:
# Re-read the test set from disk (the same file is also loaded in the first data-loading cell).
test = pd.read_csv('./data/test.csv')

In [124]:
# Build the test feature matrix. 'target' is dropped as well (ignored when absent)
# so that re-running this cell after `test['target']` has been assigned does not
# feed the previous predictions back in as an extra feature column.
X_test = test.drop(['id', 'target'], axis=1, errors='ignore')

# Column 1 of predict_proba is stored as the submission score.
test['target'] = rf.predict_proba(X_test)[:,1]

In [125]:
# Write the id/target pairs as the submission file, five decimals per score.
test[['id', 'target']].to_csv(
    './submissions/3_sub.csv',
    float_format='%.5f',
    index=False,
)

In [45]:
# Load the sample submission file with a default integer index.
# NOTE(review): `sample` is not used by any other cell visible in this notebook.
sample = pd.read_csv('./data/sample_submission.csv',index_col=None)

In [69]:
def gini_xgb(pred, y):
    """Evaluation callback returning the normalized Gini as a ('gini', value) pair.

    `pred` holds the predicted scores; `y` is an object exposing ``get_label()``
    (presumably an xgboost DMatrix, going by the function name — confirm with the caller).
    """
    labels = y.get_label()
    normalized = gini(labels, pred) / gini(labels, labels)
    return 'gini', normalized

In [70]:
def gini(y, pred):
    """Un-normalized Gini coefficient of `pred` as a ranking of `y`.

    Parameters
    ----------
    y : array-like of shape (n,)
        True labels / values.
    pred : array-like of shape (n,)
        Predicted scores.

    Returns
    -------
    float
        Gini sum divided by n. Rows are ordered by *descending* prediction, with
        ties broken by original position, so a good ranking gives a positive value.
        Divide by ``gini(y, y)`` to normalize.
    """
    n = len(y)
    # Columns: label, prediction, original row position (used as the tie-breaker).
    # The builtin `float` replaces the `np.float` alias, which was removed from NumPy.
    g = np.asarray(np.c_[y, pred, np.arange(n)], dtype=float)
    # lexsort treats the LAST key as primary: negative prediction first, then position.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (n + 1) / 2.
    return gs / n