In [10]:
import pandas as pd
import numpy as np
import os, sys, joblib, math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import KFold

In [2]:
# GLOBALS
LOCAL_ROOT = '/Users/nathvaru/Documents/personal/AV/janatahack_crosssell/'
DATA_DIR = os.path.join(LOCAL_ROOT, 'data')
TRAIN_FN = os.path.join(DATA_DIR, 'train.csv')
TEST_FN = os.path.join(DATA_DIR, 'test.csv')
SUBMISSION_FN = os.path.join(DATA_DIR, 'sample_submission_iA3afxn.csv')

In [3]:
# read data
df_train = pd.read_csv(TRAIN_FN)
df_test = pd.read_csv(TEST_FN)

## Exploration

In [4]:
print(df_train.shape, '\t', df_test.shape)
df_train.head()

(381109, 12) 	 (127037, 11)


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [8]:
print(df_train.shape[0] == df_train['id'].nunique())
print(df_test.shape[0] == df_test['id'].nunique())

True
True


In [10]:
train_ids = df_train['id'].unique().tolist()
test_ids = df_test['id'].unique().tolist()
set(train_ids).intersection(set(test_ids))

set()

In [11]:
df_train['Gender'].value_counts()

Male      206089
Female    175020
Name: Gender, dtype: int64

In [7]:
print(df_train['Age'].nunique())
print(df_train['Age'].describe())

66
count    381109.000000
mean         38.822584
std          15.511611
min          20.000000
25%          25.000000
50%          36.000000
75%          49.000000
max          85.000000
Name: Age, dtype: float64


In [14]:
df_train['Driving_License'].value_counts()

1    380297
0       812
Name: Driving_License, dtype: int64

In [19]:
df_train['Region_Code'].value_counts()

28.0    106415
8.0      33877
46.0     19749
41.0     18263
15.0     13308
30.0     12191
29.0     11065
50.0     10243
3.0       9251
11.0      9232
36.0      8797
33.0      7654
47.0      7436
35.0      6942
6.0       6280
45.0      5605
37.0      5501
18.0      5153
48.0      4681
14.0      4678
39.0      4644
10.0      4374
21.0      4266
2.0       4038
13.0      4036
7.0       3279
12.0      3198
9.0       3101
27.0      2823
32.0      2787
43.0      2639
17.0      2617
26.0      2587
25.0      2503
24.0      2415
38.0      2026
0.0       2021
16.0      2007
31.0      1960
23.0      1960
20.0      1935
49.0      1832
4.0       1801
34.0      1664
19.0      1535
22.0      1309
40.0      1295
5.0       1279
1.0       1008
44.0       808
42.0       591
52.0       267
51.0       183
Name: Region_Code, dtype: int64

In [18]:
df_train['Vehicle_Age'].value_counts()

1-2 Year     200316
< 1 Year     164786
> 2 Years     16007
Name: Vehicle_Age, dtype: int64

In [20]:
df_train['Annual_Premium'].describe()

count    381109.000000
mean      30564.389581
std       17213.155057
min        2630.000000
25%       24405.000000
50%       31669.000000
75%       39400.000000
max      540165.000000
Name: Annual_Premium, dtype: float64

In [22]:
df_train['Policy_Sales_Channel'].nunique()

155

In [5]:
print(df_train['Vintage'].nunique())
print(df_train['Vintage'].value_counts())

290
256    1418
73     1410
282    1397
158    1394
187    1392
       ... 
205    1235
89     1234
32     1230
224    1227
277    1226
Name: Vintage, Length: 290, dtype: int64


In [26]:
df_train['Response'].value_counts(), df_train['Response'].mean()

(0    334399
 1     46710
 Name: Response, dtype: int64,
 0.12256336113815208)

In [28]:
df_train.isnull().sum(), df_test.isnull().sum()

(id                      0
 Gender                  0
 Age                     0
 Driving_License         0
 Region_Code             0
 Previously_Insured      0
 Vehicle_Age             0
 Vehicle_Damage          0
 Annual_Premium          0
 Policy_Sales_Channel    0
 Vintage                 0
 Response                0
 dtype: int64,
 id                      0
 Gender                  0
 Age                     0
 Driving_License         0
 Region_Code             0
 Previously_Insured      0
 Vehicle_Age             0
 Vehicle_Damage          0
 Annual_Premium          0
 Policy_Sales_Channel    0
 Vintage                 0
 dtype: int64)

## Feature Engineering

In [20]:
# concat
df_train['sample'] = 'train'
df_test['sample'] = 'test'
df_test['Response'] = None

cols = list(df_train.columns)
df = pd.concat([df_train[cols], df_test[cols]], axis=0)
df.reset_index(drop=True, inplace=True)

print(df.shape)
print(df['sample'].value_counts())

(508146, 13)
train    381109
test     127037
Name: sample, dtype: int64


In [21]:
# mapping for categorical vars
from sklearn.preprocessing import LabelEncoder
cat_vars = ['Gender', 'Driving_License', 'Region_Code',
            'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage',
            'Policy_Sales_Channel']
le_pipes = []
for var in cat_vars:
    le = LabelEncoder()
    le.fit(df[var].values)

    df[var] = le.transform(df[var].values)
    le_pipes.append((var, le))
    
# categorize Age
min_age = int(math.ceil(df['Age'].min()/10.0))*10
max_age = int(math.ceil(df['Age'].max()/10.0))*10 + 10
age_map = list(range(min_age, max_age, 10))
df['Age_bucket'] = None
for i, v in enumerate(age_map):
    mask1 = df['Age_bucket'].isnull()
    mask2 = df['Age'] < v
    df.loc[mask1&mask2, 'Age_bucket'] = i
df['Age_bucket'] = df['Age_bucket'].astype(int)
assert df['Age_bucket'].isnull().sum() == 0
print(df.groupby('Age_bucket')['Age'].describe())
cat_vars.append('Age_bucket')
df['Response'] = df['Response'].astype(float)

               count       mean       std   min   25%   50%   75%   max
Age_bucket                                                             
1           206983.0  24.079040  2.347298  20.0  22.0  24.0  26.0  29.0
2            72344.0  34.543224  2.988319  30.0  32.0  35.0  37.0  39.0
3           102686.0  44.405693  2.785210  40.0  42.0  44.0  47.0  49.0
4            63882.0  53.972136  2.844967  50.0  51.0  54.0  56.0  59.0
5            38767.0  64.174633  2.875002  60.0  62.0  64.0  67.0  69.0
6            22107.0  73.790881  2.758373  70.0  71.0  73.0  76.0  79.0
7             1377.0  80.282498  0.857215  80.0  80.0  80.0  80.0  85.0


In [22]:
# Ratio of Annual_Premium and Vintage
mask = df['Vintage'] != 0
df.loc[mask, 'Annual_Premium_RATIO_Vintage'] = list(map(
    lambda x, y: 1.*x/y, df.loc[mask, 'Annual_Premium'],
    df.loc[mask, 'Vintage']))
df.loc[~mask, 'Annual_Premium_RATIO_Vintage'] = -1
print(df['Annual_Premium_RATIO_Vintage'].describe())

count    508146.000000
mean        363.791248
std         547.524561
min           8.795987
25%         117.325923
50%         194.734799
75%         374.078574
max       33717.285714
Name: Annual_Premium_RATIO_Vintage, dtype: float64


In [25]:
num_vars = ['Age', 'Annual_Premium', 'Vintage',
            'Annual_Premium_RATIO_Vintage']

for cat_var in cat_vars:
    for num_var in num_vars:
        for func in ['mean', 'sum', 'min', 'max', 'nunique', 'std']:
            feat_name = '_'.join([func, num_var, 'per', cat_var])
            print(feat_name)
            df[feat_name] = df.groupby([cat_var])[num_var].transform(func)
            print('\n')

mean_Age_per_Gender


sum_Age_per_Gender


min_Age_per_Gender


max_Age_per_Gender


nunique_Age_per_Gender


std_Age_per_Gender


mean_Annual_Premium_per_Gender


sum_Annual_Premium_per_Gender


min_Annual_Premium_per_Gender


max_Annual_Premium_per_Gender


nunique_Annual_Premium_per_Gender


std_Annual_Premium_per_Gender


mean_Vintage_per_Gender


sum_Vintage_per_Gender


min_Vintage_per_Gender


max_Vintage_per_Gender


nunique_Vintage_per_Gender


std_Vintage_per_Gender


mean_Annual_Premium_RATIO_Vintage_per_Gender


sum_Annual_Premium_RATIO_Vintage_per_Gender


min_Annual_Premium_RATIO_Vintage_per_Gender


max_Annual_Premium_RATIO_Vintage_per_Gender


nunique_Annual_Premium_RATIO_Vintage_per_Gender


std_Annual_Premium_RATIO_Vintage_per_Gender


mean_Age_per_Driving_License


sum_Age_per_Driving_License


min_Age_per_Driving_License


max_Age_per_Driving_License


nunique_Age_per_Driving_License


std_Age_per_Driving_License


mean_Annual_Premium_per_Driving_License


sum_Annua

In [26]:
for cat_var in cat_vars:
    print(cat_var)
    df[cat_var+'_count'] = df.groupby(cat_var)['Age'].transform('count')

Gender
Driving_License
Region_Code
Previously_Insured
Vehicle_Age
Vehicle_Damage
Policy_Sales_Channel
Age_bucket


In [27]:
# interaction terms
from itertools import combinations

iter_cat_vars = list(combinations(cat_vars, 2))

for f1, f2 in iter_cat_vars:
    for num_var in num_vars:
        for func in ['mean', 'sum', 'min', 'max', 'nunique', 'std']:
            feat_name = '_'.join([func, num_var, 'per', f1, f2])
            print(feat_name)
            df[feat_name] = df.groupby([f1, f2])[num_var].transform(func)
            print('\n')

mean_Age_per_Gender_Driving_License


sum_Age_per_Gender_Driving_License


min_Age_per_Gender_Driving_License


max_Age_per_Gender_Driving_License


nunique_Age_per_Gender_Driving_License


std_Age_per_Gender_Driving_License


mean_Annual_Premium_per_Gender_Driving_License


sum_Annual_Premium_per_Gender_Driving_License


min_Annual_Premium_per_Gender_Driving_License


max_Annual_Premium_per_Gender_Driving_License


nunique_Annual_Premium_per_Gender_Driving_License


std_Annual_Premium_per_Gender_Driving_License


mean_Vintage_per_Gender_Driving_License


sum_Vintage_per_Gender_Driving_License


min_Vintage_per_Gender_Driving_License


max_Vintage_per_Gender_Driving_License


nunique_Vintage_per_Gender_Driving_License


std_Vintage_per_Gender_Driving_License


mean_Annual_Premium_RATIO_Vintage_per_Gender_Driving_License


sum_Annual_Premium_RATIO_Vintage_per_Gender_Driving_License


min_Annual_Premium_RATIO_Vintage_per_Gender_Driving_License


max_Annual_Premium_RATIO_Vintage_per_Gende



sum_Annual_Premium_per_Driving_License_Region_Code


min_Annual_Premium_per_Driving_License_Region_Code


max_Annual_Premium_per_Driving_License_Region_Code


nunique_Annual_Premium_per_Driving_License_Region_Code


std_Annual_Premium_per_Driving_License_Region_Code


mean_Vintage_per_Driving_License_Region_Code


sum_Vintage_per_Driving_License_Region_Code


min_Vintage_per_Driving_License_Region_Code


max_Vintage_per_Driving_License_Region_Code


nunique_Vintage_per_Driving_License_Region_Code


std_Vintage_per_Driving_License_Region_Code


mean_Annual_Premium_RATIO_Vintage_per_Driving_License_Region_Code


sum_Annual_Premium_RATIO_Vintage_per_Driving_License_Region_Code


min_Annual_Premium_RATIO_Vintage_per_Driving_License_Region_Code


max_Annual_Premium_RATIO_Vintage_per_Driving_License_Region_Code


nunique_Annual_Premium_RATIO_Vintage_per_Driving_License_Region_Code


std_Annual_Premium_RATIO_Vintage_per_Driving_License_Region_Code


mean_Age_per_Driving_License_Previously_I



nunique_Annual_Premium_per_Region_Code_Previously_Insured


std_Annual_Premium_per_Region_Code_Previously_Insured


mean_Vintage_per_Region_Code_Previously_Insured


sum_Vintage_per_Region_Code_Previously_Insured


min_Vintage_per_Region_Code_Previously_Insured


max_Vintage_per_Region_Code_Previously_Insured


nunique_Vintage_per_Region_Code_Previously_Insured


std_Vintage_per_Region_Code_Previously_Insured


mean_Annual_Premium_RATIO_Vintage_per_Region_Code_Previously_Insured


sum_Annual_Premium_RATIO_Vintage_per_Region_Code_Previously_Insured


min_Annual_Premium_RATIO_Vintage_per_Region_Code_Previously_Insured


max_Annual_Premium_RATIO_Vintage_per_Region_Code_Previously_Insured


nunique_Annual_Premium_RATIO_Vintage_per_Region_Code_Previously_Insured


std_Annual_Premium_RATIO_Vintage_per_Region_Code_Previously_Insured


mean_Age_per_Region_Code_Vehicle_Age


sum_Age_per_Region_Code_Vehicle_Age


min_Age_per_Region_Code_Vehicle_Age


max_Age_per_Region_Code_Vehicle_Age


nuniq



min_Annual_Premium_RATIO_Vintage_per_Previously_Insured_Vehicle_Damage


max_Annual_Premium_RATIO_Vintage_per_Previously_Insured_Vehicle_Damage


nunique_Annual_Premium_RATIO_Vintage_per_Previously_Insured_Vehicle_Damage


std_Annual_Premium_RATIO_Vintage_per_Previously_Insured_Vehicle_Damage


mean_Age_per_Previously_Insured_Policy_Sales_Channel


sum_Age_per_Previously_Insured_Policy_Sales_Channel


min_Age_per_Previously_Insured_Policy_Sales_Channel


max_Age_per_Previously_Insured_Policy_Sales_Channel


nunique_Age_per_Previously_Insured_Policy_Sales_Channel


std_Age_per_Previously_Insured_Policy_Sales_Channel


mean_Annual_Premium_per_Previously_Insured_Policy_Sales_Channel


sum_Annual_Premium_per_Previously_Insured_Policy_Sales_Channel


min_Annual_Premium_per_Previously_Insured_Policy_Sales_Channel


max_Annual_Premium_per_Previously_Insured_Policy_Sales_Channel


nunique_Annual_Premium_per_Previously_Insured_Policy_Sales_Channel


std_Annual_Premium_per_Previously_Insured_P



nunique_Annual_Premium_RATIO_Vintage_per_Vehicle_Damage_Policy_Sales_Channel


std_Annual_Premium_RATIO_Vintage_per_Vehicle_Damage_Policy_Sales_Channel


mean_Age_per_Vehicle_Damage_Age_bucket


sum_Age_per_Vehicle_Damage_Age_bucket


min_Age_per_Vehicle_Damage_Age_bucket


max_Age_per_Vehicle_Damage_Age_bucket


nunique_Age_per_Vehicle_Damage_Age_bucket


std_Age_per_Vehicle_Damage_Age_bucket


mean_Annual_Premium_per_Vehicle_Damage_Age_bucket


sum_Annual_Premium_per_Vehicle_Damage_Age_bucket


min_Annual_Premium_per_Vehicle_Damage_Age_bucket


max_Annual_Premium_per_Vehicle_Damage_Age_bucket


nunique_Annual_Premium_per_Vehicle_Damage_Age_bucket


std_Annual_Premium_per_Vehicle_Damage_Age_bucket


mean_Vintage_per_Vehicle_Damage_Age_bucket


sum_Vintage_per_Vehicle_Damage_Age_bucket


min_Vintage_per_Vehicle_Damage_Age_bucket


max_Vintage_per_Vehicle_Damage_Age_bucket


nunique_Vintage_per_Vehicle_Damage_Age_bucket


std_Vintage_per_Vehicle_Damage_Age_bucket


mean_Annual_Premium

In [38]:
# Rank features
for col in ['Region_Code', 'Policy_Sales_Channel', 'Age_bucket']:
    for func in ['first', 'average', 'max', 'min']:
        feat_name = 'rank_{}_{}'.format(col, func)
        if feat_name in df:
            continue
        print(feat_name)
        df[feat_name] = df.groupby(col)[col].rank(method=func,
                                                  ascending=True)

rank_Age_bucket_first
rank_Age_bucket_average
rank_Age_bucket_max
rank_Age_bucket_min


In [39]:
# Rank features based on interactions
for f1, f2 in [('Policy_Sales_Channel', 'Region_Code')]:
    for func in ['first', 'average', 'max', 'min']:
        feat_name = 'rank_{}_{}_{}'.format(f1, f2, func)
        print(feat_name)
        df[feat_name] = df.groupby([f1, f2])[f1].rank(method=func,
                                                      ascending=True)

rank_Policy_Sales_Channel_Region_Code_first
rank_Policy_Sales_Channel_Region_Code_average
rank_Policy_Sales_Channel_Region_Code_max
rank_Policy_Sales_Channel_Region_Code_min


In [40]:
# Rank features based on interactions
for f1, f2 in [('Policy_Sales_Channel', 'Region_Code')]:
    for col in ['Age', 'Annual_Premium', 'Vintage',
                'Annual_Premium_RATIO_Vintage']:
        for func in ['first', 'average', 'max', 'min']:
            feat_name = 'rank_{}_{}_{}_{}'.format(f1, f2, col, func)
            print(feat_name)
            df[feat_name] = df.groupby([f1, f2])[col].rank(method=func,
                                                           ascending=True)

rank_Policy_Sales_Channel_Region_Code_Age_first
rank_Policy_Sales_Channel_Region_Code_Age_average
rank_Policy_Sales_Channel_Region_Code_Age_max
rank_Policy_Sales_Channel_Region_Code_Age_min
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_first
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_average
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_max
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_min
rank_Policy_Sales_Channel_Region_Code_Vintage_first
rank_Policy_Sales_Channel_Region_Code_Vintage_average
rank_Policy_Sales_Channel_Region_Code_Vintage_max
rank_Policy_Sales_Channel_Region_Code_Vintage_min
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_RATIO_Vintage_first
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_RATIO_Vintage_average
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_RATIO_Vintage_max
rank_Policy_Sales_Channel_Region_Code_Annual_Premium_RATIO_Vintage_min


In [41]:
mask = df['sample'] == 'train'

df_train = df.loc[mask, :]
df_train.reset_index(drop=True, inplace=True)
df_test = df.loc[~mask, :]
df_test.reset_index(drop=True, inplace=True)
df_test.drop('Response', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [42]:
del df

In [43]:
print(df_train.shape, '\t', df_test.shape)

(381109, 919) 	 (127037, 918)


In [44]:
def getCountVar(compute_df, count_df, var_name, count_var):
    """
    compute_df : Data frame for which the count encoding should be done
    count_df : Data frame from which the counts should be taken
    var_name : categorical variable for count encoding
    count_var : some other variable from the dataset (used as dummy variable to get count)
    """
    grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count')
    grouped_df.columns = [var_name, "var_count"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["var_count"])


def getDVEncodeVar(compute_df, target_df, var_name, target_var):
    if type(var_name) != type([]):
        var_name = [var_name]
    grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index()
    grouped_df.columns = var_name + ["mean_value"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["mean_value"])


def do_target_encode(train_df, test_df, cols_to_encode, target_col, encode_type, n_splits=3):
        
    kf = KFold(n_splits=n_splits, shuffle=True,
                               random_state=2020)
    for col in cols_to_encode:
        train_enc_values = np.zeros(train_df.shape[0])
        test_enc_values = 0
        for dev_index, val_index in kf.split(train_df):
            new_train_df = train_df[[col, target_col]]
            dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]
            
            if encode_type == 'dv':
                train_enc_values[val_index] =  np.array( 
                    getDVEncodeVar(val_X[[col]], dev_X, col, target_col))
                test_enc_values += np.array( 
                    getDVEncodeVar(test_df[[col]], dev_X, col, target_col))
            elif encode_type == 'count':
                train_enc_values[val_index] =  np.array( 
                    getCountVar(val_X[[col]], dev_X, col, target_col))
                test_enc_values += np.array( 
                    getCountVar(test_df[[col]], dev_X, col, target_col))
        
        test_enc_values /= n_splits
        train_df[col + "_{}_enc_{}".format(encode_type, target_col)] = train_enc_values
        test_df[col + "_{}_enc_{}".format(encode_type, target_col)] = test_enc_values
        
    return train_df, test_df

In [45]:
df_train, df_test = do_target_encode(df_train, df_test, cat_vars,
                                     'Response', 'dv', 3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[col + "_{}_enc_{}".format(encode_type, target_col)] = train_enc_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col + "_{}_enc_{}".format(encode_type, target_col)] = test_enc_values


In [46]:
print(df_train.shape, '\t', df_test.shape)

(381109, 927) 	 (127037, 926)


In [47]:
# drop certain columns
drop_cols = cat_vars + ['sample']
df_train.drop(drop_cols, axis=1, inplace=True)
df_test.drop(drop_cols, axis=1, inplace=True)

print(df_train.shape, '\t', df_test.shape)

(381109, 918) 	 (127037, 917)


In [48]:
# prefix for features
FEAT_PREFIX = 'JHA'
cols = list(df_train.columns)
new_cols = [FEAT_PREFIX + '_'+ col
            if col not in ('id', 'Response') else col for col in cols]
rename_dct = dict(zip(cols, new_cols))
df_train.rename(columns=rename_dct, inplace=True)
df_test.rename(columns=rename_dct, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [49]:
df_train['Response'] = df_train['Response'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Response'] = df_train['Response'].astype(int)


In [32]:
## feature selection based on best model's feature importance
if os.path.exists('imp_feat.txt'):
    feat_imp = pd.read_csv('imp_feat.txt')
    
THRESHOLD = 
selected_feat_cols = feat_imp[feat_imp['fscore'] >= THRESHOLD]['feature'].tolist()
print(len(selected_feat_cols))

142


In [50]:
# modelling
import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics
import operator
from catboost import Pool, CatBoostClassifier


def create_feature_map(features):
    outfile = open('../model/xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = "auc"
    params["eta"] = eta
    params["subsample"] = 0.7
    params["min_child_weight"] = 1
    params["colsample_bytree"] = 0.7
    params["max_depth"] = dep

    params["silent"] = 1
    params["seed"] = seed_val
    # params["max_delta_step"] = 2
    # params["gamma"] = 0.5
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if feature_names is not None:
        create_feature_map(feature_names)
        model.dump_model('../model/xgbmodel.txt', '../model/xgb.fmap',
                         with_stats=True)
        importance = model.get_fscore(fmap='../model/xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1),
                            reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    pred_test_y = model.predict(xgtest,
                                ntree_limit=model.best_ntree_limit)
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2),
                                     ntree_limit=model.best_ntree_limit)
    else:
        pred_test_y2 = None

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2


def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "binary"
    params['metric'] = "auc"
    params['seed'] = seed_val
    params["max_depth"] = dep
    params["num_leaves"] = 70
    params["min_data_in_leaf"] = 20
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        lgtest = lgb.DMatrix(test_X)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X,
                                num_iteration=model.best_iteration)
    
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2,
                                     num_iteration=model.best_iteration)
    else:
        pred_test_y2 = None
        
    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2

In [51]:
# Model building

def trainModel(train_X, train_y, test_X, n_splits, model_name, feats, 
               **params):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X):
        dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        if model_name == "XGB":
            pred_val, acc, pred_test = runXGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'], feature_names=feats)
        elif model_name == "LGB":
            pred_val, acc, pred_test = runLGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'])
        
        cv_scores.append(acc)
        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test

    pred_test_full /= n_splits
    auc = metrics.roc_auc_score(train_y, pred_val_full)
    return pred_val_full, auc, pred_test_full, cv_scores

In [52]:
feat_cols = [x for x in list(df_train.columns)
             if x.startswith(FEAT_PREFIX)]
print('# features: ', len(feat_cols))
x_train = df_train[feat_cols]
y_train = df_train['Response']
x_test = df_test[feat_cols]
test_ids = df_test['id'].values
print('shape ', x_train.shape, x_test.shape)

# features:  916
shape  (381109, 916) (127037, 916)


In [53]:
del df_train, df_test

In [54]:
# XGB
params = {'rounds': 500, 'depth': 6, 'eta': 0.05}
%time pred_val_full, auc, pred_test_full, cv_scores = trainModel(x_train, y_train, x_test, 3, "XGB", feat_cols, **params)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.85062	test-auc:0.84939
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.85882	test-auc:0.85611
[40]	train-auc:0.86066	test-auc:0.85698
[60]	train-auc:0.86216	test-auc:0.85762
[80]	train-auc:0.86347	test-auc:0.85813
[100]	train-auc:0.86480	test-auc:0.85835
[120]	train-auc:0.86612	test-auc:0.85858
[140]	train-auc:0.86733	test-auc:0.85865
[160]	train-auc:0.86891	test-auc:0.85870
[180]	train-auc:0.87052	test-auc:0.85864
[200]	train-auc:0.87227	test-auc:0.85860
[220]	train-auc:0.87368	test-auc:0.85845
[240]	train-auc:0.87518	test-auc:0.85828
Stopping. Best iteration:
[153]	train-auc:0.86840

KeyboardInterrupt: 

In [38]:
cv_scores, auc

([0.8586941067246573, 0.8582014216497449, 0.85892184289203],
 0.8585439729403466)

In [39]:
out_df = pd.DataFrame({"id": df_test['id'].values})
out_df["Response"] = pred_test_full
out_df.to_csv("../model/pred_test_v3_xgb.csv", index=False)

In [117]:
# LGB
params = {'rounds': 600, 'depth': 5, 'eta': 0.05}
%time pred_val_full, auc, pred_test_full, cv_scores = trainModel(x_train, y_train, x_test, 3, "LGB", feat_cols, **params)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.8555
[40]	valid_0's auc: 0.856642
[60]	valid_0's auc: 0.857374
[80]	valid_0's auc: 0.857845
[100]	valid_0's auc: 0.858243
[120]	valid_0's auc: 0.858562


[140]	valid_0's auc: 0.858835
[160]	valid_0's auc: 0.859002
[180]	valid_0's auc: 0.85894
[200]	valid_0's auc: 0.859007
[220]	valid_0's auc: 0.85898
[240]	valid_0's auc: 0.858884


[260]	valid_0's auc: 0.858875
[280]	valid_0's auc: 0.8588
[300]	valid_0's auc: 0.858772
Early stopping, best iteration is:
[210]	valid_0's auc: 0.859053
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.855456
[40]	valid_0's auc: 0.856515
[60]	valid_0's auc: 0.857175


[80]	valid_0's auc: 0.857695
[100]	valid_0's auc: 0.857879
[120]	valid_0's auc: 0.858059
[140]	valid_0's auc: 0.85812
[160]	valid_0's auc: 0.858183
[180]	valid_0's auc: 0.858222


[200]	valid_0's auc: 0.858237
[220]	valid_0's auc: 0.858215
[240]	valid_0's auc: 0.858135
[260]	valid_0's auc: 0.858158
[280]	valid_0's auc: 0.858201
[300]	valid_0's auc: 0.858156


Early stopping, best iteration is:
[209]	valid_0's auc: 0.858265
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.855534
[40]	valid_0's auc: 0.856555
[60]	valid_0's auc: 0.857496
[80]	valid_0's auc: 0.858081
[100]	valid_0's auc: 0.858464
[120]	valid_0's auc: 0.858502


[140]	valid_0's auc: 0.858593
[160]	valid_0's auc: 0.858654
[180]	valid_0's auc: 0.858644
[200]	valid_0's auc: 0.858702
[220]	valid_0's auc: 0.858691
[240]	valid_0's auc: 0.858699


[260]	valid_0's auc: 0.858631
[280]	valid_0's auc: 0.858621
[300]	valid_0's auc: 0.858513
Early stopping, best iteration is:
[205]	valid_0's auc: 0.858722
CPU times: user 2min 27s, sys: 38.9 s, total: 3min 6s
Wall time: 3min 24s


In [118]:
cv_scores, auc

([0.8590533600290691, 0.8582650921889274, 0.8587224270604079],
 0.8586083431923248)

In [119]:
out_df = pd.DataFrame({"id": df_test['id'].values})
out_df["Response"] = pred_test_full
out_df.to_csv("../model/pred_test_v1_lgb.csv", index=False)