In [1]:

import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from tqdm.notebook import tqdm
tqdm.pandas()
from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn import metrics
from lightgbm import LGBMClassifier
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.feature_extraction import MinimalFCParameters, ComprehensiveFCParameters
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import swifter
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("train.csv")
tdf = pd.read_csv("test.csv")
df['train'] = 1
tdf['train'] = 0
df.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,train
0,1,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0,1
1,2,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0,1
2,3,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1,1
3,4,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0,1
4,5,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0,1


In [3]:
df = pd.concat([df, tdf])
df.isnull().sum()

ID                             0
City_Code                      0
Region_Code                    0
Accomodation_Type              0
Reco_Insurance_Type            0
Upper_Age                      0
Lower_Age                      0
Is_Spouse                      0
Health Indicator           16718
Holding_Policy_Duration    28854
Holding_Policy_Type        28854
Reco_Policy_Cat                0
Reco_Policy_Premium            0
Response                   21805
train                          0
dtype: int64

In [4]:
df.dtypes

ID                           int64
City_Code                   object
Region_Code                  int64
Accomodation_Type           object
Reco_Insurance_Type         object
Upper_Age                    int64
Lower_Age                    int64
Is_Spouse                   object
Health Indicator            object
Holding_Policy_Duration     object
Holding_Policy_Type        float64
Reco_Policy_Cat              int64
Reco_Policy_Premium        float64
Response                   float64
train                        int64
dtype: object

In [5]:
def return_int(x):
    try:
        return int(float(x))
    except:
        if x == "14+":
            return 15
        else:
            return 0
df['Holding_Policy_Duration'].apply(lambda x: return_int(x)).value_counts(dropna=False)


0     28854
1      6390
15     6227
2      6032
3      5192
4      3976
5      3354
6      2797
7      2309
8      1885
9      1607
10     1146
11      800
13      732
12      709
14      677
Name: Holding_Policy_Duration, dtype: int64

In [6]:
def get_min_per_col_for_cat_col(cat_col, reg_col, cat_val):
    cat_val = int(float(cat_val))
    return df.groupby(cat_col).agg({reg_col: ['min']}).loc[cat_val][0]

def get_max_per_col_for_cat_col(cat_col, reg_col, cat_val):
    cat_val = int(float(cat_val))
    return df.groupby(cat_col).agg({reg_col: ['max']}).loc[cat_val][0]

def get_stand_val_per_cat_col(values):
#     print(values[1])
    cat_col = "Reco_Policy_Cat"
    reg_col = "Reco_Policy_Premium"
    mn = get_min_per_col_for_cat_col(cat_col, reg_col, values[0])
    mx = get_max_per_col_for_cat_col(cat_col, reg_col, values[0])
    return (values[1] - mn) / mx

# df.groupby("Reco_Policy_Cat").agg({'Reco_Policy_Premium': ['min', 'max']}).iloc[12]
# df[["Reco_Policy_Cat", 'Reco_Policy_Premium']].swifter.apply(get_stand_val_per_cat_col, axis=1)

In [7]:

df.groupby("Reco_Policy_Cat").agg({'Reco_Policy_Premium': ['min']}).loc[1][0]

2280.0

In [8]:
df[df['Accomodation_Type'] != 'Owned']['Response'].value_counts(dropna=False)

0.0    17485
NaN     9715
1.0     5446
Name: Response, dtype: int64

In [9]:
target_col = "Response"
cat_cols = ['City_Code', 'Region_Code', 
            'Accomodation_Type', 'Reco_Insurance_Type', 'Is_Spouse','Health Indicator', 
            'Holding_Policy_Duration', 'Holding_Policy_Type',
               'Reco_Policy_Cat', 'is_joint_and_spouse']
reg_cols = ['Reco_Policy_Premium', 'Upper_Age', 'Lower_Age']
imputed_reg_cols = ['age_diff']

In [10]:

def min_max_scale(col):
    s = col - col.min()
    s /= col.max()
    return s

def dframe_expand(dframe):
    dframe.reset_index(drop=True, inplace=True)
    dframe['age_diff'] = dframe['Upper_Age'] - dframe['Lower_Age']
    dframe["Reco_Policy_Premium"] = dframe["Reco_Policy_Premium"] / dframe["Reco_Policy_Premium"].max()
    
    dframe["age_diff_scaled"] = min_max_scale(dframe["age_diff"])
    dframe["Upper_Age_scaled"] = min_max_scale(dframe["Upper_Age"])
    dframe["Lower_Age_scaled"] = min_max_scale(dframe["Lower_Age"])
    dframe["Reco_Policy_Premium_scaled"] = min_max_scale(dframe["Reco_Policy_Premium"])
    
    dframe["is_joint_and_spouse"] = pd.Series(np.logical_and(dframe.Is_Spouse == 'Yes' , dframe['Reco_Insurance_Type'] == 'Joint'))
    dframe["holding_policy_reg"] = dframe['Holding_Policy_Duration'].apply(lambda x: return_int(x)).value_counts(dropna=False)
    dframe["holding_policy_reg_scaled"] = min_max_scale(dframe["holding_policy_reg"])
    dframe["is_joint_and_spouse"] = pd.Series(np.logical_and(dframe.Is_Spouse == 'Yes' , dframe['Reco_Insurance_Type'] == 'Joint'))
    
    dframe["reco_policy_premium_per_cat_scaled"] = dframe[["Reco_Policy_Cat", 'Reco_Policy_Premium']].progress_apply(get_stand_val_per_cat_col, axis=1)
    dframe["reco_policy_premium_per_cat_scaled_scaled"] = min_max_scale(dframe["reco_policy_premium_per_cat_scaled"])
    return dframe


df = dframe_expand(df)


  0%|          | 0/72687 [00:00<?, ?it/s]

In [11]:
df

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,...,age_diff,age_diff_scaled,Upper_Age_scaled,Lower_Age_scaled,Reco_Policy_Premium_scaled,is_joint_and_spouse,holding_policy_reg,holding_policy_reg_scaled,reco_policy_premium_per_cat_scaled,reco_policy_premium_per_cat_scaled_scaled
0,1,C3,3213,Rented,Individual,36,36,No,X1,14+,...,0,0.000000,0.240000,0.266667,0.216466,False,28854.0,0.976537,0.216557,0.228559
1,2,C5,1117,Owned,Joint,75,22,No,X2,,...,53,0.898305,0.760000,0.080000,0.647798,False,6390.0,0.197997,0.662763,0.699493
2,3,C5,3732,Owned,Individual,32,32,No,,1.0,...,0,0.000000,0.186667,0.213333,0.121025,False,6032.0,0.185590,0.112112,0.118325
3,4,C24,4378,Owned,Joint,52,48,No,X1,14+,...,4,0.067797,0.453333,0.426667,0.356999,False,5192.0,0.156477,0.352085,0.371598
4,5,C8,2190,Rented,Individual,44,44,No,X2,3.0,...,0,0.000000,0.346667,0.373333,0.188505,False,3976.0,0.114334,0.202995,0.214245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72682,72683,C3,1044,Owned,Individual,45,45,No,X1,4.0,...,0,0.000000,0.360000,0.386667,0.313688,False,,,0.306525,0.323513
72683,72684,C4,266,Owned,Individual,59,59,No,X5,6.0,...,0,0.000000,0.546667,0.573333,0.439465,False,,,0.433537,0.457563
72684,72685,C12,2470,Owned,Individual,74,74,No,X3,,...,0,0.000000,0.746667,0.773333,0.358279,False,,,0.422570,0.445989
72685,72686,C10,1676,Rented,Individual,25,25,No,X4,3.0,...,0,0.000000,0.093333,0.120000,0.215095,False,,,0.207776,0.219291


In [12]:
target_encoder = LabelEncoder()

label_encoders = {}

def train_encoder(dframe, col, test=False):
    if test:
        dframe[col] = label_encoders[col].transform(dframe[col].fillna("nan").astype(str))

    else:
        label_encoders[col] = LabelEncoder()
        dframe[col] = label_encoders[col].fit_transform(dframe[col].fillna("nan").astype(str))
    
for col in tqdm(cat_cols):
    train_encoder(df, col)


  0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
df.dropna(axis=1, how='all', inplace=True)
df.dtypes

ID                                             int64
City_Code                                      int64
Region_Code                                    int64
Accomodation_Type                              int64
Reco_Insurance_Type                            int64
Upper_Age                                      int64
Lower_Age                                      int64
Is_Spouse                                      int64
Health Indicator                               int64
Holding_Policy_Duration                        int64
Holding_Policy_Type                            int64
Reco_Policy_Cat                                int64
Reco_Policy_Premium                          float64
Response                                     float64
train                                          int64
age_diff                                       int64
age_diff_scaled                              float64
Upper_Age_scaled                             float64
Lower_Age_scaled                             f

In [14]:

df.shape

(72687, 25)

In [15]:

param_test = {
 'max_depth':[8, 9, 7, 6, 5],
 'min_child_weight':[5,7,8, 6],
 'learning_rate': [.3, .2, .1],
 'n_estimators': [60, 40, 50]
}
# model = GridSearchCV(
#     estimator=xgb.XGBClassifier(gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), verbose=10,
#     param_grid=param_test, scoring='roc_auc',n_jobs=4,iid=False, cv=4)
model = xgb.XGBClassifier(max_depth=10, random_seed=10, objective= 'binary:logistic', 
                          nthread=4, n_estimator=500)

In [16]:
X = df[df.train == 1].drop(["train", target_col], axis=1)
y = df[df.train == 1][target_col]
X_test = df[df.train == 0].drop(["train", target_col], axis=1)
model.fit(X, y)

Parameters: { n_estimator, random_seed } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimator=500, n_estimators=100, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
preds = model.predict(X_test)

In [18]:
# model.best_params_

In [19]:
def random_state_models(size):
    temp_result = [0] * size
    params = grid_search_result['params']
    for i in tqdm(range(size)):
        m = CatBoostClassifier(random_state = i, task_type="GPU", devices='0:1', 
                           eval_metric='AUC', thread_count=12, 
                           cat_features=cat_cols, custom_metric=['AUC:hints=skip_train~false'], metric_period=400,
                           od_type='Iter', loss_function="Logloss", learning_rate=params['learning_rate'], 
                               od_wait=params['od_wait'], iterations=params['iterations'],
                              max_depth=params['depth'])
        m.fit(df[df.train == 1].drop(["train", target_col], axis=1), df[df.train == 1][target_col])
        temp_result[i] = np.array(m.predict(df[df.train == 0].drop(["train", target_col], axis=1)), dtype=np.int)
    return temp_result

In [20]:
# x = random_state_models(1)

In [21]:
# preds = np.stack(x)
# preds = [np.argmax(np.bincount(preds[:, i])) for i in range(preds.shape[1])]

In [22]:
preds.shape

(21805,)

In [23]:
preds

array([0., 0., 0., ..., 0., 0., 0.])

In [24]:
result = pd.DataFrame({"ID": df[df.train == 0].ID, "Response": preds.astype(np.int)})

In [25]:
result.head()

Unnamed: 0,ID,Response
50882,50883,0
50883,50884,0
50884,50885,0
50885,50886,0
50886,50887,0


In [26]:
result.to_csv("submission.csv", index=False)

In [27]:
from IPython.display import FileLink

FileLink('submission.csv')


In [28]:
result.Response.value_counts() / result.shape[0] * 100

0    92.286173
1     7.713827
Name: Response, dtype: float64

In [29]:
df[df.train == 1].Response.value_counts() / df[df.train == 1].shape[0] * 100

0.0    76.005267
1.0    23.994733
Name: Response, dtype: float64