In [1]:
import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from tqdm.notebook import tqdm
tqdm.pandas()
from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn import metrics
from lightgbm import LGBMClassifier
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.feature_extraction import MinimalFCParameters, ComprehensiveFCParameters
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.utils.class_weight import compute_class_weight

  from pandas import Panel


In [2]:
# Read both competition splits and tag every row with its origin
# (train=1 for the labelled split, train=0 for the test split).
df = pd.read_csv("../input/jobathon-analytics-vidhya/train.csv").assign(train=1)
tdf = pd.read_csv("../input/jobathon-analytics-vidhya/test.csv").assign(train=0)
df.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,train
0,1,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0,1
1,2,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0,1
2,3,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1,1
3,4,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0,1
4,5,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0,1


In [3]:
# Stack the labelled and test rows into a single frame so feature engineering
# and encoding see both, then count missing values per column.
df = pd.concat((df, tdf))
df.isna().sum()

ID                             0
City_Code                      0
Region_Code                    0
Accomodation_Type              0
Reco_Insurance_Type            0
Upper_Age                      0
Lower_Age                      0
Is_Spouse                      0
Health Indicator           16718
Holding_Policy_Duration    28854
Holding_Policy_Type        28854
Reco_Policy_Cat                0
Reco_Policy_Premium            0
Response                   21805
train                          0
dtype: int64

In [4]:
# Inspect the column dtypes of the combined train+test frame.
df.dtypes

ID                           int64
City_Code                   object
Region_Code                  int64
Accomodation_Type           object
Reco_Insurance_Type         object
Upper_Age                    int64
Lower_Age                    int64
Is_Spouse                   object
Health Indicator            object
Holding_Policy_Duration     object
Holding_Policy_Type        float64
Reco_Policy_Cat              int64
Reco_Policy_Premium        float64
Response                   float64
train                        int64
dtype: object

In [5]:
def return_int(x):
    """Convert a ``Holding_Policy_Duration`` entry to an integer.

    Parameters
    ----------
    x : str, float, int or None
        Raw cell value, e.g. ``"3.0"``, ``"14+"`` or NaN.

    Returns
    -------
    int
        ``int(float(x))`` when ``x`` is numeric-looking, 15 for the open-ended
        ``"14+"`` bucket, and 0 for anything else that cannot be converted
        (missing values, other strings).
    """
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects; ValueError: NaN or
        # unparseable strings such as "14+"; OverflowError: infinities.
        # Only conversion failures are handled, so KeyboardInterrupt and
        # other unrelated exceptions still propagate.
        return 15 if x == "14+" else 0
# Preview how the raw duration strings are distributed after integer conversion.
df['Holding_Policy_Duration'].apply(return_int).value_counts(dropna=False)


0     28854
1      6390
15     6227
2      6032
3      5192
4      3976
5      3354
6      2797
7      2309
8      1885
9      1607
10     1146
11      800
13      732
12      709
14      677
Name: Holding_Policy_Duration, dtype: int64

In [6]:
def get_min_per_col_for_cat_col(cat_col, reg_col, cat_val):
    """Return the minimum of ``reg_col`` within one category of the global ``df``.

    Parameters
    ----------
    cat_col : str
        Name of the grouping column in ``df``.
    reg_col : str
        Name of the numeric column to aggregate.
    cat_val : int, float or str
        Category to look up; coerced with ``int(float(cat_val))`` so that
        values such as ``22.0`` or ``"22"`` match an integer group key.

    Returns
    -------
    scalar
        The smallest ``reg_col`` value among rows whose ``cat_col`` equals
        ``cat_val``.

    Raises
    ------
    KeyError
        If no group has the key ``cat_val``.
    """
    cat_val = int(float(cat_val))
    # Select the column before aggregating and look the group up by label;
    # this avoids positional ``[0]`` access on a label-indexed Series.
    return df.groupby(cat_col)[reg_col].min().loc[cat_val]

def get_max_per_col_for_cat_col(cat_col, reg_col, cat_val):
    """Return the maximum of ``reg_col`` within one category of the global ``df``.

    Parameters
    ----------
    cat_col : str
        Name of the grouping column in ``df``.
    reg_col : str
        Name of the numeric column to aggregate.
    cat_val : int, float or str
        Category to look up; coerced with ``int(float(cat_val))`` so that
        values such as ``22.0`` or ``"22"`` match an integer group key.

    Returns
    -------
    scalar
        The largest ``reg_col`` value among rows whose ``cat_col`` equals
        ``cat_val``.

    Raises
    ------
    KeyError
        If no group has the key ``cat_val``.
    """
    cat_val = int(float(cat_val))
    # Select the column before aggregating and look the group up by label;
    # this avoids positional ``[0]`` access on a label-indexed Series.
    return df.groupby(cat_col)[reg_col].max().loc[cat_val]

def get_stand_val_per_cat_col(values):
    """Scale one premium relative to the premiums of its policy category.

    Parameters
    ----------
    values : pandas.Series or sequence
        Two-element row whose first item is the ``Reco_Policy_Cat`` value and
        whose second item is the ``Reco_Policy_Premium`` value (the shape
        produced by ``DataFrame.apply(..., axis=1)`` on those two columns).

    Returns
    -------
    scalar
        ``(premium - category_min) / category_max`` where the category
        statistics come from the global ``df``.

    Raises
    ------
    KeyError
        If the category is not present in ``df``.
    """
    cat_col = "Reco_Policy_Cat"
    reg_col = "Reco_Policy_Premium"
    # A row Series is indexed by column name, so integer keys must go through
    # .iloc; plain sequences (list/tuple) are indexed directly.
    if hasattr(values, "iloc"):
        cat_val, reg_val = values.iloc[0], values.iloc[1]
    else:
        cat_val, reg_val = values[0], values[1]
    # A single groupby yields both bounds for the category.
    bounds = df.groupby(cat_col)[reg_col].agg(["min", "max"]).loc[int(float(cat_val))]
    # NOTE(review): the divisor is the category maximum, not (max - min) —
    # confirm this is the intended scaling.
    return (reg_val - bounds["min"]) / bounds["max"]

# df.groupby("Reco_Policy_Cat").agg({'Reco_Policy_Premium': ['min', 'max']}).iloc[12]
# df[["Reco_Policy_Cat", 'Reco_Policy_Premium']].swifter.apply(get_stand_val_per_cat_col, axis=1)

In [7]:

# Spot check: smallest recommended premium within policy category 1.
# Label-based lookup replaces the deprecated positional ``[0]`` access.
df.groupby("Reco_Policy_Cat")["Reco_Policy_Premium"].min().loc[1]

2280.0

In [8]:
# Response distribution (NaN included) for rows whose accommodation is not 'Owned'.
df.loc[df['Accomodation_Type'] != 'Owned', 'Response'].value_counts(dropna=False)

0.0    17485
NaN     9715
1.0     5446
Name: Response, dtype: int64

In [9]:
# Name of the label column.
target_col = "Response"

# Columns handled as categorical features.
cat_cols = [
    'City_Code',
    'Region_Code',
    'Accomodation_Type',
    'Reco_Insurance_Type',
    'Is_Spouse',
    'Health Indicator',
    'Holding_Policy_Duration',
    'Holding_Policy_Type',
    'Reco_Policy_Cat',
    'is_joint_and_spouse',
]

# Raw numeric columns.
reg_cols = [
    'Reco_Policy_Premium',
    'Upper_Age',
    'Lower_Age',
]

# Derived numeric columns.
imputed_reg_cols = ['age_diff']

In [None]:

def min_max_scale(col):
    """Linearly rescale a numeric Series onto the interval [0, 1].

    Parameters
    ----------
    col : pandas.Series
        Numeric values to rescale; it is not modified.

    Returns
    -------
    pandas.Series
        ``(col - min) / (max - min)``. A constant column (zero range) maps to
        all zeros instead of dividing by zero.
    """
    lowest = col.min()
    span = col.max() - lowest
    shifted = col - lowest
    if span == 0:
        # Every value equals the minimum; return float zeros rather than 0/0.
        return shifted * 1.0
    # Divide by the range (not the maximum) so the largest value maps to 1.
    return shifted / span

def dframe_expand(dframe):
    """Add engineered feature columns to ``dframe`` in place and return it.

    The frame's index is reset, ``Reco_Policy_Premium`` is overwritten with
    itself divided by its maximum, and these columns are added:

    * ``age_diff`` — ``Upper_Age - Lower_Age``
    * ``<col>_scaled`` for ``age_diff``, ``Upper_Age``, ``Lower_Age`` and
      ``Reco_Policy_Premium`` — output of ``min_max_scale``
    * ``is_joint_and_spouse`` — True where ``Is_Spouse == 'Yes'`` and
      ``Reco_Insurance_Type == 'Joint'``
    * ``holding_policy_reg`` / ``holding_policy_reg_scaled`` — per-row integer
      version of ``Holding_Policy_Duration`` (via ``return_int``) and its
      scaled form
    * ``reco_policy_premium_per_cat_scaled`` (and its ``_scaled`` form) —
      ``(premium - category_min) / category_max`` within each
      ``Reco_Policy_Cat`` group

    Parameters
    ----------
    dframe : pandas.DataFrame
        Must contain ``Upper_Age``, ``Lower_Age``, ``Reco_Policy_Premium``,
        ``Is_Spouse``, ``Reco_Insurance_Type``, ``Holding_Policy_Duration``
        and ``Reco_Policy_Cat``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    dframe.reset_index(drop=True, inplace=True)
    dframe['age_diff'] = dframe['Upper_Age'] - dframe['Lower_Age']
    # Overwrites the raw premium with premium / max(premium).
    dframe["Reco_Policy_Premium"] = dframe["Reco_Policy_Premium"] / dframe["Reco_Policy_Premium"].max()

    for col in ("age_diff", "Upper_Age", "Lower_Age", "Reco_Policy_Premium"):
        dframe[f"{col}_scaled"] = min_max_scale(dframe[col])

    dframe["is_joint_and_spouse"] = (dframe["Is_Spouse"] == 'Yes') & (dframe['Reco_Insurance_Type'] == 'Joint')

    # One integer per row. Assigning a value_counts() result here would align
    # on the count index and leave almost every row NaN.
    dframe["holding_policy_reg"] = dframe['Holding_Policy_Duration'].apply(return_int)
    dframe["holding_policy_reg_scaled"] = min_max_scale(dframe["holding_policy_reg"])

    # Vectorised per-category scaling: broadcast each group's min/max back onto
    # its rows instead of re-running a groupby for every single row.
    premium_by_cat = dframe.groupby("Reco_Policy_Cat")["Reco_Policy_Premium"]
    dframe["reco_policy_premium_per_cat_scaled"] = (
        dframe["Reco_Policy_Premium"] - premium_by_cat.transform("min")
    ) / premium_by_cat.transform("max")
    dframe["reco_policy_premium_per_cat_scaled_scaled"] = min_max_scale(dframe["reco_policy_premium_per_cat_scaled"])
    return dframe


# Run feature engineering on the combined train+test frame.
df = dframe_expand(df)


  0%|          | 0/72687 [00:00<?, ?it/s]

In [None]:
# Display the frame after feature engineering.
df

In [None]:
# NOTE(review): target_encoder is not referenced anywhere else in the visible notebook.
target_encoder = LabelEncoder()

# Maps column name -> the LabelEncoder fitted for that column (filled by train_encoder).
label_encoders = {}

def train_encoder(dframe, col, test=False):
    """Label-encode ``dframe[col]`` in place using the shared ``label_encoders`` registry.

    Missing values are replaced by the string ``"nan"`` and every value is
    cast to ``str`` before encoding.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame whose column is overwritten with integer codes.
    col : str
        Column to encode; also the key used in ``label_encoders``.
    test : bool, default False
        When False a new ``LabelEncoder`` is created, stored under ``col`` and
        fitted on the column. When True the encoder already stored under
        ``col`` is reused and only ``transform`` is applied.
    """
    if not test:
        label_encoders[col] = LabelEncoder()
    encoder = label_encoders[col]
    as_text = dframe[col].fillna("nan").astype(str)
    dframe[col] = encoder.transform(as_text) if test else encoder.fit_transform(as_text)
    
# Fit one encoder per categorical column on the combined frame; each column is
# overwritten in place with its integer codes.
for col in tqdm(cat_cols):
    train_encoder(df, col)


In [None]:
# Drop columns in which every value is NaN (in place), then re-check dtypes.
df.dropna(axis=1, how='all', inplace=True)
df.dtypes

In [None]:
# (rows, columns) of the encoded frame.
df.shape

In [None]:
# Balanced class weights computed from the labelled (train == 1) rows only.
classes = np.unique(df.loc[df.train == 1, target_col])
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=df.loc[df.train == 1, target_col],
)
weights, classes

In [None]:
class CatBoost():
    """Small wrapper around ``CatBoostClassifier`` with this notebook's fixed settings.

    The underlying model is configured for GPU training (``task_type="GPU"``,
    ``devices='0:1'``), AUC evaluation, Logloss objective, iteration-based
    overfitting detection, the module-level ``cat_cols`` as categorical
    features and the module-level ``classes``/``weights`` as class weights.
    """

    # Defaults used when ``params`` is omitted or only partially specified.
    # Previously tried alternative:
    # {'depth': 10, 'od_wait': 20, 'iterations': 700, 'learning_rate': 0.1}
    DEFAULT_PARAMS = {'depth': 12, 'od_wait': 20, 'iterations': 640, 'learning_rate': 0.1}

    def __init__(self, random_state=22, params=None):
        """Build the classifier.

        Parameters
        ----------
        random_state : int, default 22
            Seed forwarded to ``CatBoostClassifier``.
        params : dict, optional
            Any of ``depth``, ``od_wait``, ``iterations``, ``learning_rate``.
            Missing keys fall back to ``DEFAULT_PARAMS``.
        """
        # Merge into a fresh dict so neither the caller's dict nor the class
        # defaults are ever shared or mutated.
        params = {**self.DEFAULT_PARAMS, **(params or {})}
        self.model = CatBoostClassifier(random_state = random_state, task_type="GPU", devices='0:1',
                           eval_metric='AUC', thread_count=12,
                           cat_features=cat_cols, custom_metric=['AUC:hints=skip_train~false'], metric_period=500,
                           od_type='Iter', loss_function="Logloss", learning_rate=params['learning_rate'],
                               od_wait=params['od_wait'], iterations=params['iterations'],
                              max_depth=params['depth'], class_weights=dict(zip(classes, weights)))

    def train(self, X, y):
        """Fit the wrapped model on features ``X`` and labels ``y``."""
        self.model.fit(X, y)

    def predict(self, X_test):
        """Return the model's predictions for ``X_test`` as an integer NumPy array."""
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # dtype it used to alias.
        return np.array(self.model.predict(X_test), dtype=int)

In [None]:
# grid = {'max_depth': [12],'od_wait': [20],'iterations': [640],'learning_rate': [0.1]}
# model = CatBoostClassifier(random_state = 2, task_type="GPU", devices='0:1', 
#                            eval_metric='AUC', thread_count=6, 
#                            cat_features=cat_cols, custom_metric=['AUC:hints=skip_train~false'], metric_period=400,
#                            od_type='Iter', loss_function="Logloss", class_weights=dict(zip(classes, weights)))
# grid_search_result = model.grid_search(grid, 
#                                        X=df[df.train == 1].drop(["train", target_col], axis=1) , 
#                                        y=df[df.train == 1][target_col], cv=4, 
#                                        shuffle=True, stratified=True, verbose=False, plot=True, 
#                                        refit=True)
# # preds = np.array(model.predict(df[df.train == 0].drop(["train", target_col], axis=1)), dtype=np.int)

In [None]:
# grid_search_result

In [None]:
def random_state_models(size):
    """Train ``size`` CatBoost wrappers that differ only in seed and collect their test predictions.

    Seeds are ``0 .. size - 1``. Each model is fitted on the ``train == 1``
    rows of the global ``df`` (without the ``train`` and target columns) and
    used to predict the ``train == 0`` rows.

    Parameters
    ----------
    size : int
        Number of models to train.

    Returns
    -------
    list
        One prediction array per seed, in seed order.
    """
    dropped = ["train", target_col]
    labelled = df[df.train == 1]
    X = labelled.drop(dropped, axis=1)
    y = labelled[target_col]
    X_test = df[df.train == 0].drop(dropped, axis=1)

    per_seed_predictions = []
    for seed in tqdm(range(size)):
        wrapper = CatBoost(random_state=seed)
        wrapper.train(X, y)
        per_seed_predictions.append(wrapper.predict(X_test))
    return per_seed_predictions

In [None]:
# Train 20 differently-seeded models; x holds one prediction array per seed.
x = random_state_models(20)

In [None]:
# Majority vote: stack the per-seed predictions (one row per model) and keep,
# for every test row, the most frequent label. argmax returns the first
# maximum, so ties resolve to the smallest label.
preds = np.stack(x)
preds = [np.bincount(votes).argmax() for votes in preds.T]

In [None]:
# Pair each test-row ID with its voted label.
result = pd.DataFrame({"ID": df.loc[df.train == 0, "ID"], "Response": preds})

In [None]:
# Preview the first rows of the submission frame.
result.head()

In [None]:
# Write the submission to the working directory without the index column.
result.to_csv("submission.csv", index=False)

In [None]:
from IPython.display import FileLink

# Show a link to the written submission file in the cell output.
FileLink('submission.csv')


In [None]:
# Percentage of predicted rows per Response class.
result.Response.value_counts() / result.shape[0] * 100

In [None]:
# Percentage of labelled (train == 1) rows per Response class, for comparison
# with the predicted distribution.
df.loc[df.train == 1, "Response"].value_counts() / (df.train == 1).sum() * 100