In [None]:
import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from tqdm.notebook import tqdm
tqdm.pandas()
from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn import metrics
from lightgbm import LGBMClassifier
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

In [None]:
# Read the competition files and tag every row with its origin
# (train = 1 for the training file, train = 0 for the test file) so the two
# frames can be told apart again after they are stacked.
df = pd.read_csv("../input/jobathon-analytics-vidhya/train.csv").assign(train=1)
tdf = pd.read_csv("../input/jobathon-analytics-vidhya/test.csv").assign(train=0)
df.head()

In [None]:
# Stack the training rows on top of the test rows so feature engineering and
# categorical encoding see both sets at once, then count missing values per column.
# NOTE(review): `df` is rebound to the combined frame, so re-running this cell
# without re-running the load cell appends `tdf` a second time.
df = pd.concat([df, tdf], axis=0)
df.isna().sum()

In [None]:
# df.drop("Region_Code", axis = 1, inplace=True)

In [None]:
# Label column.
target_col = "Response"

# Columns that are label-encoded and passed to CatBoost as `cat_features`.
# The order is kept exactly as originally written.
cat_cols = [
    "City_Code",
    "Region_Code",
    "Accomodation_Type",
    "Reco_Insurance_Type",
    "Is_Spouse",
    "Health Indicator",
    "Holding_Policy_Duration",
    "Holding_Policy_Type",
    "Reco_Policy_Cat",
]

# Numeric columns taken straight from the input files.
reg_cols = [
    "Reco_Policy_Premium",
    "Upper_Age",
    "Lower_Age",
]

# Numeric columns derived during feature expansion.
imputed_reg_cols = [
    "age_diff",
]

In [None]:
def dframe_expand(dframe):
    """Add an ``age_diff`` feature and max-scale the premium and age-gap columns.

    The frame is modified in place and the same object is returned, so both
    ``dframe_expand(df)`` and ``df = dframe_expand(df)`` leave ``df`` expanded.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Must contain the columns ``Upper_Age``, ``Lower_Age`` and
        ``Reco_Policy_Premium``.

    Returns
    -------
    pandas.DataFrame
        The object passed in, with ``age_diff = Upper_Age - Lower_Age`` added
        and ``Reco_Policy_Premium`` / ``age_diff`` divided by their own column
        maximum.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    dframe['age_diff'] = dframe['Upper_Age'] - dframe['Lower_Age']

    for col in ("Reco_Policy_Premium", "age_diff"):
        col_max = dframe[col].max()
        # Only rescale when the maximum is a usable divisor. A maximum of 0
        # (e.g. every row has Upper_Age == Lower_Age) would otherwise turn the
        # whole column into NaN / -inf; a NaN maximum (empty or all-missing
        # column) gives nothing meaningful to scale by.
        if pd.notna(col_max) and col_max != 0:
            dframe[col] = dframe[col] / col_max

    return dframe

# Expand the combined frame. dframe_expand writes into the frame it is given
# and returns that same frame, so this rebinding keeps `df` pointing at it.
df = dframe_expand(df)



In [None]:
# Display the combined frame after feature expansion (the bare last expression
# of a cell is rendered by Jupyter).
df

In [None]:
# Encoder instance for the target; not referenced by the cells visible here.
target_encoder = LabelEncoder()

# One LabelEncoder per categorical column, keyed by column name.
label_encoders = {}


def train_encoder(dframe, col, test=False):
    """Label-encode ``dframe[col]`` in place.

    Missing values are replaced by the string ``"nan"`` and every value is cast
    to ``str`` before encoding.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame whose column is overwritten with the integer codes.
    col : str
        Name of the column to encode; also the key used in ``label_encoders``.
    test : bool, default False
        When False a new encoder is created, stored under ``col`` and fitted on
        the column. When True the encoder already stored under ``col`` is
        reused and only ``transform`` is applied.

    Returns
    -------
    None
    """
    if not test:
        label_encoders[col] = LabelEncoder()
    encoder = label_encoders[col]
    encode = encoder.transform if test else encoder.fit_transform
    dframe[col] = encode(dframe[col].fillna("nan").astype(str))


# Fit one encoder per categorical column on the combined frame.
for col in tqdm(cat_cols):
    train_encoder(df, col)


In [None]:
# Remove, in place, every column that contains no values at all, then list the
# dtype of each remaining column.
df.dropna(axis="columns", how="all", inplace=True)
df.dtypes

In [None]:
# Hyper-parameter candidates handed to CatBoost's grid search. Only
# `n_estimators` is swept; the commented-out lists are the wider sweeps that
# were left disabled in the original cell.
grid = {
    "learning_rate": [0.1],  # [.05, 0.1, .2]
    "max_depth": [14],  # [10, 12, 14]
    "n_estimators": [600, 400, 500, 550],
}

# Base estimator; the grid search fills in the swept parameters.
model = CatBoostClassifier(
    # objective and reported metrics
    loss_function="Logloss",
    eval_metric="AUC",
    custom_metric=["AUC:hints=skip_train~false"],
    metric_period=50,
    # categorical handling
    cat_features=cat_cols,
    # hardware
    task_type="GPU",
    devices="0:1",
    thread_count=2,
    # overfitting-detector settings
    od_type="Iter",
    od_wait=10,
    # reproducibility
    random_state=22,
)

In [None]:
# (rows, columns) of the combined train + test frame.
df.shape

In [None]:
# Run the grid search on the training rows only (train == 1), with 4-fold
# shuffled, stratified cross-validation; refit=True asks CatBoost to refit
# `model` once the search is done.
# NOTE(review): only `train` and the target are dropped, so the `ID` column is
# part of the feature matrix — confirm that is intended.
train_part = df[df.train == 1]
grid_search_result = model.grid_search(
    grid,
    X=train_part.drop(columns=["train", target_col]),
    y=train_part[target_col],
    cv=4,
    shuffle=True,
    stratified=True,
    refit=True,
    verbose=False,
    plot=True,
)

In [None]:
# Show what model.grid_search returned.
grid_search_result

In [None]:
# Predict on the rows flagged train == 0 and pair each prediction with its ID.
# The feature columns match the ones used for fitting (everything except
# `train` and the target).
test_part = df[df.train == 0]
result = pd.DataFrame({
    "ID": test_part.ID,
    "Response": model.predict(test_part.drop(columns=["train", target_col])),
})

In [None]:
# Preview the first rows of the submission frame.
result.head()

In [None]:
# Write the ID / Response pairs to submission.csv in the working directory,
# without the DataFrame index column.
result.to_csv("submission.csv", index=False)

In [None]:
from IPython.display import FileLink

# Display a FileLink for the file written above; as the last expression of the
# cell it is rendered in the notebook output.
FileLink('submission.csv')


In [None]:
# Share (in percent) of each predicted class among the submission rows.
result["Response"].value_counts() / len(result) * 100

In [None]:
# Share (in percent) of each class among the training rows, for comparison with
# the predicted shares above. `df` is the combined frame at this point, so the
# rows flagged train == 0 (the ones being predicted) must be left out of the
# denominator; dividing by df.shape[0] understated every percentage.
df[df.train == 1].Response.value_counts(normalize=True) * 100