# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
import os, sys, joblib, math, time
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from itertools import combinations
import xgboost as xgb
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import operator

# GLOBALS
# Input directory holding the competition CSV files, and the directory that
# receives every artefact this script writes.
LOCAL_ROOT = '/kaggle/input/jha-cross-sell/'
OUT_DIR = '/kaggle/working/'

# Full paths of the three input files, all located under LOCAL_ROOT.
TRAIN_FN, TEST_FN, SUBMISSION_FN = (
    os.path.join(LOCAL_ROOT, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission_iA3afxn.csv')
)


def getCountVar(compute_df, count_df, var_name, count_var):
    """
    Count-encode ``var_name`` for the rows of ``compute_df``.

    compute_df : data frame whose rows receive the encoding
    count_df : data frame the per-category counts are computed from
    var_name : name of the categorical column to encode
    count_var : any other column of ``count_df``; its non-null entries are
        what gets counted within each category

    Returns a list with one value per row of ``compute_df`` (left-merge
    order); categories that do not occur in ``count_df`` get -1.
    """
    # One row per category: the category value and how often it occurs.
    counts = count_df.groupby(var_name, as_index=False)[count_var].count()
    counts.columns = [var_name, "var_count"]

    # A left join keeps every row of compute_df; unmatched rows come back as
    # NaN and are replaced by the -1 sentinel.
    joined = compute_df.merge(counts, how="left", on=var_name).fillna(-1)
    return list(joined["var_count"])


def getDVEncodeVar(compute_df, target_df, var_name, target_var):
    """
    Mean-target ("DV") encode one or more key columns.

    compute_df : data frame whose rows receive the encoding
    target_df : data frame the per-group target means are computed from
    var_name : a column name, or a list of column names, to group by
    target_var : name of the target column in ``target_df``

    Returns a list with one value per row of ``compute_df`` (left-merge
    order): the mean of ``target_var`` for the row's group, or -1 when the
    group does not occur in ``target_df``.
    """
    if not isinstance(var_name, list):
        var_name = [var_name]

    grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index()
    grouped_df.columns = var_name + ["mean_value"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)

    # Fill only the column that is returned: filling the whole merged frame
    # would also rewrite NaNs in unrelated columns of compute_df and fails
    # outright on categorical columns, where -1 is not a valid category.
    return list(merged_df["mean_value"].fillna(-1))


def do_target_encode(train_df, test_df, cols_to_encode, target_col, encode_type, n_splits=3):
    """
    Add out-of-fold encodings of categorical columns to train and test.

    For every column in ``cols_to_encode`` the training rows are split with a
    shuffled ``KFold`` (``random_state=2020``). Each validation fold is
    encoded from the remaining folds only, and the test rows receive the
    average of the ``n_splits`` per-fold encodings.

    train_df : training frame; must contain ``target_col``
    test_df : test frame
    cols_to_encode : iterable of column names to encode
    target_col : name of the target column in ``train_df``
    encode_type : 'dv' (mean of the target, via ``getDVEncodeVar``) or
        'count' (category frequency, via ``getCountVar``)
    n_splits : number of folds

    The new column is named ``<col>_<encode_type>_enc_<target_col>`` and is
    added to both frames in place; the two frames are also returned.

    Raises ValueError for an unknown ``encode_type`` (previously such a value
    silently produced all-zero columns).
    """
    encoders = {'dv': getDVEncodeVar, 'count': getCountVar}
    if encode_type not in encoders:
        raise ValueError(
            "encode_type must be one of {}, got {!r}".format(
                sorted(encoders), encode_type))
    encode = encoders[encode_type]

    kf = KFold(n_splits=n_splits, shuffle=True,
               random_state=2020)
    for col in cols_to_encode:
        # The two-column view and the test keys do not change across folds.
        new_train_df = train_df[[col, target_col]]
        test_keys = test_df[[col]]

        train_enc_values = np.zeros(train_df.shape[0])
        # Float accumulator: starting from the int 0 turned this into an
        # integer array whenever the encoder returned integer counts, and the
        # in-place division below then failed with a casting error.
        test_enc_values = np.zeros(test_df.shape[0])

        for dev_index, val_index in kf.split(train_df):
            dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]
            train_enc_values[val_index] = np.array(
                encode(val_X[[col]], dev_X, col, target_col))
            test_enc_values += np.array(
                encode(test_keys, dev_X, col, target_col))

        test_enc_values /= n_splits
        feat_name = col + "_{}_enc_{}".format(encode_type, target_col)
        train_df[feat_name] = train_enc_values
        test_df[feat_name] = test_enc_values

    return train_df, test_df


def create_feature_map(features):
    """
    Write a feature-map file to ``OUT_DIR/xgb.fmap``.

    features : iterable of feature names, in model column order

    One tab-separated line is written per feature: its position, its name and
    the literal type tag ``q``. Any existing file is overwritten.
    """
    out_fn = os.path.join(OUT_DIR, 'xgb.fmap')
    # The context manager closes the file even if a write fails.
    with open(out_fn, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    """
    Train one XGBoost binary classifier and predict with it.

    train_X, train_y : training features and labels
    test_X : features to predict; used as the early-stopping watch set when
        ``test_y`` is given
    test_y : optional labels for ``test_X``; enables early stopping
        (100 rounds) and the AUC computation
    test_X2 : optional second feature set to predict with the same model
    feature_names : optional list of names; when given, the model dump and a
        normalised feature-importance table are written under ``OUT_DIR``
    seed_val, rounds, dep, eta : seed, number of boosting rounds, maximum
        tree depth and learning rate

    Returns ``(pred_test_y, loss, pred_test_y2)`` where ``loss`` is the ROC
    AUC on ``test_X`` (0 when ``test_y`` is None) and ``pred_test_y2`` is
    None when ``test_X2`` is None.

    NOTE(review): ``silent``, ``ntree_limit`` and ``best_ntree_limit`` look
    tied to older XGBoost releases - confirm against the installed version.
    """
    # Kept as an ordered list of pairs, which is what xgb.train receives.
    booster_params = [
        ("objective", "binary:logistic"),
        ("eval_metric", "auc"),
        ("eta", eta),
        ("subsample", 0.7),
        ("min_child_weight", 1),
        ("colsample_bytree", 0.7),
        ("max_depth", dep),
        ("silent", 1),
        ("seed", seed_val),
    ]

    dtrain = xgb.DMatrix(train_X, label=train_y)
    has_labels = test_y is not None

    if has_labels:
        # Labelled hold-out: watch it and stop early on its AUC.
        dtest = xgb.DMatrix(test_X, label=test_y)
        model = xgb.train(booster_params, dtrain, rounds,
                          [(dtrain, 'train'), (dtest, 'test')],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        dtest = xgb.DMatrix(test_X)
        model = xgb.train(booster_params, dtrain, rounds)

    if feature_names is not None:
        create_feature_map(feature_names)
        fmap_path = os.path.join(OUT_DIR, 'xgb.fmap')
        model.dump_model(os.path.join(OUT_DIR, 'xgbmodel.txt'), fmap_path,
                         with_stats=True)

        # Split counts per feature, largest first, rescaled to sum to 1.
        ranked = sorted(model.get_fscore(fmap=fmap_path).items(),
                        key=lambda pair: pair[1], reverse=True)
        imp_df = pd.DataFrame(ranked, columns=['feature', 'fscore'])
        imp_df['fscore'] = imp_df['fscore'].div(imp_df['fscore'].sum())
        imp_df.to_csv(os.path.join(OUT_DIR, 'imp_feat.txt'), index=False)

    pred_test_y = model.predict(dtest, ntree_limit=model.best_ntree_limit)

    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2),
                                     ntree_limit=model.best_ntree_limit)

    loss = metrics.roc_auc_score(test_y, pred_test_y) if has_labels else 0

    return pred_test_y, loss, pred_test_y2


def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    """
    Train one LightGBM binary classifier and predict with it.

    train_X, train_y : training features and labels
    test_X : features to predict; used as the validation set when ``test_y``
        is given
    test_y : optional labels for ``test_X``; enables early stopping
        (100 rounds) and the AUC computation
    test_X2 : optional second feature set to predict with the same model
    feature_names : accepted for signature parity with ``runXGB``; not used
    seed_val, rounds, dep, eta : seed, number of boosting rounds, maximum
        tree depth and learning rate

    Returns ``(pred_test_y, loss, pred_test_y2)`` where ``loss`` is the ROC
    AUC on ``test_X`` (0 when ``test_y`` is None) and ``pred_test_y2`` is
    None when ``test_X2`` is None.

    NOTE(review): the ``early_stopping_rounds`` / ``verbose_eval`` keyword
    arguments of ``lgb.train`` presumably depend on the LightGBM version -
    confirm against the installed release.
    """
    params = {}
    params["objective"] = "binary"
    params['metric'] = "auc"
    params['seed'] = seed_val
    params["max_depth"] = dep
    params["num_leaves"] = 70
    params["min_data_in_leaf"] = 20
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        # No labelled hold-out: train for the full number of rounds. The
        # former `lgb.DMatrix(test_X)` call was removed - LightGBM has no
        # DMatrix, so this branch raised AttributeError, and its result was
        # never used (prediction takes the raw feature frame).
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X,
                                num_iteration=model.best_iteration)

    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2,
                                     num_iteration=model.best_iteration)
    else:
        pred_test_y2 = None

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2


def trainModel(train_X, train_y, test_X, n_splits, model_name, feats, **params):
    """
    Cross-validate a model and average its test predictions over the folds.

    train_X : training feature frame (indexed positionally with ``.iloc``)
    train_y : training labels, a pandas Series or any array-like
    test_X : feature frame to predict in every fold
    n_splits : number of shuffled ``KFold`` folds (``random_state=2020``)
    model_name : "XGB" (``runXGB``) or "LGB" (``runLGB``)
    feats : feature names, forwarded to ``runXGB`` as ``feature_names``
    params : must provide ``rounds``, ``depth`` and ``eta``

    Returns ``(pred_val_full, auc, pred_test_full, cv_scores)``: the
    out-of-fold predictions, their overall ROC AUC, the fold-averaged test
    predictions and the list of per-fold scores.

    Raises ValueError for an unknown ``model_name`` (previously this surfaced
    as an unbound-variable error inside the first fold).
    """
    if model_name not in ("XGB", "LGB"):
        raise ValueError(
            "model_name must be 'XGB' or 'LGB', got {!r}".format(model_name))

    def take_labels(indices):
        # KFold yields positions, so select labels by position as well;
        # plain `train_y[indices]` is label-based for a Series and breaks
        # (or picks the wrong rows) when its index is not 0..n-1.
        if hasattr(train_y, 'iloc'):
            return train_y.iloc[indices]
        return np.asarray(train_y)[indices]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X):
        dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
        dev_y, val_y = take_labels(dev_index), take_labels(val_index)

        if model_name == "XGB":
            pred_val, acc, pred_test = runXGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'], feature_names=feats)
        else:
            pred_val, acc, pred_test = runLGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'])

        cv_scores.append(acc)
        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test

    pred_test_full /= n_splits
    auc = metrics.roc_auc_score(train_y, pred_val_full)
    return pred_val_full, auc, pred_test_full, cv_scores


def assign_age_buckets(ages, width=10):
    """
    Map each age to the index of its ``width``-year bucket.

    Thresholds start at the minimum age rounded up to a multiple of
    ``width`` and run, in steps of ``width``, up to the first multiple
    strictly greater than the maximum age. The bucket of an age is the index
    of the first threshold it is strictly below, so ages under the first
    threshold get bucket 0.

    ages : array-like of numeric ages without missing values
    width : bucket width

    Returns an integer numpy array with one bucket index per age.
    Raises ValueError when ``ages`` is empty or contains missing values.
    """
    values = np.asarray(ages)
    if values.size == 0 or pd.isnull(values).any():
        raise ValueError("ages must be non-empty and contain no missing values")

    lower = int(math.ceil(values.min() / float(width))) * width
    # floor + 1 guarantees a threshold strictly above the maximum; rounding
    # the maximum up instead left a maximum that is an exact multiple of
    # `width` without any bucket.
    upper = (int(math.floor(values.max() / float(width))) + 1) * width
    thresholds = np.arange(lower, upper + width, width)

    # side='right' counts the thresholds <= age, i.e. the index of the first
    # threshold the age is strictly below.
    return np.searchsorted(thresholds, values, side='right')


if __name__ == '__main__':
    # End-to-end script: read the data, engineer features on the stacked
    # train+test frame, add out-of-fold target encodings, cross-validate an
    # XGBoost model and write the averaged test predictions.
    print('JHA Cross Sell Competition...\n')

    print('read data\n')
    df_train = pd.read_csv(TRAIN_FN)
    df_test = pd.read_csv(TEST_FN)

    print('concat train and test\n')
    df_train['sample'] = 'train'
    df_test['sample'] = 'test'
    df_test['Response'] = None

    # Stack both samples so every feature below is computed on all rows.
    cols = list(df_train.columns)
    df = pd.concat([df_train[cols], df_test[cols]], axis=0)
    df.reset_index(drop=True, inplace=True)

    print('mapping for categorical vars\n')
    cat_vars = ['Gender', 'Driving_License', 'Region_Code',
                'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage',
                'Policy_Sales_Channel']
    le_pipes = []
    for var in cat_vars:
        le = LabelEncoder()
        le.fit(df[var].values)

        df[var] = le.transform(df[var].values)
        le_pipes.append((var, le))

    print('categorize Age\n')
    # The helper validates the input before bucketing, replacing the former
    # null assert that ran only after the integer cast had already succeeded.
    df['Age_bucket'] = assign_age_buckets(df['Age'])
    print(df.groupby('Age_bucket')['Age'].describe())
    cat_vars.append('Age_bucket')
    df['Response'] = df['Response'].astype(float)

    print('Ratio of Annual_Premium and Vintage\n')
    # Rows with Vintage == 0 get the sentinel -1 instead of a division result.
    mask = df['Vintage'] != 0
    df['Annual_Premium_RATIO_Vintage'] = (
        df['Annual_Premium'].div(df['Vintage']).where(mask, -1.0))
    print(df['Annual_Premium_RATIO_Vintage'].describe())

    agg_funcs = ['mean', 'sum', 'min', 'max', 'nunique', 'std']
    rank_methods = ['first', 'average', 'max', 'min']

    print('encoding cat_vars by aggregating numeric vars\n')
    num_vars = ['Age', 'Annual_Premium', 'Vintage', 'Annual_Premium_RATIO_Vintage']
    for cat_var in cat_vars:
        for num_var in num_vars:
            for func in agg_funcs:
                feat_name = '_'.join([func, num_var, 'per', cat_var])
                print(feat_name)
                df[feat_name] = df.groupby([cat_var])[num_var].transform(func)
                print('\n')

    print('count encodings for cat vars\n')
    for cat_var in cat_vars:
        print(cat_var)
        df[cat_var+'_count'] = df.groupby(cat_var)['Age'].transform('count')

    print('encoding cat_vars interactions by aggregating numeric vars\n')
    iter_cat_vars = list(combinations(cat_vars, 2))
    for f1, f2 in iter_cat_vars:
        for num_var in num_vars:
            for func in agg_funcs:
                feat_name = '_'.join([func, num_var, 'per', f1, f2])
                print(feat_name)
                df[feat_name] = df.groupby([f1, f2])[num_var].transform(func)
                print('\n')

    print('rank features\n')
    for col in ['Region_Code', 'Policy_Sales_Channel', 'Age_bucket']:
        for func in rank_methods:
            feat_name = 'rank_{}_{}'.format(col, func)
            if feat_name in df:
                continue
            print(feat_name)
            df[feat_name] = df.groupby(col)[col].rank(method=func, ascending=True)

    print('rank features based on interactions\n')
    for f1, f2 in [('Policy_Sales_Channel', 'Region_Code')]:
        for func in rank_methods:
            feat_name = 'rank_{}_{}_{}'.format(f1, f2, func)
            print(feat_name)
            df[feat_name] = df.groupby([f1, f2])[f1].rank(method=func, ascending=True)

    print('rank features based on interactions and numeric vars aggregation\n')
    for f1, f2 in [('Policy_Sales_Channel', 'Region_Code')]:
        for col in ['Age', 'Annual_Premium', 'Vintage', 'Annual_Premium_RATIO_Vintage']:
            for func in rank_methods:
                feat_name = 'rank_{}_{}_{}_{}'.format(f1, f2, col, func)
                print(feat_name)
                df[feat_name] = df.groupby([f1, f2])[col].rank(method=func, ascending=True)

    print('split df into train and test\n')
    # Build independent frames instead of mutating `.loc` slices in place, so
    # the column assignments further down act on real copies.
    mask = df['sample'] == 'train'
    df_train = df.loc[mask, :].reset_index(drop=True)
    df_test = df.loc[~mask, :].drop('Response', axis=1).reset_index(drop=True)

    print('release memory\n')
    del df

    print('DV encodings\n')
    df_train, df_test = do_target_encode(df_train, df_test, cat_vars, 'Response', 'dv', 3)

    print('shape ', df_train.shape, '\t', df_test.shape)

    print('drop certain columns\n')
    drop_cols = cat_vars + ['sample']
    df_train.drop(drop_cols, axis=1, inplace=True)
    df_test.drop(drop_cols, axis=1, inplace=True)

    print('shape ', df_train.shape, '\t', df_test.shape)

    print('prefix for features\n')
    # Every column except the id and the target gets the prefix; the prefix
    # is what selects the model features below.
    FEAT_PREFIX = 'JHA'
    cols = list(df_train.columns)
    new_cols = [FEAT_PREFIX + '_'+ col if col not in ('id', 'Response') else col for col in cols]
    rename_dct = dict(zip(cols, new_cols))
    df_train.rename(columns=rename_dct, inplace=True)
    df_test.rename(columns=rename_dct, inplace=True)

    df_train['Response'] = df_train['Response'].astype(int)

    print('prepare data for modelling\n')
    feat_cols = [x for x in list(df_train.columns) if x.startswith(FEAT_PREFIX)]
    print('# features: ', len(feat_cols))
    x_train = df_train[feat_cols]
    y_train = df_train['Response']
    x_test = df_test[feat_cols]
    test_ids = df_test['id'].values
    print('shape ', x_train.shape, x_test.shape)

    print('release memory\n')
    del df_train, df_test

    print('modelling begins...\n')
    print('XGB\n')
    params = {'rounds': 500, 'depth': 6, 'eta': 0.05}
    start = time.time()
    pred_val_full, auc, pred_test_full, cv_scores = trainModel(x_train, y_train, x_test, 3, "XGB", feat_cols, **params)
    print('time taken: %0.2f' % (time.time() - start))

    print('cv scores: ', cv_scores)
    print('Final CV AUC: ', auc)

    print('submission\n')
    out_df = pd.DataFrame({"id": test_ids})
    out_df["Response"] = pred_test_full
    out_fn = os.path.join(OUT_DIR, 'pred_test_v4_xgb.csv')
    out_df.to_csv(out_fn, index=False)