In [1]:
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import tensorflow as tf 
import os 

import warnings
warnings.filterwarnings("ignore")

# Single global seed shared by every RNG and every seeded step in the notebook.
SEED = 22_022_022

def random_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator,
    PyTorch (CPU and all CUDA devices) and TensorFlow, and puts cuDNN into
    a deterministic configuration.

    Args:
        SEED: Integer seed applied to every library.
    """
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(SEED)

    random.seed(SEED)
    np.random.seed(SEED)

    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

    torch.backends.cudnn.deterministic = True
    # With benchmarking on, cuDNN autotunes and may select a different
    # algorithm on each run; switch it off so algorithm choice is repeatable.
    torch.backends.cudnn.benchmark = False

    tf.random.set_seed(SEED)

# Seed all libraries once, before any data is loaded or any model is trained.
random_seed(SEED)

In [2]:
# Read the three competition files in one pass: training data, test data and
# the submission template (in that order).
Train, Test, SampleSubmission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jobathonjan2023/train.csv',
        '/kaggle/input/jobathonjan2023/test.csv',
        '/kaggle/input/jobathonjan2023/sample_submission.csv',
    )
)

In [3]:
# Peek at the first two training rows to check the columns that were loaded.
Train.head(2)

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum,64308
1,2,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum,515400


In [4]:
from sklearn.mixture import GaussianMixture

# Fit a 3-component Gaussian mixture on the target alone and store each row's
# component label in 'cltv_gmm'; the label is used later as the
# stratification key for the cross-validation folds.
gm = GaussianMixture(n_components=3, random_state=0)
Train['cltv_gmm'] = gm.fit(Train[['cltv']]).predict(Train[['cltv']])

In [5]:
# Remove the 'id' column from both frames in place (presumably a row
# identifier rather than a feature).
Train.drop(columns=['id'], inplace=True)
Test.drop(columns=['id'], inplace=True)

In [6]:
from sklearn.utils import shuffle

# Shuffle the training rows reproducibly, then renumber them 0..n-1 so the
# index matches row position again.
Train = shuffle(Train, random_state=SEED).reset_index(drop=True)

In [7]:
# Split off the regression target: pop() returns the 'cltv' column and
# removes it from Train in place ('cltv_gmm' stays in Train).
Y = Train.pop('cltv')

In [8]:
# One-hot encode the categorical columns of each frame.
# NOTE(review): the two frames are encoded independently, so their dummy
# columns only line up if both files contain the same category levels.
Train, Test = pd.get_dummies(Train), pd.get_dummies(Test)

In [9]:
# Model inputs: the numeric columns plus every one-hot column, listed
# explicitly so that 'cltv_gmm' (derived from the target) is left out.
X = Train.loc[:, [
    'marital_status', 'vintage', 'claim_amount',
    'gender_Female', 'gender_Male',
    'area_Rural', 'area_Urban',
    'qualification_Bachelor', 'qualification_High School', 'qualification_Others',
    'income_2L-5L', 'income_5L-10L', 'income_<=2L', 'income_More than 10L',
    'num_policies_1', 'num_policies_More than 1',
    'policy_A', 'policy_B', 'policy_C',
    'type_of_policy_Gold', 'type_of_policy_Platinum', 'type_of_policy_Silver',
]]

In [10]:
from xgboost import XGBRegressor,XGBRFRegressor
from catboost import CatBoostRegressor

from lightgbm import LGBMRegressor

from sklearn.model_selection import StratifiedKFold,KFold

# 10-fold cross-validation stratified on the GMM cluster of the target, so
# every fold sees a similar mix of target clusters. Train was already
# shuffled earlier, hence shuffle=False here.
skf = StratifiedKFold(n_splits=10, shuffle=False)

# The model is trained on bare arrays, so features are matched by position.
# Test was one-hot encoded separately from Train: align it to X's columns
# (same set, same order; a dummy column absent from Test becomes all zeros)
# instead of trusting the two encodings to agree. Done once, outside the loop.
Test_features = np.array(Test.reindex(columns=X.columns, fill_value=0))

# One array of Test predictions per fold. The name is historical: the models
# are CatBoost, but later cells refer to this variable by name.
XGB_preds_Test = []

for train_index, test_index in skf.split(X, Train['cltv_gmm']):

    # split() yields positional indices, so select by position on both X and Y
    # rather than relying on Y's labels happening to equal its positions.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    X_train = np.array(X_train)
    X_test = np.array(X_test)

    # The held-out fold doubles as the early-stopping evaluation set.
    model = CatBoostRegressor(iterations=2000, eval_metric='R2')
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=50,
        early_stopping_rounds=500,
    )

    XGB_preds_Test.append(model.predict(Test_features))

Learning rate set to 0.066441
0:	learn: 0.0195086	test: 0.0188311	best: 0.0188311 (0)	total: 72.7ms	remaining: 2m 25s
50:	learn: 0.1627769	test: 0.1602348	best: 0.1602348 (50)	total: 1.59s	remaining: 1m
100:	learn: 0.1668832	test: 0.1617284	best: 0.1617284 (100)	total: 2.19s	remaining: 41.2s
150:	learn: 0.1699749	test: 0.1619670	best: 0.1620678 (139)	total: 2.76s	remaining: 33.8s
200:	learn: 0.1732633	test: 0.1615954	best: 0.1620678 (139)	total: 3.32s	remaining: 29.7s
250:	learn: 0.1760778	test: 0.1604520	best: 0.1620678 (139)	total: 3.88s	remaining: 27s
300:	learn: 0.1785477	test: 0.1595517	best: 0.1620678 (139)	total: 4.44s	remaining: 25.1s
350:	learn: 0.1810027	test: 0.1585266	best: 0.1620678 (139)	total: 5.01s	remaining: 23.5s
400:	learn: 0.1833215	test: 0.1577897	best: 0.1620678 (139)	total: 5.58s	remaining: 22.2s
450:	learn: 0.1856292	test: 0.1574651	best: 0.1620678 (139)	total: 6.17s	remaining: 21.2s
500:	learn: 0.1878863	test: 0.1567837	best: 0.1620678 (139)	total: 6.74s	remain

In [11]:
# Inspect the per-fold Test prediction arrays (CatBoost models, despite the
# 'XGB' in the variable name).
XGB_preds_Test

[array([ 92582.7215196 , 129474.27870223,  94345.04776311, ...,
        109260.41040554, 106894.49150007, 115872.77172309]),
 array([ 93891.70408006, 129205.92005041,  92371.72991224, ...,
        108150.15241256, 109591.63383698, 115483.18819109]),
 array([ 94021.18822875, 129219.67117449,  94275.31873418, ...,
        106522.06671802, 103782.58518292, 114265.64056645]),
 array([ 92784.64118682, 126679.97077822,  94366.02197752, ...,
        107242.35901161, 106360.85653982, 116787.21398236]),
 array([ 93521.29233979, 128662.71418697,  92844.13356578, ...,
        106825.10189379, 105487.29580387, 116044.50179164]),
 array([ 93664.0181137 , 128598.97780271,  93703.80218655, ...,
        109132.12524029, 106985.26427476, 116808.31487539]),
 array([ 94860.55344834, 127956.92732927,  94869.85136849, ...,
        106966.35820916, 105844.79681945, 116536.58880398]),
 array([ 94013.44636004, 129261.16669905,  94244.55979087, ...,
        107705.10266064, 108807.15254743, 115726.67907139]),


In [12]:
# Ensemble the folds: element-wise mean over the list of per-fold prediction
# arrays, giving one value per Test row.
final = np.mean(XGB_preds_Test, axis=0)

In [13]:
# Write the averaged predictions into the submission template.
# NOTE(review): 'final' is a plain array, so values are assigned by position;
# this assumes SampleSubmission rows are in the same order as Test — confirm.
SampleSubmission['cltv'] = final

In [14]:
# Sanity check: the smallest predicted value in the submission.
SampleSubmission['cltv'].min()

42009.53292497842

In [16]:
# Save the submission to the working directory; index=False keeps the row
# index out of the CSV.
SampleSubmission.to_csv('CB_5.csv',index=False)