<a href="https://colab.research.google.com/github/vyome9248/Analytics-Vidhya-Jobathon-Rank-9-Private-Leaderboard/blob/main/Analytics_Vidhya_Submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Prerequisite Libraries**



In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [None]:
# Name of the label column; used to pull the target out of the combined frame.
TARGET = 'Is_Lead'
# Seed handed to set_seed() and to the StratifiedKFold splitters below.
SEED = 2021


In [None]:
def set_seed(seed=42):
    """Seed the stdlib and NumPy global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value applied to ``random``, ``numpy.random`` and, as a string,
            to the ``PYTHONHASHSEED`` environment variable.
    """
    # PYTHONHASHSEED is read at interpreter start-up, so exporting it here
    # only influences processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
set_seed(SEED)

In [None]:
# Folder that holds the competition CSVs. The default assumes Google Drive is
# already mounted at /content/drive; change this one line to run elsewhere.
DATA_DIR = '/content/drive/MyDrive'

train = pd.read_csv(os.path.join(DATA_DIR, 'train_s3TEQDk.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test_mSzZ8RL.csv'))
# Sample submission; its ID column is reused when building prediction frames.
example = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission_eyYijxG.csv'))

In [None]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index so
# that the encoders and the scaler below are fitted on both sets at once.
combined = pd.concat([train, test], ignore_index=True)
combined

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0.0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0.0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0.0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0.0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0.0
...,...,...,...,...,...,...,...,...,...,...,...
351032,DBENJOYI,Male,52,RG268,Salaried,X2,86,Yes,4242558,Yes,
351033,CWQ72DWS,Male,55,RG277,Other,X2,86,Yes,1159153,No,
351034,HDESC8GU,Male,35,RG254,Salaried,X4,15,No,1703727,No,
351035,2PW4SFCA,Male,53,RG254,Other,X3,93,No,737178,Yes,


In [None]:
# Treat a missing Credit_Product as a category of its own ('Voila') instead of
# imputing it; it later shows up as the Credit_Product_Voila dummy column.
combined['Credit_Product'] = combined['Credit_Product'].fillna('Voila')
combined

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0.0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0.0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0.0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0.0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0.0
...,...,...,...,...,...,...,...,...,...,...,...
351032,DBENJOYI,Male,52,RG268,Salaried,X2,86,Yes,4242558,Yes,
351033,CWQ72DWS,Male,55,RG277,Other,X2,86,Yes,1159153,No,
351034,HDESC8GU,Male,35,RG254,Salaried,X4,15,No,1703727,No,
351035,2PW4SFCA,Male,53,RG254,Other,X3,93,No,737178,Yes,


In [None]:
# ID is a row identifier, not a feature. Rebinding (instead of inplace=True) and
# errors='ignore' make this cell safe to run more than once.
combined = combined.drop(columns=['ID'], errors='ignore')

In [None]:
# Columns turned into integer codes with LabelEncoder in the next cell.
label_cols = ['Region_Code','Gender']
# Columns expanded into one indicator column per category with pd.get_dummies.
onehot_cols = ['Occupation', 'Credit_Product','Is_Active','Channel_Code']
# Columns standardised with StandardScaler.
numerical_cols = ['Age', 'Vintage', 'Avg_Account_Balance']


In [None]:
def label_encoder(c):
    """Return column ``c`` as integer class codes from a freshly fitted LabelEncoder."""
    le = LabelEncoder()
    return le.fit_transform(c)

scaler = StandardScaler()

# dtype=int keeps the indicator columns numeric on every pandas version: from
# pandas 2.0 get_dummies defaults to bool, and a frame mixing bool with numeric
# columns becomes an object array when `.values` is taken further down.
onehot_encoded_df = pd.get_dummies(combined[onehot_cols], dtype=int)
label_encoded_df = combined[label_cols].apply(label_encoder)
# The scaler is fitted on train and test rows together (combined holds both).
numerical_df = pd.DataFrame(scaler.fit_transform(combined[numerical_cols]), columns=numerical_cols)
target_df = combined[TARGET]

# Re-assemble the model matrix: scaled numerics, label codes, dummies, target.
combined = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df, target_df], axis=1)
combined

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
0,1.963311,-0.121384,-0.098541,18,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0.0
1,-0.933890,-0.461633,-0.639654,27,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
2,0.817906,-0.647223,0.413296,18,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0.0
3,-0.664383,-0.863745,-0.769806,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
4,-0.933890,-0.430701,-0.283976,32,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351032,0.548399,1.208681,3.631964,18,1,0,0,1,0,0,0,1,0,1,0,1,0,0,
351033,0.750529,1.208681,0.033855,27,1,0,1,0,0,0,0,1,1,0,0,1,0,0,
351034,-0.597006,-0.987472,0.669333,4,1,0,0,1,0,1,0,0,1,0,0,0,0,1,
351035,0.615775,1.425203,-0.458559,4,1,0,1,0,0,1,0,0,0,1,0,0,1,0,


In [None]:
# Target column of the raw training data as a one-column DataFrame.
# NOTE(review): train_target is reassigned from train_new.Is_Lead a few cells
# below before any model uses it, so this cell looks like inspection only.
train_target = pd.DataFrame(train.Is_Lead)
train_target

Unnamed: 0,Is_Lead
0,0
1,0
2,0
3,0
4,0
...,...
245720,0
245721,0
245722,0
245723,0


In [None]:
# `combined` was built as train followed by test, so the first len(train) rows
# are the training rows. Using len(train) avoids a hard-coded row count, and
# .copy() gives an independent frame that later cells can modify freely.
train_new = combined.iloc[:len(train)].copy()
train_new

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
0,1.963311,-0.121384,-0.098541,18,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0.0
1,-0.933890,-0.461633,-0.639654,27,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
2,0.817906,-0.647223,0.413296,18,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0.0
3,-0.664383,-0.863745,-0.769806,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
4,-0.933890,-0.430701,-0.283976,32,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245720,0.481022,1.920111,0.928227,34,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0.0
245721,-1.136021,-0.987472,-0.311790,18,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0.0
245722,-1.203397,-1.049336,-0.536181,31,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
245723,-1.068644,-0.492564,-0.843264,23,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0


In [None]:
# Everything after the first len(train) rows of `combined` is the test set.
# Using len(train) avoids a hard-coded row count, and .copy() gives an
# independent frame that later cells can modify freely.
test_new = combined.iloc[len(train):].copy()
test_new

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
245725,-1.001267,-0.678155,-0.452505,4,1,0,1,0,0,0,0,1,1,0,1,0,0,0,
245726,-0.057992,0.064207,-0.238757,18,1,0,1,0,0,0,1,0,1,0,0,1,0,0,
245727,-0.866513,-1.018404,-1.066795,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,
245728,-1.001267,-0.430701,-0.305817,22,1,0,1,0,0,1,0,0,1,0,1,0,0,0,
245729,-1.001267,-0.863745,-0.552019,20,0,0,1,0,0,1,0,0,1,0,1,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351032,0.548399,1.208681,3.631964,18,1,0,0,1,0,0,0,1,0,1,0,1,0,0,
351033,0.750529,1.208681,0.033855,27,1,0,1,0,0,0,0,1,1,0,0,1,0,0,
351034,-0.597006,-0.987472,0.669333,4,1,0,0,1,0,1,0,0,1,0,0,0,0,1,
351035,0.615775,1.425203,-0.458559,4,1,0,1,0,0,1,0,0,0,1,0,0,1,0,


In [None]:
# Separate the label from the features. The test rows only carry an empty
# Is_Lead column, which is removed as well.
train_target = train_new['Is_Lead']
train_new = train_new.drop(columns=['Is_Lead'])
test_new = test_new.drop(columns=['Is_Lead'])

In [None]:
# Plain NumPy views of the features and target; the Optuna objectives index
# these arrays with the fold indices produced by StratifiedKFold.
train_new_val = train_new.to_numpy()
train_target_val = train_target.to_numpy()

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/2b/21/d13081805e1e1afc71f5bb743ece324c8bd576237c51b899ecb38a717502/optuna-2.7.0-py3-none-any.whl (293kB)
[K     |█▏                              | 10kB 18.8MB/s eta 0:00:01[K     |██▎                             | 20kB 24.5MB/s eta 0:00:01[K     |███▍                            | 30kB 30.0MB/s eta 0:00:01[K     |████▌                           | 40kB 20.8MB/s eta 0:00:01[K     |█████▋                          | 51kB 14.7MB/s eta 0:00:01[K     |██████▊                         | 61kB 12.0MB/s eta 0:00:01[K     |███████▉                        | 71kB 13.2MB/s eta 0:00:01[K     |█████████                       | 81kB 12.5MB/s eta 0:00:01[K     |██████████                      | 92kB 11.8MB/s eta 0:00:01[K     |███████████▏                    | 102kB 12.6MB/s eta 0:00:01[K     |████████████▎                   | 112kB 12.6MB/s eta 0:00:01[K     |█████████████▍                  | 122kB 12.6MB

In [None]:
import optuna
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

**All of the steps above are common to all 8 models. From here on, the only differences are the Optuna objective used to tune each model and the training set: I add pseudo-labels at different confidence levels, so the training set varies between models.**

**Pseudo-labelling - Suppose I use a LightGBM model to predict probabilities for the test set. I then keep only the rows the model is at least 99% sure about, i.e. rows whose predicted probability of becoming a lead is >= 0.99 or <= 0.01, and turn those probabilities into hard labels (>= 0.99 becomes 1, <= 0.01 becomes 0). These test rows, with their predicted labels, are added back to the training set. I then tune the hyperparameters again and predict on the test set.**

**Simple Hyperparameter tuning for LightGBM using optuna**

In [None]:
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=SEED)
def objective(trial, cv_fold_func=np.average):
    """Optuna objective: cross-validated ROC AUC of a LightGBM classifier.

    Reads the module-level arrays ``train_new_val`` / ``train_target_val`` and
    the module-level splitter ``skf`` at call time.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        cv_fold_func: Reducer applied to the list of per-fold AUCs
            (defaults to ``np.average``).

    Returns:
        ``cv_fold_func`` applied to the per-fold AUC values.
    """
    # 'missing' is an XGBoost option and is not a LightGBM parameter, so it is
    # not passed here. suggest_float matches the CatBoost objective further down.
    params = {
        'metric': 'binary_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 300),
        'subsample': trial.suggest_float('subsample', 0.50, 0.90),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 0.90),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'max_bin': trial.suggest_int('max_bin', 10, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 9e-5),
        'num_leaves': trial.suggest_int('num_leaves', 5, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 9e-2),
    }
    cgs = lgb.LGBMClassifier(**params)

    # Fit on each training fold and score the held-out fold.
    aucs = []
    for train_idx, valid_idx in skf.split(train_new_val, train_target_val):
        cgs.fit(train_new_val[train_idx, :], train_target_val[train_idx])
        preds = cgs.predict_proba(train_new_val[valid_idx, :])
        aucs.append(roc_auc_score(train_target_val[valid_idx], preds[:, 1]))

    print(f'Trial done: AUC value: {aucs}')
    return cv_fold_func(aucs)

In [None]:
%%time

# Set to False to skip the (slow) hyper-parameter search.
FIT_LGBM = True

n_trials = 60

if FIT_LGBM:
    # Seeding the sampler makes the sequence of suggested parameters
    # repeatable between runs; TPE is Optuna's default sampler.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
# First LightGBM model, trained on the full training set with hard-coded
# hyper-parameters.
# NOTE(review): min_child_samples=396 lies outside the 5-300 range searched by
# the objective above, so these values presumably come from a different tuning
# run - confirm.
model = lgb.LGBMClassifier(
    n_estimators=912,
    max_depth=9,
    learning_rate=0.017793345699819028,
    min_child_samples=396,
    subsample=0.7200339514243949,
    colsample_bytree=0.5837039986354365,
    subsample_freq=10,
    max_bin=156,
    reg_alpha=4.91112261107858e-05,
    num_leaves=64,
    reg_lambda=0.023905735041184064,
)
lgb_1 = model.fit(train_new, train_target)

In [None]:
# Placeholder for the submission frame assembled in the next cell.
prediction = pd.DataFrame()
# Class probabilities for the test rows; column 1 is used below as the
# predicted probability of Is_Lead == 1.
prediction1 = lgb_1.predict_proba(test_new)
prediction1

array([[0.96207861, 0.03792139],
       [0.13360371, 0.86639629],
       [0.94427332, 0.05572668],
       ...,
       [0.91187107, 0.08812893],
       [0.76427599, 0.23572401],
       [0.95320471, 0.04679529]])

In [None]:
# Submission frame: IDs from the sample submission paired with the
# positive-class probability (column 1 of predict_proba).
prediction = pd.DataFrame({'ID': example.ID, 'Is_Lead': prediction1[:, 1]})
prediction

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.037921
1,CCMEWNKY,0.866396
2,VK3KGA9M,0.055727
3,TT8RPZVC,0.024655
4,SHQZEYTZ,0.024234
...,...,...
105307,DBENJOYI,0.988240
105308,CWQ72DWS,0.618402
105309,HDESC8GU,0.088129
105310,2PW4SFCA,0.235724


**I am loading my first csv, with full hyperparameter tuning**

In [None]:
# Previously saved submission read back from Drive (not regenerated here).
first_csv = pd.read_csv('/content/drive/MyDrive/day2_second_submission_AV.csv')

**First LightGBM Model is complete**

**Now we will start pseudo labelling**

In [None]:
# NOTE(review): plain assignment does not copy, so test_1 and test_new are the
# same DataFrame and the next line adds Is_Lead to test_new as well. A later
# cell drops that column from test_new again before predicting.
test_1 = test_new
# Attach the positive-class probability as a provisional Is_Lead value.
test_1['Is_Lead'] = prediction1[:,1]
test_1

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
245725,-1.001267,-0.678155,-0.452505,4,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0.037921
245726,-0.057992,0.064207,-0.238757,18,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0.866396
245727,-0.866513,-1.018404,-1.066795,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.055727
245728,-1.001267,-0.430701,-0.305817,22,1,0,1,0,0,1,0,0,1,0,1,0,0,0,0.024655
245729,-1.001267,-0.863745,-0.552019,20,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0.024234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351032,0.548399,1.208681,3.631964,18,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0.988240
351033,0.750529,1.208681,0.033855,27,1,0,1,0,0,0,0,1,1,0,0,1,0,0,0.618402
351034,-0.597006,-0.987472,0.669333,4,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0.088129
351035,0.615775,1.425203,-0.458559,4,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0.235724


**99% sure predicted labels, converted to 1 and 0**

In [None]:
# Keep only the test rows the model is at least 99% sure about, in either
# direction, then turn their probabilities into hard 0/1 pseudo-labels.
confident_rows = (test_1['Is_Lead'] <= 0.01) | (test_1['Is_Lead'] >= 0.99)
test_2 = test_1.loc[confident_rows].copy()
test_2['Is_Lead'] = (test_2['Is_Lead'] >= 0.5).astype(float)
test_2

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
245733,-0.057992,1.054022,-0.150427,34,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
245942,1.289543,1.981974,0.665635,33,1,0,0,1,0,0,0,1,1,0,0,0,1,0,1.0
245981,1.222166,1.208681,-0.903657,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1.0
246006,1.289543,1.394271,0.911962,18,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
246054,1.356920,-0.616291,-0.306191,13,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350471,1.289543,1.981974,-0.275688,33,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
350678,0.481022,1.920111,-0.202823,31,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
350715,1.222166,0.806568,-0.287871,18,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1.0
350769,1.289543,1.425203,0.271656,30,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1.0


In [None]:
# Number of pseudo-labelled rows per class.
test_2.Is_Lead.value_counts()

1.0    1244
0.0       2
Name: Is_Lead, dtype: int64

In [None]:
# Put the label back next to the features so that the pseudo-labelled test rows
# (which carry an Is_Lead column) can be appended row-wise in the next step.
# Not idempotent: running this cell twice adds a second Is_Lead column.
train_new = pd.concat([train_new,train_target],axis=1)
train_new

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
0,1.963311,-0.121384,-0.098541,18,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0.0
1,-0.933890,-0.461633,-0.639654,27,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
2,0.817906,-0.647223,0.413296,18,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0.0
3,-0.664383,-0.863745,-0.769806,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
4,-0.933890,-0.430701,-0.283976,32,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245720,0.481022,1.920111,0.928227,34,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0.0
245721,-1.136021,-0.987472,-0.311790,18,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0.0
245722,-1.203397,-1.049336,-0.536181,31,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
245723,-1.068644,-0.492564,-0.843264,23,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0


**Joining the 99% sure labels with our training set**

In [None]:
# Append the pseudo-labelled test rows below the training rows. The index is
# not reset, so the appended rows keep their index labels from test_2.
train_new = pd.concat([train_new,test_2])
train_new

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
0,1.963311,-0.121384,-0.098541,18,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0.0
1,-0.933890,-0.461633,-0.639654,27,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
2,0.817906,-0.647223,0.413296,18,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0.0
3,-0.664383,-0.863745,-0.769806,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
4,-0.933890,-0.430701,-0.283976,32,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350471,1.289543,1.981974,-0.275688,33,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
350678,0.481022,1.920111,-0.202823,31,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
350715,1.222166,0.806568,-0.287871,18,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1.0
350769,1.289543,1.425203,0.271656,30,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1.0


**Adding updated target to our train_target**

In [None]:
# Split the enlarged training frame back into target and features.
train_target = train_new['Is_Lead']
train_new = train_new.drop(columns=['Is_Lead'])
train_target

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
350471    1.0
350678    1.0
350715    1.0
350769    1.0
350985    1.0
Name: Is_Lead, Length: 246971, dtype: float64

**Now on this updated training set,we will run optuna again**

In [None]:
# Refresh the NumPy arrays read by the Optuna objective so that they include
# the pseudo-labelled rows.
train_new_val = train_new.to_numpy()
train_target_val = train_target.to_numpy()

In [None]:
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=SEED)
def objective(trial, cv_fold_func=np.average):
    """Optuna objective: cross-validated ROC AUC of a LightGBM classifier.

    Reads the module-level arrays ``train_new_val`` / ``train_target_val``
    (at this point including the pseudo-labelled rows) and the module-level
    splitter ``skf`` at call time.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        cv_fold_func: Reducer applied to the list of per-fold AUCs
            (defaults to ``np.average``).

    Returns:
        ``cv_fold_func`` applied to the per-fold AUC values.
    """
    # 'missing' is an XGBoost option and is not a LightGBM parameter, so it is
    # not passed here. suggest_float matches the CatBoost objective further down.
    params = {
        'metric': 'binary_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 300),
        'subsample': trial.suggest_float('subsample', 0.50, 0.90),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 0.90),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'max_bin': trial.suggest_int('max_bin', 10, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 9e-5),
        'num_leaves': trial.suggest_int('num_leaves', 5, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 9e-2),
    }
    cgs = lgb.LGBMClassifier(**params)

    # Fit on each training fold and score the held-out fold.
    aucs = []
    for train_idx, valid_idx in skf.split(train_new_val, train_target_val):
        cgs.fit(train_new_val[train_idx, :], train_target_val[train_idx])
        preds = cgs.predict_proba(train_new_val[valid_idx, :])
        aucs.append(roc_auc_score(train_target_val[valid_idx], preds[:, 1]))

    print(f'Trial done: AUC value: {aucs}')
    return cv_fold_func(aucs)

In [None]:
%%time

# Set to False to skip the (slow) hyper-parameter search.
FIT_LGBM = True

n_trials = 60

if FIT_LGBM:
    # Seeding the sampler makes the sequence of suggested parameters
    # repeatable between runs; TPE is Optuna's default sampler.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
# Second LightGBM model: hard-coded hyper-parameters, trained on the training
# set extended with the 99% pseudo-labels.
model = lgb.LGBMClassifier(
    n_estimators=382,
    max_depth=8,
    learning_rate=0.011634084771881957,
    min_child_samples=181,
    subsample=0.5162644029759852,
    colsample_bytree=0.8205143851206681,
    subsample_freq=3,
    max_bin=221,
    reg_alpha=6.264466365286548e-05,
    num_leaves=71,
    reg_lambda=0.06980622656575862,
)
lgb_2_pseudo = model.fit(train_new, train_target)

In [None]:
# Remove the provisional Is_Lead column that the pseudo-labelling step added to
# test_new, so the frame holds features only again. errors='ignore' keeps the
# cell re-runnable when the column is already gone.
test_new.drop(columns=['Is_Lead'], inplace=True, errors='ignore')

In [None]:
# NOTE(review): `prediction` is reset to an empty frame here but is not filled
# again in the cells that follow.
prediction = pd.DataFrame()
# Class probabilities from the pseudo-label model for the test rows.
prediction1 = lgb_2_pseudo.predict_proba(test_new)
prediction1

array([[0.96011824, 0.03988176],
       [0.13885466, 0.86114534],
       [0.9465822 , 0.0534178 ],
       ...,
       [0.90511649, 0.09488351],
       [0.79087199, 0.20912801],
       [0.9466282 , 0.0533718 ]])

**Now I am loading my second CSV; the code above only demonstrates how I produced it.**

In [None]:
# Previously saved submission read back from Drive (not regenerated here).
second_csv = pd.read_csv('/content/drive/MyDrive/day2_third_submission_AV.csv')

In [None]:
# To get the third, fourth and fifth CSVs, go back to the start of the
# notebook, run it up to the first model, and change the pseudo-label
# thresholds (for the 90% run use 0.90 and 0.10). Get the predictions, then
# run again from the start for 97%, and so on.


## third csv is with 90% pseudo predicted labels
third_csv = pd.read_csv('/content/drive/MyDrive/day2_eight_submission_AV.csv')

## fourth csv is with 97% pseudo predicted labels
fourth_csv = pd.read_csv('/content/drive/MyDrive/day2_ninth_submission_AV.csv')

## fifth csv is with 98% pseudo predicted labels
fifth_csv = pd.read_csv('/content/drive/MyDrive/day2_tenth_submission_AV.csv')


**Now we are carrying forward with 99% lgbm pseudo labels**

**We will use catboost model to predict on our training set with 99% lgbm pseudo labels**

In [None]:
pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 72kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


In [None]:
import catboost as cb

In [None]:
def objective(trial):
    """Optuna objective: hold-out ROC AUC of a CatBoost classifier.

    Reads the module-level arrays ``train_new_val`` / ``train_target_val`` at
    call time and scores the model on a 20% hold-out split.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The ROC AUC on the hold-out split, as a float.
    """
    params = {
        "objective": trial.suggest_categorical("objective", ["Logloss"]),

        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli","MVS"]),
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 300),
        'colsample_bylevel' : trial.suggest_float('colsample_bylevel',0.0,0.99),
        # Silence CatBoost's per-iteration log so the trial summaries stay readable.
        'verbose': 0,
    }
    # These two options are only sampled for the bootstrap type they are paired with.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    clb = cb.CatBoostClassifier(**params)

    # A fixed, stratified split means every trial is scored on the same
    # hold-out rows, so trial values are comparable and repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        train_new_val,
        train_target_val,
        test_size=0.2,
        shuffle=True,
        random_state=SEED,
        stratify=train_target_val,
    )

    clb.fit(X_train, y_train)
    preds = clb.predict_proba(X_test)
    auc = roc_auc_score(y_test, preds[:, 1])

    print(f'Trial done: AUC value: {[auc]}')
    # A single-objective study expects one number, not a list.
    return float(auc)

In [None]:
%%time

# Set to False to skip the (slow) hyper-parameter search.
FIT_CB = True

n_trials = 60

if FIT_CB:
    # Seeding the sampler makes the sequence of suggested parameters
    # repeatable between runs; TPE is Optuna's default sampler.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
# CatBoost model with hard-coded hyper-parameters, trained on the training set
# that includes the 99% LightGBM pseudo-labels.
model = cb.CatBoostClassifier(
    objective='Logloss',
    colsample_bylevel=0.09016403023640955,
    depth=11,
    boosting_type='Plain',
    bootstrap_type='MVS',
    n_estimators=507,
    learning_rate=0.09663333628502627,
    min_child_samples=73,
)

cat_1 = model.fit(train_new, train_target)

In [None]:
# Placeholder for the CatBoost submission frame assembled in the next cell.
prediction_cat = pd.DataFrame()
# Class probabilities for the test rows; column 1 is used below as the
# predicted probability of Is_Lead == 1.
prediction2 = cat_1.predict_proba(test_new)
prediction2

array([[0.95440243, 0.04559757],
       [0.14814927, 0.85185073],
       [0.94754711, 0.05245289],
       ...,
       [0.94531168, 0.05468832],
       [0.78279945, 0.21720055],
       [0.94719806, 0.05280194]])

In [None]:
# CatBoost submission frame: IDs from the sample submission paired with the
# positive-class probability (column 1 of predict_proba).
prediction_cat = pd.DataFrame({'ID': example.ID, 'Is_Lead': prediction2[:, 1]})
prediction_cat

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.045598
1,CCMEWNKY,0.851851
2,VK3KGA9M,0.052453
3,TT8RPZVC,0.023649
4,SHQZEYTZ,0.022406
...,...,...
105307,DBENJOYI,0.977066
105308,CWQ72DWS,0.577261
105309,HDESC8GU,0.054688
105310,2PW4SFCA,0.217201


**Now i am loading my catboost predictions csv**

In [None]:
# Previously saved CatBoost submission read back from Drive (not regenerated here).
sixth_csv = pd.read_csv('/content/drive/MyDrive/day2_fifth_submission_AV.csv')

**Sixth Model is complete**

**Now we start again from the beginning, with no pseudo-labels added. We create 99% pseudo-labels with XGBoost and then use XGBoost again to make the final prediction.**

**Before running the following code, I went back up and made sure that we are again using the original train and test values given to us.**

In [None]:
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=SEED)
def objective(trial, cv_fold_func=np.average):
    """Optuna objective: cross-validated ROC AUC of an XGBoost classifier.

    Reads the module-level arrays ``train_new_val`` / ``train_target_val`` and
    the module-level splitter ``skf`` at call time.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        cv_fold_func: Reducer applied to the list of per-fold AUCs
            (defaults to ``np.average``).

    Returns:
        ``cv_fold_func`` applied to the per-fold AUC values.
    """
    # Only options that XGBoost understands are searched. min_child_samples,
    # subsample_freq and num_leaves are LightGBM names that XGBoost does not
    # use, so sampling them only added dimensions with no effect on the model.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.10),
        'subsample': trial.suggest_float('subsample', 0.50, 0.90),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 0.90),
        'max_bin': trial.suggest_int('max_bin', 10, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 9e-5),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 9e-2),
        'missing': -999,
        # Requires a GPU runtime.
        'tree_method' : 'gpu_hist'
    }
    clx = xgb.XGBClassifier(**params)

    # Fit on each training fold and score the held-out fold.
    aucs = []
    for train_idx, valid_idx in skf.split(train_new_val, train_target_val):
        clx.fit(train_new_val[train_idx, :], train_target_val[train_idx])
        preds = clx.predict_proba(train_new_val[valid_idx, :])
        aucs.append(roc_auc_score(train_target_val[valid_idx], preds[:, 1]))

    print(f'Trial done: AUC value: {aucs}')
    return cv_fold_func(aucs)

In [None]:
%%time

# Set to False to skip the (slow) hyper-parameter search.
FIT_XGB = True

n_trials = 60

if FIT_XGB:
    # Seeding the sampler makes the sequence of suggested parameters
    # repeatable between runs; TPE is Optuna's default sampler.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
# First XGBoost model with hard-coded hyper-parameters.
# NOTE(review): min_child_samples, subsample_freq and num_leaves are LightGBM
# parameter names and are presumably ignored by XGBoost; gamma=5 is not part of
# the search space of the objective above - confirm where it came from.
model = xgb.XGBClassifier(
    n_estimators=913,
    max_depth=2,
    learning_rate=0.08073138222606017,
    min_child_samples=114,
    subsample=0.8590479782154431,
    colsample_bytree=0.5613215969527616,
    subsample_freq=4,
    max_bin=319,
    reg_alpha=1.8067918905454266e-05,
    num_leaves=7,
    reg_lambda=0.05639147938730859,
    gamma=5,
)
xgb_1 = model.fit(train_new, train_target)

In [None]:
# Placeholder for the XGBoost submission frame assembled in the next cell.
prediction_xgb = pd.DataFrame()
# Class probabilities for the test rows; column 1 is used below as the
# predicted probability of Is_Lead == 1.
prediction3 = xgb_1.predict_proba(test_new)
prediction3

array([[0.92085403, 0.07914599],
       [0.18765068, 0.8123493 ],
       [0.93467015, 0.06532985],
       ...,
       [0.9053165 , 0.09468355],
       [0.76143897, 0.23856105],
       [0.9552251 , 0.04477488]], dtype=float32)

In [None]:
# XGBoost submission frame: IDs from the sample submission paired with the
# positive-class probability (column 1 of predict_proba).
prediction_xgb = pd.DataFrame({'ID': example.ID, 'Is_Lead': prediction3[:, 1]})
prediction_xgb

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.079146
1,CCMEWNKY,0.812349
2,VK3KGA9M,0.065330
3,TT8RPZVC,0.027708
4,SHQZEYTZ,0.027229
...,...,...
105307,DBENJOYI,0.997340
105308,CWQ72DWS,0.553449
105309,HDESC8GU,0.094684
105310,2PW4SFCA,0.238561


In [None]:
# NOTE(review): plain assignment does not copy, so test_3 and test_new are the
# same DataFrame and the next line adds Is_Lead to test_new as well.
test_3 = test_new
# Attach the XGBoost positive-class probability as a provisional Is_Lead value.
test_3['Is_Lead'] = prediction3[:,1]
test_3

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
245725,-1.001267,-0.678155,-0.452505,4,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0.079146
245726,-0.057992,0.064207,-0.238757,18,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0.812349
245727,-0.866513,-1.018404,-1.066795,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.065330
245728,-1.001267,-0.430701,-0.305817,22,1,0,1,0,0,1,0,0,1,0,1,0,0,0,0.027708
245729,-1.001267,-0.863745,-0.552019,20,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0.027229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351032,0.548399,1.208681,3.631964,18,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0.997340
351033,0.750529,1.208681,0.033855,27,1,0,1,0,0,0,0,1,1,0,0,1,0,0,0.553449
351034,-0.597006,-0.987472,0.669333,4,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0.094684
351035,0.615775,1.425203,-0.458559,4,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0.238561


In [None]:
# Keep only the test rows XGBoost is at least 99% sure about, in either direction.
test_4 = test_3[ (test_3['Is_Lead']<=0.01) | (test_3['Is_Lead']>=0.99) ].copy()
# Derive the hard 0/1 labels from test_4's own probabilities, so the condition
# always has the same index as the frame being modified (a mask built on the
# larger test_3 frame only works through implicit index alignment).
test_4['Is_Lead'] = (test_4['Is_Lead'] >= 0.5).astype(float)
test_4

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
245733,-0.057992,1.054022,-0.150427,34,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
245942,1.289543,1.981974,0.665635,33,1,0,0,1,0,0,0,1,1,0,0,0,1,0,1.0
245981,1.222166,1.208681,-0.903657,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1.0
246006,1.289543,1.394271,0.911962,18,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0
246032,0.683152,0.497251,-0.818817,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350715,1.222166,0.806568,-0.287871,18,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1.0
350769,1.289543,1.425203,0.271656,30,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1.0
350790,-0.260123,1.363340,-0.282927,4,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1.0
350985,0.076761,2.167565,-0.639622,20,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0


In [None]:
# Put the label back next to the features so that the pseudo-labelled test rows
# (which carry an Is_Lead column) can be appended row-wise in the next step.
# Not idempotent: running this cell twice adds a second Is_Lead column.
train_new = pd.concat([train_new,train_target],axis=1)
train_new

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
0,1.963311,-0.121384,-0.098541,18,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0.0
1,-0.933890,-0.461633,-0.639654,27,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
2,0.817906,-0.647223,0.413296,18,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0.0
3,-0.664383,-0.863745,-0.769806,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
4,-0.933890,-0.430701,-0.283976,32,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245720,0.481022,1.920111,0.928227,34,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0.0
245721,-1.136021,-0.987472,-0.311790,18,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0.0
245722,-1.203397,-1.049336,-0.536181,31,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
245723,-1.068644,-0.492564,-0.843264,23,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0


In [None]:
# Append the XGBoost pseudo-labelled test rows below the training rows. The
# index is not reset, so the appended rows keep their index labels from test_4.
train_new = pd.concat([train_new,test_4])
train_new

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Region_Code,Gender,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Voila,Credit_Product_Yes,Is_Active_No,Is_Active_Yes,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Is_Lead
0,1.963311,-0.121384,-0.098541,18,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0.0
1,-0.933890,-0.461633,-0.639654,27,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
2,0.817906,-0.647223,0.413296,18,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0.0
3,-0.664383,-0.863745,-0.769806,20,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
4,-0.933890,-0.430701,-0.283976,32,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350715,1.222166,0.806568,-0.287871,18,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1.0
350769,1.289543,1.425203,0.271656,30,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1.0
350790,-0.260123,1.363340,-0.282927,4,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1.0
350985,0.076761,2.167565,-0.639622,20,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1.0


In [None]:
# Split the enlarged training frame back into target and features.
train_target = train_new['Is_Lead']
train_new = train_new.drop(columns=['Is_Lead'])


In [None]:
# Refresh the NumPy arrays read by the Optuna objective so that they include
# the XGBoost pseudo-labelled rows.
train_new_val = train_new.to_numpy()
train_target_val = train_target.to_numpy()

In [None]:
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=SEED)
def objective(trial, cv_fold_func=np.average):
    """Optuna objective: cross-validated ROC AUC of an XGBoost classifier.

    Reads the module-level arrays ``train_new_val`` / ``train_target_val``
    (at this point including the XGBoost pseudo-labelled rows) and the
    module-level splitter ``skf`` at call time.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        cv_fold_func: Reducer applied to the list of per-fold AUCs
            (defaults to ``np.average``).

    Returns:
        ``cv_fold_func`` applied to the per-fold AUC values.
    """
    # Only options that XGBoost understands are searched. min_child_samples,
    # subsample_freq and num_leaves are LightGBM names that XGBoost does not
    # use, so sampling them only added dimensions with no effect on the model.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.10),
        'subsample': trial.suggest_float('subsample', 0.50, 0.90),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 0.90),
        'max_bin': trial.suggest_int('max_bin', 10, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 9e-5),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 9e-2),
        'missing': -999,
        # Requires a GPU runtime.
        'tree_method' : 'gpu_hist'
    }
    clx = xgb.XGBClassifier(**params)

    # Fit on each training fold and score the held-out fold.
    aucs = []
    for train_idx, valid_idx in skf.split(train_new_val, train_target_val):
        clx.fit(train_new_val[train_idx, :], train_target_val[train_idx])
        preds = clx.predict_proba(train_new_val[valid_idx, :])
        aucs.append(roc_auc_score(train_target_val[valid_idx], preds[:, 1]))

    print(f'Trial done: AUC value: {aucs}')
    return cv_fold_func(aucs)

In [None]:
%%time

FIT_XGB = True

n_trials = 60

if FIT_XGB:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
# Final XGBoost classifier with hard-coded hyperparameters.
# NOTE(review): these values were presumably copied from the best Optuna trial;
# confirm. Unlike the search in `objective`, 'missing' and 'tree_method' are not
# set here.
model = xgb.XGBClassifier(
    n_estimators=906,
    max_depth=6,
    learning_rate=0.0319620937223056,
    min_child_samples=88,
    subsample=0.6153457680634538,
    colsample_bytree=0.7827029344478769,
    subsample_freq=9,
    max_bin=381,
    reg_alpha=1.772254304318213e-05,
    num_leaves=11,
    reg_lambda=0.07128159809882081,
)
# Train on the full feature frame and its labels; the fitted estimator is kept
# as xg_final for the prediction cells that follow.
xg_final = model.fit(train_new, train_target)


In [None]:
# Remove the label column from the test features before predicting.
# errors='ignore' makes the cell safe to re-run: once the column is gone a second
# execution is a no-op instead of raising KeyError.
test_new.drop(columns=['Is_Lead'], inplace=True, errors='ignore')

In [None]:
# Per-class probabilities for every test row (two columns, see the output below).
prediction3 = xg_final.predict_proba(test_new)
# Empty frame that the next cell fills with the submission columns.
prediction_xgb = pd.DataFrame()
prediction3

array([[0.9624088 , 0.03759122],
       [0.12831438, 0.8716856 ],
       [0.91619694, 0.08380306],
       ...,
       [0.9475452 , 0.05245483],
       [0.78868496, 0.211315  ],
       [0.9485549 , 0.05144515]], dtype=float32)

In [None]:
# Build the submission table: IDs come from the sample-submission frame and the
# score is column 1 of the probability matrix.
# NOTE(review): assumes `example` rows are in the same order as `test_new`; confirm.
prediction_xgb = prediction_xgb.assign(
    ID=example.ID,
    Is_Lead=prediction3[:, 1],
)
prediction_xgb

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.037591
1,CCMEWNKY,0.871686
2,VK3KGA9M,0.083803
3,TT8RPZVC,0.021892
4,SHQZEYTZ,0.025893
...,...,...
105307,DBENJOYI,0.987069
105308,CWQ72DWS,0.583463
105309,HDESC8GU,0.052455
105310,2PW4SFCA,0.211315


In [None]:
# Reload a previously saved submission for the final blend.
# NOTE(review): the file name says "sixth" while the variable says "seventh";
# confirm this is the intended file.
seventh_csv = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/day2_sixth_submission_AV.csv')

**Seventh Model is complete**

**In the final model, I start from the beginning: I take the normal train and test values, tune, and predict using LGBM. I then take all of the predicted values, not a subset as in the previous models, where I kept only the values I was 99% sure of. Here I concatenate the train set, the entire test set, and the labels I predicted using LGBM. Finally, on this full concatenated set I run an XGBoost model again and make the predictions on the test set.**

In [None]:
# Reload the saved submission of the eighth model for the final blend.
eight_csv = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/day3_first_submission_AV.csv')

In [None]:
# Collect the 'Is_Lead' scores of every saved submission side by side, one
# column per model. Pairs are listed in the column order of the resulting frame.
_blend_sources = (
    ('submit_lgb_1', first_csv),
    ('submit_lgb_2', second_csv),
    ('submit_lgb_3', third_csv),
    ('submit_lgb_4', fourth_csv),
    ('submit_lgb_5', fifth_csv),
    ('submit_lgb_8', eight_csv),
    ('submit_cat', sixth_csv),
    ('submit_xgb', seventh_csv),
)

submission = pd.DataFrame()
for _blend_name, _blend_frame in _blend_sources:
    submission[_blend_name] = _blend_frame['Is_Lead']
submission

Unnamed: 0,submit_lgb_1,submit_lgb_2,submit_lgb_3,submit_lgb_4,submit_lgb_5,submit_lgb_8,submit_cat,submit_xgb
0,0.043559,0.038008,0.031652,0.052011,0.056972,0.031928,0.046840,0.037591
1,0.854090,0.865280,0.839551,0.852786,0.868912,0.906287,0.840826,0.871686
2,0.060806,0.063410,0.038444,0.056963,0.066624,0.043807,0.055677,0.083803
3,0.024824,0.023051,0.018876,0.019270,0.021967,0.017179,0.023064,0.021892
4,0.023703,0.022842,0.018981,0.018802,0.022085,0.016639,0.020662,0.025893
...,...,...,...,...,...,...,...,...
105307,0.991241,0.991177,0.970158,0.991351,0.992547,0.982457,0.977447,0.987069
105308,0.585155,0.583763,0.544960,0.566964,0.567311,0.596477,0.566113,0.583463
105309,0.077105,0.067359,0.064429,0.088323,0.071828,0.047425,0.075926,0.052455
105310,0.211731,0.214975,0.221025,0.218562,0.229202,0.156578,0.194503,0.211315


In [None]:
# Blend the individual model scores into the final 'Is_Lead' value.
# The per-model columns are selected explicitly so the row mean never touches the
# string 'ID' column: on pandas >= 2.0 a mean over mixed string/numeric columns
# raises TypeError instead of silently skipping the non-numeric ones.
model_columns = [col for col in submission.columns if col.startswith('submit_')]

# Guard keeps the cell re-runnable: once the per-model columns are dropped there
# is nothing left to average and the frame is shown unchanged.
if model_columns:
    submission['ID'] = example.ID
    submission['Is_Lead'] = submission[model_columns].mean(axis=1)
    submission.drop(columns=model_columns, inplace=True)
submission

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.042320
1,CCMEWNKY,0.862427
2,VK3KGA9M,0.058692
3,TT8RPZVC,0.021265
4,SHQZEYTZ,0.021201
...,...,...
105307,DBENJOYI,0.985431
105308,CWQ72DWS,0.574276
105309,HDESC8GU,0.068106
105310,2PW4SFCA,0.207236


**Now I am loading my unmarked submission**

In [None]:
# Read the stored final submission file back from Drive and display it.
final_submission = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/day3_third_submission_AV.csv')
final_submission

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.042320
1,CCMEWNKY,0.862427
2,VK3KGA9M,0.058692
3,TT8RPZVC,0.021265
4,SHQZEYTZ,0.021201
...,...,...
105307,DBENJOYI,0.985431
105308,CWQ72DWS,0.574276
105309,HDESC8GU,0.068106
105310,2PW4SFCA,0.207236
