In [None]:
## data wrangling tools
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import os

#stats for ensembling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

#Preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing

#Ensembling
from scipy import stats as st

In [None]:
#Category encoder
def cat_encoder(X_train, X_test, cat_cols):
    """Ordinal-encode the categorical columns of the train and test frames.

    The encoder is fitted on ``X_train`` only and then applied to both frames.
    The encoded integer codes overwrite the original columns, so both frames
    are modified in place as well as being returned.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames that both contain every column named in ``cat_cols``.
    cat_cols : sequence of str
        Names of the columns to encode.

    Returns
    -------
    tuple
        ``(X_train, X_test, encoder_cols)`` where ``encoder_cols`` is the list
        of encoded column names.
    """
    encoder_cols = list(cat_cols)
    if not encoder_cols:
        # Nothing to encode; skip fitting an encoder on a zero-column selection.
        return X_train, X_test, encoder_cols

    # Categories that appear only in the test frame are encoded as -1 instead
    # of making ``transform`` raise.
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train_codes = encoder.fit_transform(X_train[encoder_cols]).astype(int)
    test_codes = encoder.transform(X_test[encoder_cols]).astype(int)

    # enumerate gives the column position directly (no repeated list.index scan).
    for position, col in enumerate(encoder_cols):
        X_train[col] = train_codes[:, position]
        X_test[col] = test_codes[:, position]
    return X_train, X_test, encoder_cols

In [None]:
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.feature_selection import VarianceThreshold

#Implementing the Transformer class
class low_var(TransformerMixin):
    """Transformer that drops columns whose variance is below a threshold.

    ``fit`` records the columns of ``X`` whose variance is smaller than
    ``threshold``; ``transform`` returns a frame without those columns.

    Parameters
    ----------
    threshold : float, default 0.3
        Columns with variance strictly below this value are dropped.
    """

    def __init__(self, threshold=0.3):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record the low-variance columns of the DataFrame ``X``; ``y`` is ignored."""
        # numeric_only=True: variance is undefined for object columns, and
        # pandas 2.x raises on them instead of silently skipping them.
        col_vars = X.var(numeric_only=True)
        self.col_to_drop = col_vars[col_vars < self.threshold].index
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the columns recorded by ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This is a subclass of
            AttributeError, which is what an unfitted instance raised before.
        """
        col_to_drop = getattr(self, 'col_to_drop', None)
        if col_to_drop is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('Drop_col error, must be fitted before transform')
        # Return a new frame rather than mutating the caller's data in place.
        return X.drop(columns=col_to_drop)

In [None]:
#Reading the dataset (missing values are replaced by 0 in both frames)
FILEPATH="/kaggle/input/analytic-olympiad"
df_train=pd.read_csv(os.path.join(FILEPATH,'train.csv')).fillna(0)
df_test=pd.read_csv(os.path.join(FILEPATH,'test.csv')).fillna(0)


#Defining the features
target_col01='primary_close_flag'
target_col02='final_close_flag'
target_cols=[target_col01,target_col02]
cat_cols=df_train.select_dtypes(include="object").columns.to_list()
# Exclude the targets by name instead of slicing off the last two numeric
# columns, which is only correct if the targets happen to come last.
num_cols=[
    col
    for col in df_train.select_dtypes(include=["int64","float64"]).columns
    if col not in target_cols
]

#Defining the training and testing dataset
X = df_train.drop(columns=target_cols).reset_index(drop=True)
# Only the first target is modelled in this notebook.
y = df_train[target_col01].reset_index(drop=True)
# errors='ignore': drop the targets from the test frame only if they are there.
X_=df_test.drop(columns=target_cols,errors='ignore')

In [None]:
# Drop_col
encoded_features = ['encoded_payment_' + str(i) for i in range(25)]
# for col in encoded_features:
#     X.drop([col], axis=1, inplace=True)
#     X_.drop([col],axis =1, inplace =True)
drop_cols = ['customer_id', 'firstname', 'lastname']
X = X.drop(columns=drop_cols)
X_ = X_.drop(columns=drop_cols)

#Reassigning cat_cols
cat_cols=X.select_dtypes(include="object").columns.to_list()
# The targets were already removed from X, so every numeric column is a
# feature; slicing off the last two here would discard two real features.
num_cols=X.select_dtypes(include=["int64","float64"]).columns.to_list()

#Removing columns with low variance
# var_col=low_var(1)
# X=pd.concat([var_col.fit_transform(X[num_cols]),X[cat_cols]],axis=1)
# X_=pd.concat([var_col.transform(X_[num_cols]),X_[cat_cols]],axis=1)

# category_encoders
X, X_, cat_cols = cat_encoder(X, X_, cat_cols)

In [None]:
# Standardise every column: the scaler is fitted on the training frame and the
# same fitted scaler is then applied to the test frame. Index and column labels
# are restored because the scaler returns plain arrays.
scale = preprocessing.StandardScaler()
X = pd.DataFrame(
    data=scale.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_ = pd.DataFrame(
    data=scale.transform(X_),
    columns=X_.columns,
    index=X_.index,
)

In [None]:
from tensorflow import keras
from tensorflow.keras import regularizers
import tensorflow as tf

from tensorflow import keras
# Stop training when the validation loss stops improving and restore the
# weights of the best epoch. The model below is compiled with
# metrics=['accuracy'] only, so Keras never logs 'val_binary_crossentropy';
# monitoring it would leave this callback inert. 'val_loss' is logged whenever
# validation data is passed to fit().
early_stopping = keras.callbacks.EarlyStopping(
    patience=15,
    monitor='val_loss',
    min_delta=0.001,
    restore_best_weights=True,
)

#It is better to reduce the learning rate as we do training.
# NOTE(review): decay_steps is set to the number of training rows, while the
# schedule advances once per optimizer step (batch) - confirm this is the
# intended decay speed.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    0.001,
    decay_steps=X.shape[0],
    decay_rate=1,
    staircase=False,
)

def tf_model(input_dim=None):
    """Build and compile the dense Keras classifier.

    The network is a stack of ReLU dense layers (256-128-64-8) with L2 weight
    penalties and dropout, ending in a two-unit sigmoid layer. It is compiled
    with Adam driven by the module-level ``lr_schedule``, binary cross-entropy
    loss and the accuracy metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the column count of the
        module-level feature frame ``X`` is used, so the network always
        matches the data instead of assuming exactly 44 features.

    Returns
    -------
    tf.keras.Sequential
        The compiled, untrained model.
    """
    if input_dim is None:
        input_dim = X.shape[1]

    model_tf = tf.keras.Sequential([
        keras.layers.Input(shape=(input_dim,)),
        keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.003)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.003)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dropout(0.2),
        # Two sigmoid units, one per class; the training loop feeds this model
        # one-hot targets built with to_categorical.
        keras.layers.Dense(2, activation='sigmoid'),
    ])

    model_tf.compile(
        optimizer=keras.optimizers.Adam(lr_schedule),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model_tf

In [None]:
####################### XGB CLASSIFIER #######################
from xgboost import XGBClassifier
def xgb_model(variant='01'):
    """Return an unfitted XGBClassifier built from a stored parameter set.

    Parameters
    ----------
    variant : str, default '01'
        Which stored hyper-parameter set to use: ``'01'`` (the one used so far)
        or ``'02'``. Selecting by argument replaces editing the return line.

    Returns
    -------
    XGBClassifier

    Raises
    ------
    ValueError
        If ``variant`` is not one of the stored parameter sets.
    """
    param_sets = {
        '01': {
            'n_estimators': 120,
            'learning_rate': 0.5619556985982561,
            'max_depth': 136,
            'min_child_weight': 10,
            'reg_alpha': 0.4632934146772244,
            'reg_lambda': 13,
            'gamma': 0.7471461478419492,
            'colsample_bytree': 0.8939543526804394,
            'colsample_bylevel': 0.33219308427147426,
        },
        '02': {
            'n_estimators': 320,
            'learning_rate': 0.6771864073202802,
            'max_depth': 179,
            'min_child_weight': 14,
            'reg_alpha': 0.6456816599087696,
            'reg_lambda': 10,
            'gamma': 1.264355703745565,
            'colsample_bytree': 0.6432920257822892,
            'colsample_bylevel': 0.4547371625752076,
        },
    }
    # Validate before touching the estimator class so a typo fails clearly.
    if variant not in param_sets:
        raise ValueError(
            f'Unknown xgb parameter set {variant!r}; expected one of {sorted(param_sets)}'
        )
    return XGBClassifier(**param_sets[variant])
################## LGBM CLASSIFIER ############################
from lightgbm import LGBMClassifier
def lgbm_model(variant='01'):
    """Return an unfitted LGBMClassifier built from a stored parameter set.

    Parameters
    ----------
    variant : str, default '01'
        Which stored hyper-parameter set to use: ``'01'`` (the one used so far)
        or ``'02'``. Selecting by argument replaces editing the return line.

    Returns
    -------
    LGBMClassifier

    Raises
    ------
    ValueError
        If ``variant`` is not one of the stored parameter sets.
    """
    param_sets = {
        '01': {
            'n_estimators': 229,
            'learning_rate': 0.5185147161031304,
            'max_depth': 70,
            'min_child_weight': 10,
            'reg_alpha': 0.699472780990506,
            'reg_lambda': 7,
            'colsample_bytree': 0.3072359964466818,
        },
        '02': {
            'n_estimators': 327,
            'learning_rate': 0.7862270379341548,
            'max_depth': 199,
            'min_child_weight': 13,
            'reg_alpha': 0.8800992852019061,
            'reg_lambda': 6,
            'colsample_bytree': 0.8218273150299635,
        },
    }
    # Validate before touching the estimator class so a typo fails clearly.
    if variant not in param_sets:
        raise ValueError(
            f'Unknown lgbm parameter set {variant!r}; expected one of {sorted(param_sets)}'
        )
    return LGBMClassifier(**param_sets[variant])
    
####################### CATBOOST CLASSIFIER ###############
from catboost import CatBoostClassifier 
def cat_model():
    """Return an unfitted CatBoostClassifier; no parameters are overridden."""
    # The parameter dictionary is empty, so this is the library's default setup.
    return CatBoostClassifier()

############## TPOT Classifier ###############
from tpot import TPOTClassifier
def tpot_model():
    """Return an unfitted TPOTClassifier.

    The search runs 2 generations with a population of 15 and scores
    candidates by accuracy.
    """
    return TPOTClassifier(generations=2, population_size=15, scoring='accuracy')

In [None]:
# Stratified K-fold ensemble: in every fold the base models are fitted on the
# training split, their class probabilities are averaged, and the arg-max label
# for every test row is stored as one column of `preds`.
n_split=3
random_state=42

# The Keras network was switched off (its fit call was commented out). It is
# kept behind this flag so that no unused network is built in every fold.
USE_TF_MODEL = False

# (name, factory) pairs of the tree models that are fitted in every fold.
MODEL_BUILDERS = [
    ('xgb_model', xgb_model),
    ('lgbm_model', lgbm_model),
    ('cat_model', cat_model),
    # ('tpot_model', tpot_model),  # disabled
]


def _ensemble_probs(models, features):
    """Average the class probabilities predicted by all fitted models."""
    per_model = [
        # Keras returns probabilities from predict(); the others use predict_proba().
        model.predict(features) if name == 'tf_model' else model.predict_proba(features)
        for name, model in models.items()
    ]
    return np.mean(per_model, axis=0)


kf = StratifiedKFold(n_splits=n_split, random_state=random_state, shuffle=True)
fold_preds = {}
for i, (train_index, val_index) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    models = {}

    ########### Deep neural model (optional) ###################
    if USE_TF_MODEL:
        # The two-unit output layer is trained on one-hot targets.
        y_train_cat = tf.keras.utils.to_categorical(y_train)
        y_val_cat = tf.keras.utils.to_categorical(y_val)
        model = tf_model()
        model.fit(X_train, y_train_cat, validation_data=(X_val, y_val_cat), epochs=1,
                  callbacks=[early_stopping], batch_size=200)
        models['tf_model'] = model

    ###### Non-neural models #######################
    for model_name, build in MODEL_BUILDERS:
        model = build()
        model.fit(X_train, y_train)
        models[model_name] = model

    ##### Ensemble models #######################
    # The validation split is used to report how the fold's ensemble performs.
    # Like the test prediction below, this treats the arg-max column index as
    # the class label.
    val_labels = np.argmax(_ensemble_probs(models, X_val), axis=1)
    val_accuracy = (val_labels == y_val.to_numpy()).mean()

    fold_preds[f'fold_{i+1}'] = np.argmax(_ensemble_probs(models, X_), axis=1)

    print(f'############## FOLD{i+1}########################')
    print(f'validation accuracy: {val_accuracy:.4f}')

# Build the frame once instead of inserting one column per fold.
preds = pd.DataFrame(fold_preds)

In [None]:
from scipy import stats as st
# Majority vote over the per-fold predictions. Only the fold_* columns take
# part, so re-running this cell does not feed the previous 'mode' column back
# into the vote. DataFrame.mode(axis=1) lists each row's modes in ascending
# order, so column 0 is the smallest one when there is a tie; the cast undoes
# the float upcast that happens when tied rows pad other columns with NaN.
fold_cols = [col for col in preds.columns if str(col).startswith('fold_')]
preds['mode'] = preds[fold_cols].mode(axis=1)[0].astype(int)

In [None]:
# Load the submission template, overwrite its target column with the
# majority-vote labels and write the submission file.
submission_template = '/kaggle/input/trivial-base/submission_trivial_base.csv'
result = pd.read_csv(submission_template)
result[target_col01] = preds['mode']
result.to_csv('big_mommy.csv', index=False)