<a href="https://www.kaggle.com/code/vaishakhraveendran/horsey-new-ensemble?scriptVersionId=144697846" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
from sklearn.decomposition import PCA
import pandas as pd

class ReduceDim:
    """Append principal components of a feature frame as extra columns.

    Wraps a ``PCA`` instance; ``fit_transform`` fits it on the given frame and
    ``transform`` reuses the fitted projection. Both return the input frame
    with additional ``PCA1`` ... ``PCAk`` columns.
    """

    def __init__(self, n):
        """Create the reducer.

        Args:
            n: Value forwarded to ``PCA(n_components=n)``.
        """
        self.n = n
        self.pca = PCA(n_components=n)

    def _append_components(self, frame, components):
        """Return ``frame`` with ``components`` added as ``PCA<i>`` columns."""
        # Name columns after what the projection actually produced rather
        # than assuming ``self.n`` is a plain integer.
        names = [f'PCA{i+1}' for i in range(components.shape[1])]
        # Reuse the caller's index: concat(axis=1) aligns on labels, so a
        # fresh RangeIndex would pad every mismatched row with NaN.
        projected = pd.DataFrame(components, columns=names, index=frame.index)
        return pd.concat([frame, projected], axis=1)

    def fit_transform(self, M):
        """Fit the PCA on DataFrame ``M`` and return ``M`` plus its components."""
        return self._append_components(M, self.pca.fit_transform(M))

    def transform(self, N):
        """Project DataFrame ``N`` with the fitted PCA and return ``N`` plus its components."""
        return self._append_components(N, self.pca.transform(N))

In [None]:
#Model Imports
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import preprocessing

In [None]:
#Correlation plots
def plot_heatmap(df, title):
    """Draw a lower-triangle correlation heatmap of the numerical features.

    Relies on the module-level ``target_col`` and ``num_cols``.

    Args:
        df: Frame holding at least the columns listed in ``num_cols``.
        title: Prefix used in the figure title.
    """
    # The target is never plotted; ignore its absence so unlabeled frames
    # (e.g. the test set) can be passed as well.
    df = df.drop([target_col], axis=1, errors='ignore')
    # Compute the correlation matrix once and reuse it for mask and plot.
    corr = df[num_cols].astype(float).corr()

    # Hide the upper triangle (diagonal included): it mirrors the lower one.
    mask = np.triu(np.ones(corr.shape, dtype=bool))

    # Set the colormap and figure size
    colormap = plt.cm.RdBu_r
    plt.figure(figsize=(16, 16))

    # Set the title and font properties
    plt.title(f'{title} Correlation of Features', fontweight='bold', y=1.02, size=20)

    # Plot the heatmap with the masked upper triangle
    sns.heatmap(corr, linewidths=0.1, vmax=1.0, vmin=-1.0,
                square=True, cmap=colormap, linecolor='white', annot=True,
                annot_kws={"size": 14, "weight": "bold"}, mask=mask)

In [None]:
def df_Impute(df):
    """Fill missing values with a fixed default per column.

    The frame is modified in place and also returned.

    Args:
        df: Frame containing every column listed in the defaults table below.

    Returns:
        The same ``df`` object with the listed columns filled.
    """
    # column -> replacement used wherever that column is null
    defaults = {
        'pain': 'depressed',
        'abdomo_protein': 3.5,
        'rectal_exam_feces': 'absent',
        'abdomen': 'distend_small',
        'packed_cell_volume': 49,
        'total_protein': 7.5,
        'peristalsis': 'hypomotile',
        'abdominal_distention': 'none',
        'nasogastric_tube': 'none',
        'nasogastric_reflux': 'none',
        'nasogastric_reflux_ph': 4.3,
        'rectal_temp': 38.0,
        'pulse': 78,
        'respiratory_rate': 30,
        'temp_of_extremities': 'normal',
        'peripheral_pulse': 'normal',
        'mucous_membrane': 'pale_pink',
        'capillary_refill_time': '3',
    }
    for column, default in defaults.items():
        df[column] = df[column].fillna(default)
    return df


In [None]:
def feature_eng(df_):
    """Add transformed numeric features and merge some category labels.

    The frame is modified in place and also returned.

    Args:
        df_: Frame with the columns ``rectal_temp``, ``pulse``,
            ``total_protein``, ``respiratory_rate``, ``lesion_1``,
            ``abdomo_protein`` and ``nasogastric_reflux_ph``.

    Returns:
        The same ``df_`` object with the new columns added.
    """
    # Replace the temperature by its absolute distance from 37.8.
    df_['rectal_temp'] = (df_['rectal_temp'] - 37.8).abs()
    df_['log_pulse'] = np.log1p(df_['pulse'])
    df_['sqrt_total_protein'] = np.sqrt(df_['total_protein'])
    df_['log_respiratory_rate'] = np.log1p(df_['respiratory_rate'])
    df_['log_lesion_1'] = np.log1p(df_['lesion_1'])
    df_['sqrt_abdomo_protein'] = np.sqrt(df_['abdomo_protein'])
    # Read from the frame being transformed; the global train frame `X` was
    # used here before, which leaked train values into every other frame.
    df_['log_nasogastric_reflux_ph'] = np.log1p(df_['nasogastric_reflux_ph'])
    # Replace specific values for certain columns
    replace_values = {
        "pain": {'slight': 'moderate'},
        "peristalsis": {'distend_small': 'normal'},
        "rectal_exam_feces": {'serosanguious': 'absent'},
        "nasogastric_reflux": {'slight': 'none'}
    }
    df_.replace(replace_values, inplace=True)
    return df_

In [None]:
#warning suppress
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
#Ensembling
from scipy import stats as st
def fold(fold_list,K):
    """Majority-vote the per-fold predictions row by row.

    Args:
        fold_list: Sequence of equally long 1-D prediction arrays.
        K: Number of leading entries of ``fold_list`` to combine.

    Returns:
        The row-wise modes as returned by ``scipy.stats.mode`` (first element
        of its result).
    """
    # Pass a real list: np.column_stack requires a sequence and current NumPy
    # raises TypeError when handed a generator.
    stack = np.column_stack([fold_list[i] for i in range(K)])
    mode = st.mode(stack, axis=1)
    return mode[0]

In [None]:
#Category encoder
from category_encoders import OrdinalEncoder
def cat_encoder(X_train, X_test, cat_cols):
    """Ordinal-encode the categorical columns of both frames.

    The encoder is fitted on ``X_train`` only and then applied to ``X_test``.
    Both frames are modified in place.

    Args:
        X_train: Training features containing ``cat_cols``.
        X_test: Test features containing ``cat_cols``.
        cat_cols: Names of the columns to encode.

    Returns:
        Tuple ``(X_train, X_test, cat_cols)``.
    """
    ordinal = OrdinalEncoder(cols=cat_cols, handle_missing='ignore')
    # Fit on train first, then reuse the learned mapping for test.
    encoded_train = ordinal.fit_transform(X_train[cat_cols]).astype(int)
    encoded_test = ordinal.transform(X_test[cat_cols]).astype(int)
    X_train[cat_cols] = encoded_train[cat_cols]
    X_test[cat_cols] = encoded_test[cat_cols]
    return X_train, X_test, cat_cols

In [None]:
def plot_map(df_train,catCols,n_cols=2):
    """Plot a count heatmap of each categorical feature against the target.

    Relies on the module-level ``target_col``.

    Args:
        df_train: Labeled frame containing ``catCols`` and the target column.
        catCols: Names of the categorical columns to plot; the target itself
            is skipped if listed.
        n_cols: Number of subplot columns in the grid.
    """
    # Drop the target up front so it does not reserve an empty panel.
    columns = [col for col in catCols if col != target_col]
    if not columns:
        return
    # Ceiling division: floor division left an odd number of features one
    # axes short and indexing past the grid raised IndexError.
    n_rows = -(-len(columns) // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid, so
    # flatten() is always available.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
    ax = axes.flatten()
    for i, col in enumerate(columns):
        sns.heatmap(data=pd.crosstab(df_train[col], df_train[target_col]),
                    annot=True, fmt='.0f', ax=ax[i])
        ax[i].set_title(f'{col} Distribution (Train)')
    # Hide panels the grid has left over.
    for unused in ax[len(columns):]:
        unused.set_visible(False)

In [None]:
def plot_hist(df_train,df_test,numCols,n_cols=2):
    """Overlay train and test histograms for each numerical feature.

    Args:
        df_train: Training frame containing ``numCols``.
        df_test: Test frame containing ``numCols``.
        numCols: Names of the numerical columns; ``'hospital_number'`` is
            skipped if present. The list is not modified.
        n_cols: Number of subplot columns in the grid.
    """
    # Filter into a new list: list.remove() mutated the caller's list and
    # raised ValueError whenever the id column was not in it.
    columns = [col for col in numCols if col != 'hospital_number']
    if not columns:
        return
    # Ceiling division so an odd number of features still gets enough axes.
    n_rows = -(-len(columns) // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
    ax = axes.flatten()
    for i, col in enumerate(columns):
        sns.histplot(df_train[col], ax=ax[i], kde=True)
        sns.histplot(df_test[col], ax=ax[i], kde=True)
        ax[i].set_title(f'{col} Distribution (Train v/s Test)')
    # Hide panels the grid has left over.
    for unused in ax[len(columns):]:
        unused.set_visible(False)
            

In [None]:
FILEPATH ='/kaggle/input/playground-series-s3e22'
target_col='outcome'

#Reading the competition train and test sets, indexed by their id column
df_train=pd.read_csv(os.path.join(FILEPATH,'train.csv')).set_index('id')
df_test=pd.read_csv(os.path.join(FILEPATH,'test.csv')).set_index('id')
#The original horse dataset is appended to the train set with ids that
#continue after the largest competition id, then all rows are shuffled.
df_original=pd.read_csv('/kaggle/input/horse-survival-dataset/horse.csv')
first_new_id=df_train.index.max()+1
indexx=np.arange(first_new_id, first_new_id + df_original.shape[0])
df_original.index=indexx
df_train=pd.concat([df_train,df_original],axis=0).sample(frac = 1)


#Defining categorical and numerical columns
#hospital_number is an identifier, so it is treated as categorical
df_train['hospital_number']=df_train['hospital_number'].astype('object')
df_test['hospital_number']=df_test['hospital_number'].astype('object')
#Exclude the target by name instead of assuming it is the last object column
cat_cols=[col for col in df_train.select_dtypes(include='object').columns if col != target_col]
num_cols=df_train.select_dtypes(include=['int64','float64']).columns.to_list()

#checking null columns
#df_train.isna().sum(),df_test.isna().sum()
#Columns with null values
null_columns=[ 'pain','abdomo_protein','rectal_exam_feces','abdomen','packed_cell_volume',
              'total_protein','peristalsis' ,'abdominal_distention','nasogastric_tube',
              'nasogastric_reflux','nasogastric_reflux_ph','rectal_temp','pulse','respiratory_rate',
              'temp_of_extremities','peripheral_pulse','mucous_membrane','capillary_refill_time']

In [None]:
#Data preprocessing
#Data preprocessing: split features and target; every frame gets a fresh 0..n-1 index
X= df_train.drop([f'{target_col}'],axis=1).reset_index(drop=True)
y = df_train[[f'{target_col}']].reset_index(drop=True)
X_=df_test.copy().reset_index(drop=True)

#imputation with fixed defaults (disabled; the KNN imputer below is used instead)
# X_train=df_Impute(X_train)
# X_test=df_Impute(X_test)

#category_encoding: the encoder is fitted on the train features and reused on the test features
X, X_, cat_cols = cat_encoder(X, X_, cat_cols)
#Encode the class labels as integers; the submission cell maps them back to names
y['outcome']= y['outcome'].map({'died':0,'euthanized':1,'lived':2})

#Drop columns
drop_cols = ['lesion_3','lesion_2','hospital_number']
X.drop(drop_cols, axis=1, inplace=True)
X_.drop(drop_cols, axis=1, inplace=True)

#Fill the remaining nulls with a KNN imputer fitted on the train features only.
#(Author's note: null-free, standard-scaled inputs are essential for tensorflow.)
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=10)
X=pd.DataFrame(imputer.fit_transform(X),index=X.index,columns=X.columns)
X_=pd.DataFrame(imputer.transform(X_),index=X_.index,columns=X_.columns)

In [None]:
#Feature engineering
#Feature engineering (same transformations applied to train and test features)
X=feature_eng(X)
X_=feature_eng(X_)

#Dimensionality reduction: append two PCA components, fitted on train and reused on test
#NOTE(review): the PCA is fitted before the standard scaling below, i.e. on
#unscaled features - confirm this ordering is intended.
dim=ReduceDim(2)
X=dim.fit_transform(X)
X_=dim.transform(X_)

#visualise dimensionality reduction: the two components coloured by class
M=pd.concat([X,y],axis=1)
sns.scatterplot(data=M, x="PCA1", y="PCA2", hue=f"{target_col}") 

#feature scaling: scaler fitted on train and reused on test, frames keep their index and column names
scale=preprocessing.StandardScaler()
X=pd.DataFrame(scale.fit_transform(X),index=X.index,columns=X.columns)
X_=pd.DataFrame(scale.transform(X_),index=X_.index,columns=X_.columns)

In [None]:
# Oversample the training data with SMOTE so the classes are more balanced.
# X and y are replaced by their resampled versions.
# NOTE(review): this runs before the StratifiedKFold split in the training
# cell, so resampled rows can land in the validation folds - confirm this is
# acceptable for the validation data passed as eval_set.
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import regularizers

from tensorflow import keras
# Early-stopping callback for the Keras model: patience of 15 epochs, minimum
# improvement of 0.001, and the best weights are restored when training stops.
# No `monitor` is passed, so the Keras default metric applies.
early_stopping = keras.callbacks.EarlyStopping(
    patience=15,
    min_delta=0.001,
    restore_best_weights=True,
)

# Learning-rate schedule handed to the Adam optimizer in tf_model: starts at
# 0.001 and decays as training progresses.
# NOTE(review): decay_steps is derived from the row count of X at the time
# this cell runs - confirm the intended unit (optimizer steps vs. samples).
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
  0.001,
  decay_steps=X.shape[0]*20,
  decay_rate=1,
  staircase=False)

def tf_model(input_dim=32):
    """Build and compile the dense neural network classifier.

    The network has three L2-regularised hidden layers, a narrow 8-unit
    layer, dropout in between, and a 3-way softmax output. It is compiled
    with Adam on the module-level ``lr_schedule`` and categorical
    cross-entropy, so targets must be one-hot encoded.

    Args:
        input_dim: Number of input features. Defaults to 32, the width that
            was hard-coded before; pass ``X.shape[1]`` to follow the data.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model_tf=tf.keras.Sequential([
        keras.layers.Input(shape=[input_dim,]),
        keras.layers.Dense(256, activation='relu',kernel_regularizer=regularizers.l2(0.003)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(128, activation='relu',kernel_regularizer=regularizers.l2(0.003),),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001),),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(3, activation='softmax')])

    model_tf.compile(optimizer= keras.optimizers.Adam(lr_schedule),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model_tf
    

In [None]:
####################### XGB CLASSIFIER #######################
from xgboost import XGBClassifier
def xgb_model():
    """Return an unfitted XGBoost classifier configured for the 3-class task."""
    # Kept as a dict literal because 'lambda' cannot be written as a keyword argument.
    return XGBClassifier(**{
        'n_estimators': 5000,
        'n_jobs': -1,
        'max_depth': 5,
        'eta': 0.2,
        'colsample_bytree': 0.8,
        'objective': 'multi:softprob',
        'num_class': 3,
        'alpha': 8e-07,
        'lambda': 0.0012,
        'early_stopping_rounds': 10,
        'verbose': 0,
    })
######################## Naive Bayes ##############
from sklearn.naive_bayes import MultinomialNB
def naive_model():
    """Return an unfitted multinomial naive Bayes classifier with default settings."""
    return MultinomialNB()
################## LDA #############################
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
def lad_model():
    """Return an unfitted linear discriminant analysis classifier with default settings."""
    classifier = LinearDiscriminantAnalysis()
    return classifier
 
################## LGBM CLASSIFIER ############################
from lightgbm import LGBMClassifier
def lgbm_model():
    """Return an unfitted LightGBM classifier configured for the 3-class task."""
    return LGBMClassifier(
        objective='multiclass',
        metric='auc_mu',
        num_class=3,
        feature_pre_filter=False,
        lambda_l1=1.0309196948445505e-05,
        lambda_l2=0.034674458424846455,
        num_leaves=52,
        feature_fraction=0.8480000000000001,
        bagging_fraction=0.9622932514137276,
        bagging_freq=6,
        verbose=0,
        min_child_samples=20,
        num_iterations=50,
        early_stopping_round=None,
    )
    
####################### CATBOOST CLASSIFIER ###############
from catboost import CatBoostClassifier 
def cat_model():
    """Return an unfitted CatBoost classifier with a multi-class loss and TotalF1 eval metric."""
    return CatBoostClassifier(
        depth=6,
        learning_rate=0.05,
        l2_leaf_reg=0.7,
        random_strength=0.2,
        max_bin=200,
        od_wait=65,
        one_hot_max_size=70,
        grow_policy='Depthwise',
        bootstrap_type='Bayesian',
        od_type='Iter',
        eval_metric='TotalF1',
        loss_function='MultiClass',
        verbose=0,
    )


############## RANDOM FOREST ########
from sklearn.ensemble import RandomForestClassifier
def forest_model():
    """Return an unfitted random forest classifier with default settings."""
    classifier = RandomForestClassifier()
    return classifier

############## HIST CLASSIFIER
from sklearn.ensemble import HistGradientBoostingClassifier
def hist_model():
    """Return an unfitted histogram gradient boosting classifier with fixed hyper-parameters."""
    return HistGradientBoostingClassifier(
        l2_regularization=0.1,
        early_stopping=True,
        learning_rate=0.1,
        max_iter=80,
        max_depth=4,
        scoring='f1_micro',
        max_bins=255,
        min_samples_leaf=10,
        max_leaf_nodes=21,
        class_weight='balanced',
        random_state=42,
        verbose=0,
    )
########## ADA BOOST CLASSIFIER #############
from sklearn.ensemble import AdaBoostClassifier
def ada_model():
    """Return an unfitted AdaBoost classifier with default settings."""
    classifier = AdaBoostClassifier()
    return classifier

In [None]:
#Checking for optimal transformation.
# t = np.sqrt(X['lesion_1'])
# sns.histplot(x=t, kde=True)

In [None]:
#Visualising numerical columns: one histogram with KDE per feature on a 2x2 grid
fig,axs=plt.subplots(2, 2, figsize=(10, 2*5))
ax=axs.flatten()
for i,col in enumerate(['total_protein','nasogastric_reflux_ph','pulse','lesion_1']):
    sns.histplot(x=X[col],kde=True,ax=ax[i])
#tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()

In [None]:
# Row positions whose ensemble class probabilities (predicted for X_) are
# printed in every fold of the training loop.
# NOTE(review): how these positions were selected is not shown in this notebook.
prob_index=[12, 13, 25, 33, 40, 45, 58, 61, 62, 66, 77, 87, 88, 95, 99, 111, 128, 132, 138, 152, 158, 163, 166, 168, 170, 172, 174, 175, 177, 185, 210, 239, 240, 246, 262, 263, 269, 276, 283, 304, 318, 324, 326, 351, 361, 370, 379, 394, 400, 402, 405, 406, 414, 427, 430, 444, 450, 453, 456, 493, 496, 501, 502, 505, 515, 517, 527, 532, 539, 544, 547, 548, 551, 553, 582, 585, 592, 598, 601, 603, 604, 630, 641, 662, 663, 692, 698, 701, 705, 711, 714, 717, 722, 739, 748, 750, 754, 763, 767, 771, 778, 789, 796, 811]

In [None]:
# Cross-validated ensemble training.
# For each of the stratified folds, every model is fitted on the training part
# of the fold, the predicted class probabilities for the test features X_ are
# averaged across models, and the arg-max class is stored as one column of
# `preds` (one column per fold).
# eval_set=[(X_val, y_val)]
n_split=10
random_state=42
preds=pd.DataFrame()
kf = StratifiedKFold(n_splits=n_split, random_state=random_state, shuffle=True)
for i,(train_index, val_index) in enumerate(kf.split(X,y)):
    # Fresh containers per fold; after the loop `models` holds the last fold's models.
    models={}
    class_probs={}
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    
    ########### Deep neural model (disabled) ###################
#     y_train, y_val = [tf.keras.utils.to_categorical(y.iloc[index]) for index in [train_index, val_index]]
#     model=tf_model()
#     model.fit(X_train,y_train,validation_data=[X_val,y_val],epochs=300,
#                     callbacks=[early_stopping],batch_size=200)
#     models['tf_model']=model
    
    ###### Non-neural models #######################
    y_train, y_val = y.iloc[train_index],y.iloc[val_index]
    ###XGB classifier ########
    model=xgb_model()
    model.fit(X_train, y_train,eval_set=[(X_val,y_val)])
    models['xgb_model']=model
    ####LGBM classifier########
    model=lgbm_model()
    model.fit(X_train, y_train,eval_set=[(X_val,y_val)])
    models['lgbm_model']=model
    ####CAT classifier########
    model=cat_model()
    model.fit(X_train, y_train,eval_set=[(X_val,y_val)])
    models['cat_model']=model
    ####### RANDOM FOREST ####
    model=forest_model()
    model.fit(X_train,y_train)
    models['forest_model']=model
    ######### HIST GRAD#######
    model=hist_model()
    model.fit(X_train,y_train)
    models['hist_model']=model
    ######### ADA MODEL (disabled) #######
#     model=ada_model()
#     model.fit(X_train,y_train)
#     models['ada_model']=model
    
    ##### Ensemble: average the class probabilities of all models #####
    for model_name, model in models.items():
        # The Keras model returns probabilities from predict(); the others use predict_proba().
        if model_name =='tf_model':
             probs = model.predict(X_)
        else :
            probs=model.predict_proba(X_)
        class_probs[model_name] = probs
        #print(f'{model_name}: {f1_score(y_val, model.predict(X_val), average="micro")}')
    ensemble_probs = np.mean(list(class_probs.values()), axis=0)
    # Print the averaged probabilities of the rows listed in prob_index.
    print([ensemble_probs[i] for i in prob_index])
    # Predicted class per test row for this fold, stored as a new first column of preds.
    probs= np.argmax(ensemble_probs, axis=1)
    preds.insert(loc=0, column=f'fold_{i+1}', value=probs)
    
    print(f'############## FOLD{i+1}########################')

In [None]:
from scipy import stats as st
# Majority vote across the per-fold predictions of every test row.
# Only the fold_* columns are used, so re-running this cell does not fold a
# previous 'mode' column into the vote. DataFrame.mode lists the modes of each
# row in ascending order; column 0 is therefore the smallest most frequent
# class, independent of the installed SciPy version.
preds['mode']=preds.filter(like='fold_').mode(axis=1)[0].astype(int)

In [None]:
import matplotlib.pyplot as plt
# Ten largest feature importances of four ensemble members on a 2x2 grid.
# `models` holds the models fitted in the last cross-validation fold.
fig, axs=plt.subplots(2,2,figsize=(12, 5*2))
ax=axs.flatten()
for i,model in enumerate(['xgb_model','cat_model','lgbm_model','forest_model']):
    feat_imp = pd.Series(models[model].feature_importances_, index=X.columns)
    ax[i].set_title(f'Feature_Importance_{model}')
    feat_imp.nlargest(10).plot(kind='barh',ax=ax[i])
    # Rotate the tick labels of this panel; plt.xticks only touched the current axes.
    ax[i].tick_params(axis='x', rotation=45)
    
plt.tight_layout()

In [None]:
# Load the sample submission to obtain the expected ids.
df_submission = pd.read_csv(os.path.join(FILEPATH, 'sample_submission.csv'))
df_submission = df_submission.set_index('id')
# Assign the voted class ids by position (preds was built from predictions on X_).
df_submission['outcome'] = preds['mode'].to_numpy()
# Translate the integer class ids back to the label names.
df_submission['outcome'] = df_submission['outcome'].map(
    {0:'died',1:'euthanized',2:'lived'}
)


In [None]:
# Write the submission (id index plus the 'outcome' column) to the working directory.
df_submission.to_csv('submissiony_double_kill_monsters.csv')