<a href="https://www.kaggle.com/code/vaishakhraveendran/eda-and-base-season-3-episode-22?scriptVersionId=143086649" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
#Model Imports
from xgboost import XGBClassifier
import lightgbm
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import preprocessing

In [None]:
#Correlation plots
def plot_heatmap(df, title):
    """Plot a lower-triangle correlation heatmap of the numeric features.

    Reads the module-level ``target_col`` and ``num_cols``.

    Args:
        df: DataFrame containing every column listed in ``num_cols``. The
            target column is dropped when present, so frames without it are
            accepted as well.
        title: Text placed in front of "Correlation of Features" in the
            figure title.
    """
    # errors='ignore' lets the function run on frames that lack the target.
    features = df.drop(columns=[target_col], errors='ignore')[num_cols]

    # Compute the correlation matrix once; it feeds both the mask and the plot.
    corr = features.astype(float).corr()

    # Hide the upper triangle, diagonal included: the matrix is symmetric,
    # so those cells only repeat what the lower triangle already shows.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    plt.figure(figsize=(16, 16))
    plt.title(f'{title} Correlation of Features', fontweight='bold', y=1.02, size=20)

    sns.heatmap(
        corr,
        linewidths=0.1,
        vmax=1.0,
        vmin=-1.0,
        square=True,
        cmap=plt.cm.RdBu_r,
        linecolor='white',
        annot=True,
        annot_kws={"size": 14, "weight": "bold"},
        mask=mask,
    )

In [None]:
def df_Impute(df):
    """Fill missing values with a fixed default for each known column.

    The columns of ``df`` are overwritten one by one, so the caller's frame
    is modified; the same object is returned for convenience.

    Args:
        df: DataFrame containing every column named in the table below.

    Returns:
        ``df`` itself, with the listed columns filled.
    """
    # column -> replacement used wherever that column is missing
    defaults = {
        'pain': 'depressed',
        'abdomo_protein': 3.5,
        'rectal_exam_feces': 'absent',
        'abdomen': 'distend_small',
        'packed_cell_volume': 49,
        'total_protein': 7.5,
        'peristalsis': 'hypomotile',
        'abdominal_distention': 'moderate',
        'nasogastric_tube': 'slight',
        'nasogastric_reflux': 'none',
        'nasogastric_reflux_ph': 4.3,
        'rectal_temp': 38.0,
        'pulse': 78,
        'respiratory_rate': 30,
        'temp_of_extremities': 'cool',
        'peripheral_pulse': 'normal',
        'mucous_membrane': 'pale_pink',
        'capillary_refill_time': 'less_3_sec',
    }
    for column, replacement in defaults.items():
        df[column] = df[column].fillna(replacement)
    return df


In [None]:
def feature_eng(df_):
    """Add the engineered numeric columns to ``df_`` and return it.

    ``rectal_temp`` is replaced by its absolute distance from 37.8, and four
    transformed columns are appended. The caller's frame is modified.

    Args:
        df_: DataFrame with ``rectal_temp``, ``pulse``, ``respiratory_rate``,
            ``lesion_1`` and ``abdomo_protein`` columns.

    Returns:
        ``df_`` itself, with the new columns added.
    """
    df_['rectal_temp'] = (df_['rectal_temp'] - 37.8).abs()

    # new column -> source column passed through log(1 + x)
    log_features = {
        'log_pulse': 'pulse',
        'log_respiratory_rate': 'respiratory_rate',
        'log_lesion_1': 'lesion_1',
    }
    for new_name, source in log_features.items():
        df_[new_name] = np.log1p(df_[source])

    df_['sqrt_abdomo_protein'] = np.sqrt(df_['abdomo_protein'])
    return df_

In [None]:
#warning suppress
# NOTE: the second filter has no category argument, so it ignores every
# warning class and makes the FutureWarning-specific filter redundant.
# Deprecation notices from the libraries used in this file are hidden too.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
#Ensembling
from scipy import stats as st
def fold(fold_list,K):
    """Majority-vote the first ``K`` prediction arrays row by row.

    Args:
        fold_list: Indexable collection of prediction arrays that all hold
            the same number of rows (1-D, or 2-D with a single column).
        K: Number of leading entries of ``fold_list`` to combine.

    Returns:
        1-D array holding, for every row, the most frequent value among the
        ``K`` predictions.

    Raises:
        IndexError: If ``fold_list`` holds fewer than ``K`` entries.
    """
    # Build a real list: NumPy's stack functions require a sequence and
    # reject generators.
    stack = np.column_stack([fold_list[i] for i in range(K)])
    mode = st.mode(stack, axis=1)
    # Depending on the SciPy version the mode array is (n,) or (n, 1);
    # flatten it so callers always receive one value per row.
    return np.ravel(mode[0])

In [None]:
class Splitter:
    """Generate cross-validation train/validation splits for a list of seeds.

    Args:
        kfold: Selects the splitting strategy. ``'skf'`` uses
            ``StratifiedKFold``; any other truthy value (including the
            default ``True``) uses ``KFold``; a falsy value makes
            ``split_data`` raise ``ValueError``.
        n_splits: Number of folds produced per random state.
        cat_df: Optional DataFrame stored on the instance as ``cat_df``;
            this class does not read it. A fresh empty DataFrame is created
            when omitted, so instances never share one default object.
    """

    def __init__(self, kfold=True, n_splits=5, cat_df=None):
        self.n_splits = n_splits
        # Keep the caller's choice; it was previously overwritten with 'skf',
        # which made the KFold and error branches unreachable.
        self.kfold = kfold
        self.cat_df = pd.DataFrame() if cat_df is None else cat_df

    def split_data(self, X, y, random_state_list):
        """Yield ``(X_train, X_val, y_train, y_val)`` for every fold and seed.

        Args:
            X: Feature frame, indexed positionally through ``.iloc``.
            y: Target frame or series, indexed positionally through ``.iloc``.
            random_state_list: Seeds; each one produces ``n_splits`` shuffled
                folds.

        Raises:
            ValueError: On first iteration, when ``kfold`` is falsy.
        """
        if self.kfold == 'skf':
            splitter_cls = StratifiedKFold
        elif self.kfold:
            splitter_cls = KFold
        else:
            raise ValueError(
                f"Invalid kfold: {self.kfold!r}. "
                "Use 'skf' for StratifiedKFold or True for KFold."
            )

        for random_state in random_state_list:
            kf = splitter_cls(n_splits=self.n_splits, random_state=random_state, shuffle=True)
            for train_index, val_index in kf.split(X, y):
                yield (
                    X.iloc[train_index],
                    X.iloc[val_index],
                    y.iloc[train_index],
                    y.iloc[val_index],
                )

In [None]:
#Category encoder
from category_encoders import OrdinalEncoder
def cat_encoder(X_train, X_test, cat_cols):
    """Ordinal-encode the categorical columns of both frames.

    The encoder is fitted on the training columns only and then applied to
    the test columns. Both input frames are modified: their ``cat_cols``
    columns are overwritten with the integer codes.

    Args:
        X_train: Training features containing ``cat_cols``.
        X_test: Test features containing ``cat_cols``.
        cat_cols: Names of the columns to encode.

    Returns:
        Tuple ``(X_train, X_test, cat_cols)``: the two modified frames and
        the list of encoded column names.
    """
    ordinal = OrdinalEncoder(cols=cat_cols, handle_missing='ignore')
    train_codes = ordinal.fit_transform(X_train[cat_cols]).astype(int)
    test_codes = ordinal.transform(X_test[cat_cols]).astype(int)

    X_train[cat_cols] = train_codes[cat_cols]
    X_test[cat_cols] = test_codes[cat_cols]
    return X_train, X_test, cat_cols

In [None]:
def plot_map(df_train,catCols,n_cols=2):
    """Draw one crosstab heatmap per categorical column against the target.

    Reads the module-level ``target_col``.

    Args:
        df_train: DataFrame holding the columns in ``catCols`` and the target.
        catCols: Categorical column names; the target itself is skipped if
            it appears in the list.
        n_cols: Number of subplot columns in the grid.
    """
    columns = [col for col in catCols if col != target_col]
    if not columns:
        return

    # Ceiling division: an incomplete last row still needs its own grid row.
    n_rows = -(-len(columns) // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
    ax = axes.flatten()

    for axis, col in zip(ax, columns):
        sns.heatmap(data=pd.crosstab(df_train[col], df_train[target_col]),
                    annot=True, fmt='.0f', ax=axis)
        axis.set_title(f'{col} Distribution (Train)')

    # Hide grid cells that received no plot.
    for axis in ax[len(columns):]:
        axis.set_visible(False)

In [None]:
def plot_hist(df_train,df_test,numCols,n_cols=2):
    """Overlay train and test histograms for each numeric column.

    Args:
        df_train: Training DataFrame containing the columns in ``numCols``.
        df_test: Test DataFrame containing the same columns.
        numCols: Numeric column names. ``'hospital_number'`` is skipped when
            present; the caller's list is left untouched.
        n_cols: Number of subplot columns in the grid.
    """
    # Filter into a new list instead of list.remove(): that call mutated the
    # caller's list and raised ValueError when the name was not in it.
    columns = [col for col in numCols if col != 'hospital_number']
    if not columns:
        return

    # Ceiling division: an incomplete last row still needs its own grid row.
    n_rows = -(-len(columns) // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
    ax = axes.flatten()

    for axis, col in zip(ax, columns):
        sns.histplot(df_train[col], ax=axis, kde=True)
        sns.histplot(df_test[col], ax=axis, kde=True)
        axis.set_title(f'{col} Distribution (Train v/s Test)')

    # Hide grid cells that received no plot.
    for axis in ax[len(columns):]:
        axis.set_visible(False)
            

In [None]:
FILEPATH ='/kaggle/input/playground-series-s3e22'
# Defined first so the column lists below can refer to the target by name.
target_col='outcome'

#Reading the train and test sets and joining
df_train=pd.read_csv(os.path.join(FILEPATH,'train.csv')).set_index('id')
df_test=pd.read_csv(os.path.join(FILEPATH,'test.csv')).set_index('id')
df_original=pd.read_csv('/kaggle/input/horse-survival-dataset/horse.csv')
# Number the original rows after the largest competition id so the
# concatenated index has no duplicate labels.
first_free_id=df_train.index.max()+1
indexx=np.arange(first_free_id, first_free_id + df_original.shape[0])
df_original.index=indexx
# NOTE(review): the shuffle has no random_state, so row order (and therefore
# the cross-validation folds) differs between runs.
df_train=pd.concat([df_train,df_original],axis=0).sample(frac = 1)


#Defining categorical and numerical columns
# hospital_number is an identifier, so it is treated as categorical.
df_train['hospital_number']=df_train['hospital_number'].astype('object')
df_test['hospital_number']=df_test['hospital_number'].astype('object')
# Exclude the target by name rather than assuming it is the last object column.
cat_cols=[col for col in df_train.select_dtypes(include='object').columns if col != target_col]
num_cols=df_train.select_dtypes(include=['int64','float64']).columns.to_list()

#checking null columns
#df_train.isna().sum(),df_test.isna().sum()
#Columns with null values
null_columns=[ 'pain','abdomo_protein','rectal_exam_feces','abdomen','packed_cell_volume',
              'total_protein','peristalsis' ,'abdominal_distention','nasogastric_tube',
              'nasogastric_reflux','nasogastric_reflux_ph','rectal_temp','pulse','respiratory_rate',
              'temp_of_extremities','peripheral_pulse','mucous_membrane','capillary_refill_time']

In [None]:
#plot_hist(df_train,df_test,num_cols,2)
#A few columns look roughly normally distributed; lesion_3 should be removed.

In [None]:
#plot_map(df_train,cat_cols,2)

In [None]:
#Checking whether the dataset is balanced; it is not.
# size() counts the rows per target class.
pie=df_train.groupby(target_col).size()
# One explode offset per class, so the call works for any number of classes.
plt.pie(pie.values, labels=pie.index,colors=sns.color_palette('bright'),explode=[0.01]*len(pie),autopct='%.0f%%')

In [None]:
#Identifying correlation between numerical features.
#plot_heatmap(df_train,title='Train')

In [None]:
#Data preprocessing: split features from the target and renumber rows from 0
X_train = df_train.drop(columns=[target_col]).reset_index(drop=True)
y_train = df_train[[target_col]].reset_index(drop=True)
X_test = df_test.copy().reset_index(drop=True)

#imputation (disabled)
# X_train=df_Impute(X_train)
# X_test=df_Impute(X_test)

#category_encoding: features become integer codes, the target becomes 0/1/2
X_train, X_test, cat_cols = cat_encoder(X_train, X_test, cat_cols)
y_train['outcome'] = y_train['outcome'].map({'died':0,'euthanized':1,'lived':2})

#Drop columns
drop_cols = ['lesion_3','lesion_2']
X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)

#Remaining nulls are replaced with 0. The scaler is only created here;
#it is not applied to the data in this cell.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
scale = preprocessing.StandardScaler()

In [None]:
# Add the engineered columns to the train frame first, then the test frame.
X_train, X_test = (feature_eng(frame) for frame in (X_train, X_test))

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import regularizers

from tensorflow import keras
# Stop after 15 epochs without an improvement of at least 0.001 and roll the
# model back to its best weights.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=15,
    restore_best_weights=True,
)

# Lower the learning rate gradually as training progresses, starting at 0.001.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=X_train.shape[0]*20,
    decay_rate=1,
    staircase=False,
)

def tf_model(n_features=25):
    """Build and compile the dense softmax classifier.

    Uses the module-level ``keras`` import and ``lr_schedule``.

    Args:
        n_features: Width of the input layer. Defaults to 25, the value that
            was previously hard-coded.

    Returns:
        A compiled ``keras.Sequential`` model with three softmax outputs,
        trained with categorical cross-entropy (one-hot targets).
    """
    # Only `keras` is imported in this file; the bare `tf` name is undefined.
    model_tf = keras.Sequential([
        keras.layers.Input(shape=[n_features]),
        keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.003)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.003)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(3, activation='softmax'),
    ])

    model_tf.compile(optimizer=keras.optimizers.Adam(lr_schedule),
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
    return model_tf
    

In [None]:
# 'objective': 'multiclass',
#          'metric': 'multi_logloss',
#          'num_class': 3,
#          'feature_pre_filter': False, 
#          'lambda_l1': 2.2101952327384032e-07, 
#          'lambda_l2': 1.2761437402234661e-08, 
#          'num_leaves': 248, 
#          'feature_fraction': 0.7200000000000001, 
#          'bagging_fraction': 1.0, 
#          'bagging_freq': 0,
#          'min_child_samples': 20, 
#          'n_iter': 100,
#          'verbose':0,

In [None]:
class MultiClass:
    """Container for the set of unfitted classifiers used in the ensemble.

    Attributes are set in ``__init__``:
        models: dict mapping a short name to an estimator instance.
        models_name: list of the names, in dict order.
        len_models: number of estimators.
    """

    def __init__(self):
        # Build the estimators once and derive the names from that same
        # dict, rather than constructing every model a second time.
        self.models = self._define_model()
        self.models_name = list(self.models)
        self.len_models = len(self.models)

    def _define_model(self):
        """Return a fresh ``{name: estimator}`` dict with fixed hyper-parameters."""
        lgb_params = {
            'objective': 'multiclass',
            'metric': 'auc_mu',
            'num_class': 3,
            'feature_pre_filter': False,
            'lambda_l1': 1.0309196948445505e-05,
            'lambda_l2': 0.034674458424846455,
            'num_leaves': 52,
            'feature_fraction': 0.8480000000000001,
            'bagging_fraction': 0.9622932514137276,
            'bagging_freq': 6,
            'min_child_samples': 20,
            'num_iterations': 50,
            'early_stopping_round': None,
        }

        xgb_params = {
            'n_estimators': 5000,
            'n_jobs': -1,
            'max_depth': 5,
            'eta': 0.2,
            'colsample_bytree': 0.8,
            'objective': 'multi:softprob',
            'num_class': 3,
            'alpha': 8e-07,
            'lambda': 0.0012,
            'early_stopping_rounds': 10,
            'verbose': 10,
        }

        cat_params = {
            'n_estimators': 5000,
            'depth': 8,
            'leaf_estimation_method': 'Gradient',
            'loss_function': 'MultiClass',
            'objective': 'MultiClass',
            'learning_rate': 0.1,
            'verbose': 0,
            'l2_leaf_reg': 0.0015,
            "eval_metric": 'TotalF1',
        }

        models = {
            'xgb': XGBClassifier(**xgb_params),
            'cat': CatBoostClassifier(**cat_params),
            'lgbm': LGBMClassifier(**lgb_params),
            #'tf_model':tf_model()
        }

        return models

In [None]:
kfold = True
n_splits = 10
random_state_list = [42] 
# This instance is used only for the model names that key the score table.
multiclass = MultiClass()
splitter = Splitter(kfold='skf', n_splits=n_splits)
score_dict = {name: [] for name in multiclass.models_name}
unique_targets = np.unique(np.arange(1,30))  # not referenced in this cell
final=[]

# Estimators whose fit() is given the validation fold as an eval_set.
eval_set_models = ('lgbm', 'xgb', 'cat')

for X_train_, X_val, y_train_, y_val in splitter.split_data(X_train, y_train, random_state_list=random_state_list):
    # Fresh, unfitted estimators for every fold.
    multi=MultiClass()
    test_preds = []

    for name, model in multi.models.items():
        if name =='tf_model':
            # The keras model is compiled with categorical cross-entropy and
            # needs one-hot targets. Only `keras` is imported in this file,
            # so the helpers are reached through it rather than `tf`.
            y_ = keras.utils.to_categorical(y_train_)
            _y = keras.utils.to_categorical(y_val)
            model.fit(X_train_,y_,validation_data=(X_val,_y),epochs=200,
                      callbacks=[early_stopping],batch_size=50)
            y_val_pred = np.argmax(model.predict(X_val),axis=1)
            test_pred = np.argmax(model.predict(X_test),axis=1)
        else:
            if name in eval_set_models:
                model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)])
            else:
                model.fit(X_train_, y_train_.values.ravel())
            y_val_pred = model.predict(X_val).reshape(-1)
            test_pred = model.predict(X_test).reshape(-1)

        score_dict[name].append(f1_score(y_val, y_val_pred,average='micro'))
        test_preds.append(test_pred)

    # Majority vote across the models trained on this fold.
    final.append(fold(test_preds,len(test_preds)))

# Majority vote across every fold that was run. Using len(final) rather than
# n_splits keeps folds from additional random states in the vote.
test_predss=fold(final,len(final))

In [None]:
# One row per evaluated fold and one column per model.
score_board=pd.DataFrame(score_dict)
# Label rows from the number of recorded folds, which exceeds n_splits when
# more than one random state is used.
score_board.index=[f'fold{i}' for i in range(len(score_board))]
score_board.loc['mean']=score_board.mean()
styled_df=(
    score_board.style
    .set_properties(**{'text-align': 'center'})
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_caption('Score_Sheet')
)
# Last expression of the cell, so the notebook renders the styled table.
styled_df.set_properties(**{'background-color': 'black',
                           'color': 'lawngreen',
                           'border-color': 'white'})

In [None]:
# Load the sample submission indexed by id and use it as the output frame.
sample_path = os.path.join(FILEPATH, 'sample_submission.csv')
df_submission = pd.read_csv(sample_path).set_index('id')
# Store the voted class codes, then translate them back to outcome labels.
df_submission['outcome'] = test_predss
df_submission['outcome'] = df_submission['outcome'].map({0:'died',1:'euthanized',2:'lived'})

In [None]:
# Write the predictions to disk; the 'id' index is saved as the first column.
df_submission.to_csv('submission_tmildxx.csv')