<a href="https://www.kaggle.com/code/vaishakhraveendran/eda-season-3-episode-22?scriptVersionId=142743442" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
#Model Imports
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold

In [None]:
#Correlation plots
def plot_heatmap(df, title):
    """Plot the lower-triangle correlation heatmap of the numeric features.

    Relies on the module-level ``target_col`` and ``num_cols``.

    Args:
        df: DataFrame containing at least the columns listed in ``num_cols``.
            ``target_col`` is dropped first when it is present.
        title: Prefix used in the figure title.
    """
    # errors='ignore' keeps the helper usable on frames that do not carry
    # the target column.
    features = df.drop(columns=[target_col], errors='ignore')[num_cols]

    # Compute the correlation matrix once; it feeds both the mask and the plot.
    corr = features.astype(float).corr()

    # Mask the upper triangle (diagonal included): it mirrors the lower half.
    mask = np.zeros(corr.shape, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set the colormap and figure size
    colormap = plt.cm.RdBu_r
    plt.figure(figsize=(16, 16))

    # Set the title and font properties
    plt.title(f'{title} Correlation of Features', fontweight='bold', y=1.02, size=20)

    # Plot the heatmap with the masked upper triangle
    sns.heatmap(corr, linewidths=0.1, vmax=1.0, vmin=-1.0,
                square=True, cmap=colormap, linecolor='white', annot=True, annot_kws={"size": 14, "weight": "bold"},
                mask=mask)

In [None]:
#warning suppress
# Globally ignore every FutureWarning raised from this point on.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#Ensembling
from scipy import stats as st
def fold(fold_list,K):
    """Majority-vote ensemble over the first ``K`` prediction arrays.

    Args:
        fold_list: Indexable collection of equally long 1-D prediction
            arrays (one per model or per fold).
        K: Number of leading entries of ``fold_list`` to combine.

    Returns:
        1-D array holding, for every sample, the most frequent label among
        the ``K`` predictions (as computed by ``scipy.stats.mode``).
    """
    # np.column_stack needs a real sequence; a generator is rejected by
    # current NumPy releases.
    stack = np.column_stack([fold_list[i] for i in range(K)])
    mode = st.mode(stack, axis=1)
    # The shape returned by st.mode differs between SciPy versions
    # ((n, 1) vs (n,)); flatten so callers always get one label per sample.
    return np.ravel(mode[0])

In [None]:
class Splitter:
    """Yield train/validation splits for one or more random seeds.

    Args:
        kfold: ``True`` (or any truthy value) for a shuffled ``KFold``;
            the string ``'skf'`` for a shuffled ``StratifiedKFold``.
        n_splits: Number of folds per seed.
        cat_df: Labels to stratify on when ``kfold == 'skf'``. When omitted
            or empty, the target ``y`` passed to ``split_data`` is used.
    """

    def __init__(self, kfold=True, n_splits=5, cat_df=None):
        self.n_splits = n_splits
        self.kfold = kfold
        # Build the empty default per instance instead of sharing a single
        # DataFrame object created at function-definition time.
        self.cat_df = pd.DataFrame() if cat_df is None else cat_df

    def split_data(self, X, y, random_state_list):
        """Generate ``(X_train, X_val, y_train, y_val)`` for every fold and seed.

        Args:
            X: Feature frame, indexed positionally with ``.iloc``.
            y: Target frame/series, indexed positionally with ``.iloc``.
            random_state_list: Seeds; each seed produces ``n_splits`` folds.

        Raises:
            ValueError: If ``kfold`` is falsy.
        """
        if self.kfold == 'skf':
            splitter_cls = StratifiedKFold
            # Stratify on the supplied labels, falling back to the target
            # when none were given (an empty frame cannot be split on).
            split_target = self.cat_df if len(self.cat_df) else y
        elif self.kfold:
            splitter_cls = KFold
            split_target = y
        else:
            raise ValueError("Invalid kfold: must be True or 'skf'")

        for random_state in random_state_list:
            kf = splitter_cls(n_splits=self.n_splits, random_state=random_state, shuffle=True)
            for train_index, val_index in kf.split(X, split_target):
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                yield X_train, X_val, y_train, y_val

In [None]:
#Category encoder
from category_encoders import OrdinalEncoder
def cat_encoder(X_train, X_test, cat_cols):
    """Ordinal-encode ``cat_cols`` in both frames, fitting on the train frame.

    The encoded integer columns are written back into ``X_train`` and
    ``X_test`` in place.

    Returns:
        Tuple ``(X_train, X_test, cat_cols)``.
    """
    ordinal = OrdinalEncoder(cols=cat_cols, handle_missing='ignore')

    # Fit on the training columns only, then reuse the mapping for test.
    encoded_train = ordinal.fit_transform(X_train[cat_cols]).astype(int)
    encoded_test = ordinal.transform(X_test[cat_cols]).astype(int)

    X_train[cat_cols] = encoded_train[cat_cols]
    X_test[cat_cols] = encoded_test[cat_cols]
    return X_train, X_test, cat_cols

In [None]:
def plot_map(df_train,catCols,n_cols=2):
    """Draw one target-vs-category count heatmap per categorical column.

    Relies on the module-level ``target_col``.

    Args:
        df_train: Training frame containing ``catCols`` and ``target_col``.
        catCols: Names of the categorical columns to plot.
        n_cols: Number of subplot columns in the grid.
    """
    if len(catCols) == 0:
        return

    # Ceiling division so an odd number of columns still gets enough axes.
    n_rows = (len(catCols) + n_cols - 1) // n_cols
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
    ax = axes.flatten()
    for i, col in enumerate(catCols):
        if col != target_col:
            sns.heatmap(data = pd.crosstab(df_train[col], df_train[target_col]),
                    annot = True, fmt = '.0f', ax = ax[i])
            ax[i].set_title(f'{col} Distribution (Train)')

In [None]:
def plot_hist(df_train,df_test,numCols,n_cols=2):
    """Overlay train and test histograms for every numeric column.

    The ``'hospital_number'`` column is skipped.

    Args:
        df_train: Training frame.
        df_test: Test frame.
        numCols: Names of the numeric columns; the list is not modified.
        n_cols: Number of subplot columns in the grid.
    """
    # Filter into a new list instead of calling numCols.remove(): removing in
    # place would alter the caller's list and fail on a second call.
    plot_cols = [col for col in numCols if col != 'hospital_number']
    if len(plot_cols) == 0:
        return

    # Ceiling division so an odd number of columns still gets enough axes.
    n_rows = (len(plot_cols) + n_cols - 1) // n_cols
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
    ax = axes.flatten()
    for i, col in enumerate(plot_cols):
        sns.histplot(df_train[col], ax=ax[i], kde=True)
        sns.histplot(df_test[col], ax=ax[i], kde=True)
        ax[i].set_title(f'{col} Distribution (Train v/s Test)')
            

In [None]:
FILEPATH ='/kaggle/input/playground-series-s3e22'
target_col='outcome'

#Reading the train and test sets
df_train=pd.read_csv(os.path.join(FILEPATH,'train.csv')).set_index('id')
df_test=pd.read_csv(os.path.join(FILEPATH,'test.csv')).set_index('id')

#Defining categorical and numerical columns
# Exclude the target by name rather than by position, so cat_cols stays
# correct even if the target is not the last object-typed column.
cat_cols=[col for col in df_train.select_dtypes(include='object').columns if col != target_col]
num_cols=df_train.select_dtypes(include=['int64','float64']).columns.to_list()

#checking null columns
#df_train.isna().sum(),df_test.isna().sum()

In [None]:
#plot_hist(df_train,df_test,num_cols,2)
#A few columns show a roughly normal distribution, but lesion_3 should be removed.

In [None]:
#plot_map(df_train,cat_cols,2)

In [None]:
#Checking whether the dataset is balanced (it is not).
# Number of rows per target class, ordered by class label.
pie=df_train.groupby(target_col).size()
# One explode offset per class, so the call does not depend on the class count.
plt.pie(pie.values, labels=pie.index,colors=sns.color_palette('bright'),explode=[0.01]*len(pie),autopct='%.0f%%')

In [None]:
#Identifying correlation between numerical features.
#plot_heatmap(df_train,title='Train')

In [None]:
#Data preprocessing
# Separate features from the target and replace the 'id' index with a plain
# 0..n-1 range in all three frames.
X_train = df_train.drop(columns=[target_col]).reset_index(drop=True)
y_train = df_train[[target_col]].reset_index(drop=True)
X_test = df_test.reset_index(drop=True)

In [None]:
#category_encoding
X_train, X_test, cat_cols = cat_encoder(X_train, X_test, cat_cols)

# Integer code for each outcome label (the submission cell maps them back).
outcome_codes = {'died':0,'euthanized':1,'lived':2}
y_train['outcome'] = y_train['outcome'].map(outcome_codes)

In [None]:
#Drop columns
drop_cols = ['hospital_number','lesion_3']
# Remove the same columns from both frames, in place.
for frame in (X_train, X_test):
    frame.drop(columns=drop_cols, inplace=True)

In [None]:
#imputations


In [None]:
class MultiClass:
    """Container for the three classifiers used in the ensemble.

    Attributes set in ``__init__``:
        models: dict mapping a short name ('xgb', 'cat', 'lgbm') to an
            unfitted estimator.
        models_name: list of the keys of ``models``.
        len_models: number of estimators.
    """

    def __init__(self):
        self.models = self._define_model()
        # Read the names from the dict just built rather than calling
        # _define_model() again, which would construct every estimator twice.
        self.models_name = list(self.models)
        self.len_models = len(self.models)

    def _define_model(self):
        """Return a dict of freshly constructed, unfitted estimators."""

        lgb_params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 3,
        'lambda_l1': 5e-07,
        'lambda_l2': 0.001,
        'num_leaves': 32,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.9,
        'min_child_samples': 45,
        'num_iterations': 5000,
        'learning_rate': 0.2
        }

        xgb_params = {
        'n_estimators': 5000,
        'n_jobs': -1,
        'max_depth': 5,
        'eta': 0.2,
        'colsample_bytree': 0.8,
        'objective': 'multi:softprob',
        'num_class': 3,
        'alpha': 8e-07,
        'lambda': 0.0012
        }

        cat_params = {
        'n_estimators': 5000,
        'depth': 5,
        'learning_rate': 0.2,
        'verbose': 0,
        'l2_leaf_reg': 0.0015
        }

        models = {
            'xgb': XGBClassifier(**xgb_params),
            'cat': CatBoostClassifier(**cat_params),
            'lgbm':   LGBMClassifier(**lgb_params)
        }

        return models

In [None]:
# Cross-validated training: every fold fits all models, the models vote on the
# test labels per fold, and the folds then vote on the final test labels.
kfold = True
n_splits = 5
random_state_list = [42]
multiclass = MultiClass()
splitter = Splitter(kfold=kfold, n_splits=n_splits)
# Validation scores per model name, one entry per fold.
score_dict = {name: [] for name in multiclass.models_name}
# NOTE(review): unique_targets is not referenced anywhere in the visible code.
unique_targets = np.unique(np.arange(1,30))
test_predss = np.zeros((X_test.shape[0]))
final=[]
for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(X_train, y_train, random_state_list=random_state_list)):
    # Fresh, unfitted estimators for every fold.
    multi=MultiClass()
    models=multi.models
    test_preds = []
    oof_preds = []

    for name, model in models.items():
        model.fit(X_train_, y_train_)
        # reshape(-1) flattens predictions that come back as a column vector.
        y_val_pred = model.predict(X_val).reshape(-1)
        test_pred = model.predict(X_test).reshape(-1)

        score_dict[name].append(f1_score(y_val, y_val_pred, average='micro'))

        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)

    # Majority vote across the models of this fold.
    final.append(fold(test_preds,multi.len_models))

# Vote across every fold collected, not just the first n_splits, so that extra
# seeds in random_state_list are not silently ignored.
test_predss=fold(final,len(final))

In [None]:
# Use the sample submission (indexed by 'id') as the template, fill in the
# predicted class codes and translate them back to outcome labels.
df_submission=pd.read_csv(os.path.join(FILEPATH,'sample_submission.csv')).set_index('id')
code_to_outcome = {0:'died',1:'euthanized',2:'lived'}
df_submission['outcome'] = test_predss
df_submission['outcome'] = df_submission['outcome'].map(code_to_outcome)

In [None]:
# Write the submission; the 'id' index is written as the first column.
df_submission.to_csv('submission_base.csv', index=True)