In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import matthews_corrcoef, log_loss
import warnings
warnings.filterwarnings("ignore")

def evaluate(model_name):

    data_dir = "."
    random_states = [7, 42, 259, 2007, 20254]

    metric_dict = {
        "Seed": [],
        "Train Loss": [],
        "Train MCC": [],
        "Validation Loss": [],
        "Validation MCC": [],
    }

    for random_state in random_states:
        metric_dict["Seed"].append(random_state)

        train_df = pd.read_csv(f"{data_dir}/train.csv")

        # Encoding the target variable
        y_train = train_df["Target"].astype('category').cat.codes.to_numpy()

        # Dropping the Target columns from the features
        train_feats = train_df.drop(columns=["Target"])

        # --------------------------------------------------------

        num_cols = train_feats.select_dtypes(include=["float64", "int64"]).columns.tolist()
        num_cols.remove("ID")

        # STANDARD SCALING
        standard_scaler_cols = ["Zeta"]
        standard_scaler = StandardScaler()

        ## Scaling numerical features in the training set
        scaled_train_feat = standard_scaler.fit_transform(train_feats[standard_scaler_cols])
        scaled_train_df = pd.DataFrame(
            scaled_train_feat,
            columns=standard_scaler.get_feature_names_out(standard_scaler_cols)
        )
        train_feats = pd.concat([train_feats.drop(columns=standard_scaler_cols), scaled_train_df], axis=1)


        # MIN-MAX SCALING
        minmax_scaler_cols = list(set(num_cols) - set(standard_scaler_cols))
        minmax_scaler = MinMaxScaler()

        ## Scaling numerical features in the training set
        scaled_train_feat = minmax_scaler.fit_transform(train_feats[minmax_scaler_cols])
        scaled_train_df = pd.DataFrame(
            scaled_train_feat,
            columns=minmax_scaler.get_feature_names_out(minmax_scaler_cols)
        )
        train_feats = pd.concat([train_feats.drop(columns=minmax_scaler_cols), scaled_train_df], axis=1)

        # --------------------------------------------------------

        cat_cols = train_feats.select_dtypes(include=["object"]).columns.tolist()
        label_encoding_cols = ["Surface_Charge"]
        # label_encoding_cols = []
        one_hot_encoding_cols = list(set(cat_cols) - set(label_encoding_cols))

        # ONE-HOT ENCODING
        one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')

        ## Encoding the categorical columns in the training set
        encoded_train_features = one_hot_encoder.fit_transform(train_feats[one_hot_encoding_cols])
        encoded_df = pd.DataFrame(
            encoded_train_features,
            columns=one_hot_encoder.get_feature_names_out(one_hot_encoding_cols)
        )
        train_feats = pd.concat([train_feats.drop(columns=one_hot_encoding_cols), encoded_df], axis=1)


        # LABEL ENCODING
        label_encoder = LabelEncoder()

        # Encoding the column
        train_feats["Surface_Charge"] = label_encoder.fit_transform(train_feats["Surface_Charge"])


        # TRAIN TEST SPLIT
        X_train = train_feats.drop(columns=["ID"]).to_numpy()
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=random_state)
        X_train.shape, y_train.shape, X_val.shape, y_val.shape

        # FITTING THE MODEL
        model_dict = {
            "Logistic Regression": LogisticRegression(random_state=random_state),
            "Decision Tree Classifier": DecisionTreeClassifier(random_state=random_state),
            "Support Vector Classifier": SVC(random_state=random_state, probability=True),
            "Random Forest Classifier": RandomForestClassifier(random_state=random_state),
            "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=random_state),
        }
        model = model_dict[model_name]
        model.fit(X_train, y_train)

        train_loss = log_loss(y_train, model.predict_proba(X_train))
        metric_dict["Train Loss"].append(round(train_loss, 3))
        train_mcc = matthews_corrcoef(y_train, model.predict(X_train))
        metric_dict["Train MCC"].append(round(train_mcc, 3))
        val_loss = log_loss(y_val, model.predict_proba(X_val))
        metric_dict["Validation Loss"].append(round(val_loss, 3))
        val_mcc = matthews_corrcoef(y_val, model.predict(X_val))
        metric_dict["Validation MCC"].append(round(val_mcc, 3))

    metric_dict["Seed"].append("Avg.")
    metric_dict["Train Loss"].append(round(np.mean(metric_dict["Train Loss"][:-1]), 3))
    metric_dict["Train MCC"].append(round(np.mean(metric_dict["Train MCC"][:-1]), 3))
    metric_dict["Validation Loss"].append(round(np.mean(metric_dict["Validation Loss"][:-1]), 3))
    metric_dict["Validation MCC"].append(round(np.mean(metric_dict["Validation MCC"][:-1]), 3))

    return pd.DataFrame(metric_dict)


# model_name = "Logistic Regression"
model_name = "Decision Tree Classifier"
# model_name = "Support Vector Classifier"
# model_name = "Random Forest Classifier"
# model_name = "Gradient Boosting Classifier"
print(f"Model: {model_name.upper()}")
print("MinMax Scaler: Zeta")
metric_df = evaluate(model_name)
metric_df

Model: DECISION TREE CLASSIFIER
MinMax Scaler: Zeta


Unnamed: 0,Seed,Train Loss,Train MCC,Validation Loss,Validation MCC
0,7,0.0,1.0,5.67,0.686
1,42,0.001,0.999,5.67,0.685
2,259,0.001,0.999,7.29,0.598
3,2007,0.001,0.999,5.67,0.683
4,20254,0.001,0.999,7.087,0.607
5,Avg.,0.001,0.999,6.075,0.663
