# Import Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from collections import Counter
from imblearn.over_sampling import SMOTE

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
# Load the dataset from a local CSV file and preview the first rows.
# NOTE(review): the path is machine-specific — adjust it before running elsewhere.
df = pd.read_csv("/mnt/hdd/Datasets/covtype.csv")
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Count missing cells over the whole frame and fully duplicated rows.
total_null = df.isnull().sum().sum()
total_duplicated = df.duplicated().sum()

print("Number of null values:", total_null)
print("Number of duplicated values:", total_duplicated)

# EDA

In [None]:
# Split the feature columns by dtype; the target column is excluded from both lists.
target = "Cover_Type"
numerical_columns = [
    col for col in df.columns
    if col != target and pd.api.types.is_numeric_dtype(df[col])
]
# An isinstance check against pd.CategoricalDtype replaces the deprecated
# pd.api.types.is_categorical_dtype helper.
# NOTE(review): a frame read straight from CSV has no categorical dtype columns
# unless they are cast explicitly, so this list is presumably empty here.
categorical_columns = [
    col for col in df.columns
    if col != target and isinstance(df[col].dtype, pd.CategoricalDtype)
]

print("Total Numerical Columns:", len(numerical_columns))
print("Total Categorical Columns:", len(categorical_columns))

# Target Value Distribution

In [None]:
def plot_count(df, col, title):
    """Show the distribution of ``df[col]`` as a donut chart and a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to summarise.
    col : str
        Name of the column whose value counts are plotted.
    title : str
        Figure-level title.

    The left axes holds two stacked pies (outer: class labels, inner:
    percentages) with a white disc in the middle; the right axes holds a
    horizontal bar chart annotated with the absolute counts.
    """
    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(18, 6))
    plt.subplots_adjust(wspace=0.2)

    values = df[col].value_counts()
    labels = values.index.tolist()
    N = len(values)

    outer_pie = values
    inner_pie = values / N

    # Outer ring: one labelled wedge per class, last wedge pulled out further.
    pie_ax.pie(
        outer_pie,
        labels=labels,
        startangle=90,
        frame=False,
        radius=1.3,
        explode=([0.05] * (N-1) + [0.3]),
        wedgeprops={"linewidth": 1, "edgecolor": "white"},
        textprops={"fontsize": 12, "weight": "bold"}
    )

    # Inner ring: same proportions, annotated with percentages.
    pie_ax.pie(
        inner_pie,
        radius=1,
        startangle=90,
        autopct="%1.f%%",
        explode=([0.1] * (N-1) + [0.3]),
        pctdistance=0.8,
        textprops={"size": 13, "weight": "bold", "color": "white"}
    )

    # White disc over the centre turns the pies into a donut.
    center_circle = plt.Circle((0, 0), 0.7, color="black", fc="white", linewidth=0)
    pie_ax.add_artist(center_circle)

    # Bind the bar chart to the right-hand axes explicitly instead of relying
    # on it being matplotlib's "current" axes.
    sns.barplot(x=values, y=labels, orient="horizontal", ax=bar_ax)

    # Write the absolute count at the end of every bar.
    for i, v in enumerate(values):
        bar_ax.text(v, i+0.1, str(v), color="black", fontweight="bold", fontsize=13)

    plt.setp(bar_ax.get_yticklabels(), fontweight="bold")
    plt.setp(bar_ax.get_xticklabels(), fontweight="bold")
    # Horizontal bars: counts run along x, the categories of `col` along y.
    bar_ax.set_xlabel("count", fontweight="bold", color="black")
    bar_ax.set_ylabel(col, fontweight="bold", color="black")

    fig.suptitle(f"{title}", fontsize=18, fontweight="bold")
    plt.tight_layout()
    plt.show()

In [None]:
# Donut + bar chart of the class frequencies of the target column.
plot_count(df=df, col="Cover_Type", title="Target Value Distribution")

# Numerical Columns

In [None]:
def plot_num(df, columns, label):
    """Plot a per-class histogram (with KDE) for every column in ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    columns : sequence of str
        Columns to plot, one subplot each, laid out on a two-column grid.
    label : str
        Column used as ``hue`` to split every histogram by class.

    The subplots are drawn into the current matplotlib figure, so the caller
    is expected to create a suitably sized figure first. A dashed red line
    marks the column mean and a solid green line the median.
    """
    # Ceil division: exactly enough rows for a two-column grid, without an
    # empty trailing row when the number of columns is even.
    n_rows = (len(columns) + 1) // 2

    for i, column in enumerate(columns):
        plt.subplot(n_rows, 2, i+1)
        sns.histplot(x=column, hue=label, data=df, bins=30, kde=True)
        plt.axvline(df[column].mean(), color="r", linestyle="--", label="Mean")
        plt.axvline(df[column].median(), color="g", linestyle="-", label="Median")
        plt.grid()
        plt.title(f"{column} Distribution")

    # Lay the figure out once, after all subplots exist, instead of
    # recomputing the layout on every iteration.
    plt.tight_layout()
    plt.show()

In [None]:
# One tall figure; its height grows with the number of numerical columns.
plt.figure(figsize=(30, 2.5 * len(numerical_columns)))
plot_num(df=df, columns=numerical_columns, label="Cover_Type")

# Outlier Detection using Tukey's IQR

In [None]:
def custom_boxplots(df, columns, rows, cols, title):
    """Draw one horizontal boxplot per column on a ``rows`` x ``cols`` grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    columns : sequence of str
        Columns to plot; the grid must offer at least ``len(columns)`` cells.
    rows, cols : int
        Grid dimensions.
    title : str
        Figure-level title.

    Every subplot title also reports the skewness of its column, rounded to
    two decimals.
    """
    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for single-row, single-column and 1x1 grids.
    fig, ax = plt.subplots(
        rows, cols, sharey=True, squeeze=False, figsize=(30, len(columns) * 2.5)
    )
    fig.suptitle(title, y=1, size=25)
    ax = ax.flatten()

    for i, column in enumerate(columns):
        sns.boxplot(data=df[column], orient="h", ax=ax[i])
        skewness = round(df[column].skew(axis=0, skipna=True), 2)
        ax[i].set_title(f"{column}, skewness is: {skewness}")

    # Hide the grid cells that received no boxplot instead of showing empty frames.
    for unused in ax[len(columns):]:
        unused.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Four boxplots per row; the row count always leaves room for every column.
custom_boxplots(
    df=df,
    columns=numerical_columns,
    rows=len(numerical_columns) // 4 + 1,
    cols=4,
    title="Boxplots for each variable",
)

In [None]:
def IQR(df, n, columns):
    """Find rows that are Tukey outliers in more than ``n`` of the given columns.

    A value is an outlier for its column when it lies below ``Q1 - 1.5 * IQR``
    or above ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    n : int
        A row is reported only if it is an outlier in strictly more than
        ``n`` columns.
    columns : iterable of str
        Columns to inspect.

    Returns
    -------
    list
        Index labels of the rows flagged in more than ``n`` columns, in order
        of first detection.

    Also prints the total number of outlier values found across *all*
    inspected columns (previously only the last column was counted).
    """
    outlier_counts = Counter()
    total_outliers = 0

    for column in columns:
        q1 = np.percentile(df[column], 25)
        q3 = np.percentile(df[column], 75)
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (df[column] < q1 - outlier_step) | (df[column] > q3 + outlier_step)
        # Select the labels straight from the index; no need to copy the rows.
        outlier_index = df.index[is_outlier.to_numpy()]

        outlier_counts.update(outlier_index)
        total_outliers += len(outlier_index)

    multiple_outliers = [idx for idx, count in outlier_counts.items() if count > n]

    print("Total number of outliers is", total_outliers)
    return multiple_outliers

In [None]:
iqr_outliers = IQR(df, 1, numerical_columns)

In [None]:
df = df.drop(iqr_outliers, axis=0).reset_index(drop=True)

In [None]:
print("Shape:", df.shape)

# Multicollinearity

In [None]:
# Independent copy of the cleaned frame for the correlation analysis.
df_corr = df.copy(deep=True)

In [None]:
def plot_correlation_table(df):
    """Render the pairwise correlations of ``df`` as an annotated heatmap.

    Only the strict lower triangle is drawn: the upper triangle and the
    diagonal are masked out.
    """
    correlations = df.corr()
    # Non-zero mask entries hide a cell; triu keeps the diagonal and above.
    hidden_cells = np.triu(np.ones_like(correlations))

    fig, axes = plt.subplots(figsize=(20, 20))
    sns.heatmap(
        correlations,
        mask=hidden_cells,
        linewidth=0.6,
        annot=True,
        robust=True,
        center=0,
        square=True,
    )
    plt.title("Correlation Table")
    plt.show()

In [None]:
# Heatmap of the pairwise correlations in the cleaned data.
plot_correlation_table(df=df_corr)

# Imbalance Check

In [None]:
# Class frequencies of the target, with the count written above each bar.
plt.figure(figsize=(12, 5))
ax = sns.countplot(x="Cover_Type", data=df)
for container in ax.containers:
    ax.bar_label(container)
plt.show()

# Train/Test Split, SMOTE Resampling and Feature Scaling

In [None]:
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
X = df.drop("Cover_Type", axis=1)
y = df["Cover_Type"]

In [None]:
#_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print("Before SMOTE:", Counter(y))

In [None]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
print("After SMOTE:", Counter(y_train_resampled))

In [None]:
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train_resampled)
X_test_scaled = mms.transform(X_test)

# Train

In [None]:
# Shuffled 5-fold splitter shared by every model; seeded like the other
# random steps in this notebook so that fold assignments are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def cross_validation(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` using the module-level ``kf`` splitter.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    X : array-like of shape (n_samples, n_features)
        Feature matrix. pandas objects are converted to NumPy arrays so that
        the fold indices are applied positionally.
    y : array-like of shape (n_samples,)
        Target labels; converted to a NumPy array for the same reason.

    Returns
    -------
    model_log : pandas.DataFrame
        One row per fold with the columns ``Accuracy``, ``F1``, ``Precision``
        and ``Recall`` (the last three are weighted averages).
    cm : numpy.ndarray
        Confusion matrix summed over all folds, with rows/columns ordered by
        the sorted unique labels of ``y``. Because every sample is predicted
        exactly once, this covers the whole data set rather than only the
        last fold.
    """
    # kf.split yields positions; indexing a pandas object with them would be
    # label-based (Series) or fail outright (DataFrame).
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.to_numpy()
    y = np.asarray(y)

    # A fixed label order keeps the per-fold matrices the same shape even if
    # a class happens to be missing from one fold.
    labels = np.unique(y)
    cm = np.zeros((len(labels), len(labels)), dtype=int)

    fold_scores = []
    for fold, (train_, test_) in enumerate(kf.split(X=X, y=y)):
        xtrain = X[train_, :]
        xtest = X[test_, :]
        ytrain = y[train_]
        ytest = y[test_]

        model.fit(xtrain, ytrain)

        y_pred = model.predict(xtest)
        print(f"The fold is: {fold}")

        cm += confusion_matrix(ytest, y_pred, labels=labels)

        fold_scores.append({
            "Accuracy": accuracy_score(ytest, y_pred),
            "F1": f1_score(ytest, y_pred, average="weighted"),
            "Precision": precision_score(ytest, y_pred, average="weighted"),
            "Recall": recall_score(ytest, y_pred, average="weighted"),
        })

    # Building the frame in one go yields float columns instead of object ones.
    model_log = pd.DataFrame(fold_scores, columns=["Accuracy", "F1", "Precision", "Recall"])
    return model_log, cm

In [None]:
dt = DecisionTreeClassifier()
dt_log, dt_cm = cross_validation(dt, X_train_scaled, y_train_resampled)
plot_confusion_matrix(conf_mat=dt_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

In [None]:
rf = RandomForestClassifier()
rf_log, rf_cm = cross_validation(rf, X_train_scaled, y_train_resampled)
plot_confusion_matrix(conf_mat=rf_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

In [None]:
ada = AdaBoostClassifier()
ada_log, ada_cm = cross_validation(ada, X_train_scaled, y_train_resampled)
plot_confusion_matrix(conf_mat=ada_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

In [None]:
svc = SVC()
svc_log, svc_cm = cross_validation(svc, X_train_scaled, y_train_resampled)
plot_confusion_matrix(conf_mat=svc_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

In [None]:
lsvc = LinearSVC()
lsvc_log, lsvc_cm = cross_validation(lsvc, X_train_scaled, y_train_resampled)
plot_confusion_matrix(conf_mat=lsvc_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

In [None]:
logreg = LogisticRegression()
logreg_log, logreg_cm = cross_validation(logreg, X_train_scaled, y_train_resampled)
plot_confusion_matrix(conf_mat=logreg_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

In [None]:
sgdc = SGDClassifier()
sgdc_log, sgdc_cm = cross_validation(sgdc, X_train_scaled, y_train_resampled)
plot_confusion_matrix(conf_mat=sgdc_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

# Results

In [None]:
# Mean cross-validation score of every trained model, one row per model.
# Pairing each display name with its log in a single mapping keeps the
# "Model" column and the metric columns the same length and in the same order.
model_logs = {
    "DT": dt_log,
    "RF": rf_log,
    "AdaBoost": ada_log,
    "SVC": svc_log,
    "LinearSVC": lsvc_log,
    "LogReg": logreg_log,
    "SGD": sgdc_log,
}

result_df = pd.DataFrame({
    "Model": list(model_logs),
    "Accuracy": [log["Accuracy"].mean() for log in model_logs.values()],
    "F1": [log["F1"].mean() for log in model_logs.values()],
    "Precision": [log["Precision"].mean() for log in model_logs.values()],
    "Recall": [log["Recall"].mean() for log in model_logs.values()],
})

result_df.head(7)

In [None]:
result_df.sort_values(by="Accuracy", ascending=False)[["Model", "Accuracy"]]

In [None]:
result_df.sort_values(by="F1", ascending=False)[["Model", "F1"]]

In [None]:
result_df.sort_values(by="Precision", ascending=False)[["Model", "Precision"]]

In [None]:
result_df.sort_values(by="Recall", ascending=False)[["Model", "Recall"]]

In [None]:
# 2x2 grid of bar charts, one metric per panel, each bar labelled with its value.
plt.figure(figsize=(15, 15))

for position, metric in enumerate(["Accuracy", "F1", "Precision", "Recall"], start=1):
    plt.subplot(2, 2, position)
    ax = sns.barplot(data=result_df, x="Model", y=metric)
    for container in ax.containers:
        ax.bar_label(container)
    plt.title(f"Model / {metric} Score")

plt.show()