In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [3]:
# Relative location of the dataset CSV
data_path = "../data/secondary_data.csv"

# Read the file, telling pandas that fields are split on ";" rather than ","
df = pd.read_csv(data_path, delimiter=";")

# Sanity check: first five rows alongside the (rows, columns) shape
(df.head(5), df.shape)



(  class  cap-diameter cap-shape cap-surface cap-color does-bruise-or-bleed  \
 0     p         15.26         x           g         o                    f   
 1     p         16.60         x           g         o                    f   
 2     p         14.07         x           g         o                    f   
 3     p         14.17         f           h         e                    f   
 4     p         14.64         x           h         o                    f   
 
   gill-attachment gill-spacing gill-color  stem-height  ...  stem-root  \
 0               e          NaN          w        16.95  ...          s   
 1               e          NaN          w        17.99  ...          s   
 2               e          NaN          w        17.80  ...          s   
 3               e          NaN          w        15.77  ...          s   
 4               e          NaN          w        16.53  ...          s   
 
   stem-surface stem-color veil-type veil-color has-ring ring-type  \
 0

In [4]:
# Name of the label column; every other column is used as a feature
target_col = "class"
X = df.drop(columns=target_col)
y = df[target_col]

# Label balance: absolute counts next to relative frequencies
(y.value_counts(), y.value_counts(normalize=True))


(class
 p    33888
 e    27181
 Name: count, dtype: int64,
 class
 p    0.554913
 e    0.445087
 Name: proportion, dtype: float64)

In [5]:
# Number of missing entries in each feature column
missing_counts = X.isnull().sum()

# The same information as a fraction of all rows, largest first
missing_ratio = missing_counts.div(len(X)).sort_values(ascending=False)

# Display only the features that actually contain missing values
(missing_counts.loc[missing_counts.gt(0)], missing_ratio.loc[missing_ratio.gt(0)])


(cap-surface          14120
 gill-attachment       9884
 gill-spacing         25063
 stem-root            51538
 stem-surface         38124
 veil-type            57892
 veil-color           53656
 ring-type             2471
 spore-print-color    54715
 dtype: int64,
 veil-type            0.947977
 spore-print-color    0.895954
 veil-color           0.878613
 stem-root            0.843931
 stem-surface         0.624277
 gill-spacing         0.410405
 cap-surface          0.231214
 gill-attachment      0.161850
 ring-type            0.040462
 dtype: float64)

In [6]:
# Feature type separation.
# Numeric features are selected positively by dtype and every remaining column
# is treated as categorical. Splitting on "object" instead would route any
# other non-numeric dtype (bool, category, datetime, a dedicated string dtype)
# into the numeric list, where median imputation and standard scaling are not
# appropriate. For a frame holding only object and float columns the two lists
# are the same as with the object-based split.
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

len(categorical_cols), len(numeric_cols), categorical_cols, numeric_cols


(17,
 3,
 ['cap-shape',
  'cap-surface',
  'cap-color',
  'does-bruise-or-bleed',
  'gill-attachment',
  'gill-spacing',
  'gill-color',
  'stem-root',
  'stem-surface',
  'stem-color',
  'veil-type',
  'veil-color',
  'has-ring',
  'ring-type',
  'spore-print-color',
  'habitat',
  'season'],
 ['cap-diameter', 'stem-height', 'stem-width'])

In [7]:
from sklearn.impute import SimpleImputer

# Categorical branch: fill gaps with the most common level of each column,
# then one-hot encode; levels not seen while fitting are ignored at transform.
# NOTE(review): the missing-value summary earlier in this notebook shows some
# categorical columns are mostly empty, so most-frequent imputation assigns
# the dominant observed level to the majority of rows there -- confirm this
# is intended.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: median-fill gaps, then standardise each column
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group through its own branch ("cat" first, then "num")
preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, categorical_cols),
    ("num", numeric_transformer, numeric_cols),
])

preprocessor


0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score

# Hold out 20% of the rows for evaluation; the split is stratified on `y`
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Reference model: always predicts the class seen most often in training
baseline_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", DummyClassifier(strategy="most_frequent")),
])

# Fit on the training partition, then predict the held-out rows
y_pred_baseline = baseline_model.fit(X_train, y_train).predict(X_test)

# Score the predictions; F1 treats "p" as the positive class
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_baseline)
baseline_f1 = f1_score(y_true=y_test, y_pred=y_pred_baseline, pos_label="p")

(baseline_accuracy, baseline_f1)


(0.5549369575896512, 0.7137742207245156)

In [10]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Logistic Regression pipeline.
# The preprocessor is cloned so this pipeline owns an independent, unfitted
# copy. A Pipeline fits its steps in place, so passing the very same
# `preprocessor` object that another pipeline already holds would refit that
# shared instance and silently replace the other pipeline's fitted state.
logreg_model = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", LogisticRegression(
            max_iter=1000,            # raised iteration cap for the solver
            class_weight="balanced",  # reweight classes inversely to frequency
            random_state=42
        ))
    ]
)

# Train on the training partition only
logreg_model.fit(X_train, y_train)

# Predict the held-out rows
y_pred_logreg = logreg_model.predict(X_test)

# Metrics; precision, recall and F1 treat "p" as the positive class
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_precision = precision_score(y_test, y_pred_logreg, pos_label="p")
logreg_recall = recall_score(y_test, y_pred_logreg, pos_label="p")
logreg_f1 = f1_score(y_test, y_pred_logreg, pos_label="p")

logreg_accuracy, logreg_precision, logreg_recall, logreg_f1



(0.8441133125921074,
 0.8831761006289308,
 0.8287105340808498,
 0.8550768762368701)