In [1]:
# data source:
# https://archive.ics.uci.edu/dataset/73/mushroom

# the ucimlrepo package is required to load the dataset; uncomment the line below to install it
# !pip install ucimlrepo

In [2]:
%load_ext autoreload
%autoreload 2

# Imports of necessary libraries and transformers

In [None]:
import os
import sys

# Move up one directory at a time while the current working directory path
# still contains one of the known folder names (substring match on the whole
# path). Presumably this leaves the kernel at the repository root - confirm
# against the project layout.
while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research', 'projects')):
    previous_dir = os.getcwd()
    os.chdir("..")
    if os.getcwd() == previous_dir:
        # going up no longer changes the directory (filesystem root reached):
        # stop instead of looping forever
        break

# Register the relative 'src' entry only once, so re-running this cell does
# not keep growing sys.path with duplicate entries.
if 'src' not in sys.path:
    sys.path.append('src')

# show the resulting working directory as the cell output
os.getcwd()

In [4]:
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from src.custom_transformers import (
    DropColumnTransformer,
    CustomLabelEncoder,
    CustomOneHotEncoder,
)


In [5]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode selected DataFrame columns, one ``OrdinalEncoder`` per column.

    Parameters
    ----------
    columns : iterable
        Labels of the columns to encode.
    order : dict, optional
        Maps a column label to the list of its categories, passed to
        ``OrdinalEncoder(categories=[...])``. Columns missing from the mapping
        use a default-constructed ``OrdinalEncoder``. ``None`` is stored as an
        empty dict.

    Attributes
    ----------
    encoders : dict
        Column label -> the ``OrdinalEncoder`` fitted for that column
        (filled by ``fit``).
    """

    def __init__(self, columns, order = None):
        self.columns = columns
        self.order = {} if order is None else order
        self.encoders = {}

    def fit(self, X, y=None):
        """Create and fit one encoder per configured column; returns ``self``.

        ``y`` is accepted for pipeline compatibility and is not used.
        """
        for name in self.columns:
            # pass an explicit category list only when one was configured
            encoder_kwargs = {"categories": [self.order[name]]} if name in self.order else {}
            self.encoders[name] = OrdinalEncoder(**encoder_kwargs)
            # X[[name]] keeps the data two-dimensional (single-column frame)
            self.encoders[name].fit(X[[name]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured column replaced by its encoding.

        The input frame itself is left untouched.
        """
        encoded = X.copy()
        for name in self.columns:
            fitted_encoder = self.encoders[name]
            encoded[name] = fitted_encoder.transform(X[[name]])
        return encoded

# Loading the dataset

In [6]:
# id of the Mushroom dataset in the UCI Machine Learning Repository
# (https://archive.ics.uci.edu/dataset/73/mushroom)
mushroom_dataset_id = 73

In [7]:
# download the dataset identified by mushroom_dataset_id
mushroom = fetch_ucirepo(id=mushroom_dataset_id)

# join the feature columns and the target column(s) side by side into one frame
df = pd.concat((mushroom.data.features, mushroom.data.targets), axis="columns")

In [None]:
# preview the first rows of the combined features + targets frame
df.head()

# Data cleaning

In [None]:
# checking data dimensions: (number of rows, number of columns)
# (bare last expression is shown as the cell output, no print() needed)
df.shape

In [None]:
# checking column names and data types (one row per column with its pandas dtype)
df.dtypes

In [None]:
# checking null values: number of missing entries in each column
# (bare last expression is shown as the cell output, no print() needed)
df.isnull().sum()

In [None]:
# checking for duplicates: True if at least one row is an exact repeat of an earlier row
df.duplicated().any()

In [13]:
# cleaning stage: a single step that removes columns not used downstream
cleaning = make_pipeline(
    DropColumnTransformer(
        columns=[
            "stalk-root",  # removing it due to a large number of null values
            "veil-type",  # removing it due to results of categorical variability analysis
        ]
    )
)

In [14]:
# fit the cleaning pipeline on the raw frame and apply it in one step
df_cleaned = cleaning.fit_transform(df)

In [None]:
# preview the first rows after cleaning
df_cleaned.head()

# Data preprocessing

In [None]:
# Preprocessing stage: three encoders applied one after another to the cleaned frame.
preprocessing = make_pipeline(
    # Label-encode "bruises".
    # "veil-type" is intentionally NOT listed: the cleaning pipeline drops that
    # column, so it no longer exists in df_cleaned.
    # NOTE(review): the target column is not encoded by any step of this
    # pipeline - confirm whether it was meant to be label-encoded here.
    CustomLabelEncoder(columns=["bruises"]),

    # Ordinal-encode features that have a natural order; `order` lists each
    # column's categories from lowest to highest.
    CustomOrdinalEncoder(
        columns=["gill-size", "stalk-shape", "ring-number", "population"],
        order={
            "gill-size": ["n", "b"],  # narrow < broad
            "stalk-shape": ["t", "e"],  # tapering < enlarging
            "ring-number": ["n", "o", "t"],  # none < one < two
            "population": ["y", "v", "s", "n", "c", "a"]  # solitary < several < scattered < numerous < clustered < abundant
        }
    ),

    # One-hot encode the remaining (unordered) categorical features.
    CustomOneHotEncoder(columns=[
        "cap-shape", "cap-surface", "cap-color", "odor", "gill-attachment",
        "gill-spacing", "gill-color", "stalk-surface-above-ring",
        "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring",
        "veil-color", "ring-type", "spore-print-color", "habitat"
    ])
)
    

In [17]:
# fit the preprocessing pipeline on the cleaned frame and apply it in one step
df_preprocessed = preprocessing.fit_transform(df_cleaned)

In [None]:
# preview the first rows of the encoded frame
df_preprocessed.head()

In [19]:
# export the preprocessed data to a .csv file (uncomment the line below to write it)
# df_preprocessed.to_csv('projects/proj_1_team_1/mushrooms_preprocessed.csv')