In [177]:
# This script trains a Random Forest model on the training data with 5-fold cross-validation
import sys
import os

# Add the src folder to the system path
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from clean import RareCategoryReplacer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, accuracy_score
from analyze import missing_values
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef

In [178]:
fold = 0
# Load the training data with folds
df = pd.read_csv("../input/train_folds.csv")

In [179]:
# lets take smaller subset of data for faster training
df = df.sample(frac=0.1, random_state=0)

In [180]:
# df['cap-surface_cap-shape'] = df['cap-surface'] + '_' + df['cap-shape']
# df['gill-attachment_gill-color'] = df['gill-attachment'] + '_' + df['gill-color']
# df['gill-spacing_gill-color'] = df['gill-spacing'] + '_' + df['gill-color']
# df['gill-color_veil-color'] = df['gill-color'] + '_' + df['veil-color']
# df['stem-root_stem-color'] = df['stem-root'] + '_' + df['stem-color']
# df['stem-surface_gill-color'] = df['stem-surface'] + '_' + df['gill-color']
# df['veil-type_cap-shape'] = df['veil-type'] + '_' + df['cap-shape']
# df['veil-color_gill-color'] = df['veil-color'] + '_' + df['gill-color']
# df['spore-print-color_gill-color'] = df['spore-print-color'] + '_' + df['gill-color']

In [181]:
df.drop(columns=['id'], inplace=True)

In [182]:
# Split the data into training and validation sets
train = df[df.kfold != fold].reset_index(drop=True)
valid = df[df.kfold == fold].reset_index(drop=True)


In [183]:
# Split the features and target
X_train = train.drop("class", axis=1)
X_valid = valid.drop("class", axis=1)
y_train = train["class"]
y_valid = valid["class"]

In [184]:
# Get the categorical and numerical columns
cat_cols =X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(exclude='object').columns

In [185]:
# Fill missing values in the numerical columns with the median value
imputer = SimpleImputer(strategy='median')
X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
X_valid[num_cols] = imputer.transform(X_valid[num_cols])

In [186]:
# # Fill missing values in the categorical columsn with the mode value
# imputer = SimpleImputer(strategy='most_frequent')
# X_train[cat_cols] = imputer.fit_transform(X_train[cat_cols])
# X_valid[cat_cols] = imputer.transform(X_valid[cat_cols])

In [187]:
replacer = RareCategoryReplacer(columns=cat_cols, proportion_threshold=0.01)
X_train = replacer.fit_transform(X_train)
X_valid = replacer.transform(X_valid)

In [188]:
X_train['cap-surface_cap-shape'] = X_train['cap-surface'] + '_' + X_train['cap-shape']
X_train['gill-attachment_gill-color'] = X_train['gill-attachment'] + '_' + X_train['gill-color']
X_train['gill-spacing_gill-color'] = X_train['gill-spacing'] + '_' + X_train['gill-color']
X_train['gill-color_veil-color'] = X_train['gill-color'] + '_' + X_train['veil-color']
X_train['stem-root_stem-color'] = X_train['stem-root'] + '_' + X_train['stem-color']
X_train['stem-surface_gill-color'] = X_train['stem-surface'] + '_' + X_train['gill-color']
X_train['veil-type_cap-shape'] = X_train['veil-type'] + '_' + X_train['cap-shape']
X_train['veil-color_gill-color'] = X_train['veil-color'] + '_' + X_train['gill-color']
X_train['spore-print-color_gill-color'] = X_train['spore-print-color'] + '_' + X_train['gill-color']


X_valid['cap-surface_cap-shape'] = X_valid['cap-surface'] + '_' + X_valid['cap-shape']
X_valid['gill-attachment_gill-color'] = X_valid['gill-attachment'] + '_' + X_valid['gill-color']
X_valid['gill-spacing_gill-color'] = X_valid['gill-spacing'] + '_' + X_valid['gill-color']
X_valid['gill-color_veil-color'] = X_valid['gill-color'] + '_' + X_valid['veil-color']
X_valid['stem-root_stem-color'] = X_valid['stem-root'] + '_' + X_valid['stem-color']
X_valid['stem-surface_gill-color'] = X_valid['stem-surface'] + '_' + X_valid['gill-color']
X_valid['veil-type_cap-shape'] = X_valid['veil-type'] + '_' + X_valid['cap-shape']
X_valid['veil-color_gill-color'] = X_valid['veil-color'] + '_' + X_valid['gill-color']
X_valid['spore-print-color_gill-color'] = X_valid['spore-print-color'] + '_' + X_valid['gill-color']

In [189]:
# Get the categorical and numerical columns
cat_cols =X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(exclude='object').columns

In [190]:
# Fill missing values in the categorical columsn with the mode value
imputer = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = imputer.fit_transform(X_train[cat_cols])
X_valid[cat_cols] = imputer.transform(X_valid[cat_cols])

### One Hot Encoding

In [191]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[cat_cols])
X_valid_encoded = encoder.transform(X_valid[cat_cols])

# Create a DataFrame with the encoded columns
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(cat_cols))
X_valid_encoded = pd.DataFrame(X_valid_encoded, columns=encoder.get_feature_names_out(cat_cols))

# Drop the original categorical columns from the training and validation sets
X_train = X_train.drop(cat_cols, axis=1)
X_valid = X_valid.drop(cat_cols, axis=1)

# Concatenate the numerical and encoded categorical columns
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_valid = pd.concat([X_valid, X_valid_encoded], axis=1)

### Target Encoding

In [192]:
# from category_encoders import TargetEncoder

# target_mapping = {"p": 0, "e": 1}
# y_train_num = y_train.map(target_mapping)

# # Create an instance of the TargetEncoder
# encoder = TargetEncoder(cols=cat_cols)

# # Fit the encoder on the training data and transform the training and validation data
# X_train = encoder.fit_transform(X_train, y_train_num)
# X_valid = encoder.transform(X_valid)

In [193]:
# Scale the numerical columns using the StandardScaler
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])

In [194]:
# Train a Random Forest model
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)

In [195]:
# Calculate the Matthews correlation coefficient
mcc = matthews_corrcoef(y_valid, y_pred)
print(f"Fold={fold}, MCC={mcc}")
# Calculate the accuracy
print(f"Fold={fold}, Accuracy={accuracy_score(y_valid, y_pred)}")

Fold=0, MCC=0.9829532657974156
Fold=0, Accuracy=0.9915545422433448


**Target Encoding**:    
Fold=0, MCC=0.983051653860605   
Fold=0, Accuracy=0.991597541823805  

**One Hot Encoding**:   
Fold=0, MCC=0.9834986856205626  
Fold=0, Accuracy=0.9918189124286761 

**Grid Searched parameters**:   
RandomForestClassifier(
    n_jobs=-1,
    criterion='entropy',
    n_estimators=550,
    bootstrap=True,
    max_depth=65,
    max_features='log2',
    max_samples=0.9,
    min_samples_leaf=1,
    min_samples_split=4
)

Fold=0, MCC=0.9836220432750942  
Fold=0, Accuracy=0.9918798695517566 



### Results to check the performance of Feature Engineering

#### Run 1: Without Feature Engineering    
**One Hot Encoding**:         
Fold=0, MCC=0.983307760869349   
Fold=0, Accuracy=0.9917242684744196         
        
**Target Encoding**:      
Fold=0, MCC=0.9831194966418483      
Fold=0, Accuracy=0.9916312286549811     



#### Run 2: With Feature Engineering

All the following results will be on the one hot encoded data.

**Setting 1**:      
X_train['stem-root_stem-color'] = X_train['stem-root'] + '_' + X_train['stem-color']            
Time taken to run the model: 5m 26.6s           
Fold=0, MCC=0.9833660402922174          
Fold=0, Accuracy=0.991753142901142          


This is taking too much time. What we should do is to use small samples of the data to check the performance of the model. 
In there instead of looking at the score we will observe the change in the score. If the score is increasing then we can use the feature engineering on the whole data.
Let's begin with 10% of the data.

#### Run 3: With Feature Engineering on 10% of the data

Fold=0, MCC=0.9827923642002262
Fold=0, Accuracy=0.9914742622266466


**Setting 1**:
X_train['stem-root_stem-color'] = X_train['stem-root'] + '_' + X_train['stem-color']            
Time taken to run the model: 13.0s          
Fold=0, MCC=0.9821761344513027      
Fold=0, Accuracy=0.9911691981631933     

**Setting 2**:
X_valid['stem-root_stem-color'] = X_valid['stem-root'] + '_' + X_valid['stem-color']
X_valid['stem-surface_gill-color'] = X_valid['stem-surface'] + '_' + X_valid['gill-color']
Time taken to run the model: 14.4s
Fold=0, MCC=0.9819827597168661
Fold=0, Accuracy=0.9910728621431554

**Setting 3**:
X_train['stem-root_stem-color'] = X_train['stem-root'] + '_' + X_train['stem-color']        
X_train['stem-surface_gill-color'] = X_train['stem-surface'] + '_' + X_train['gill-color']      
X_train['veil-color_gill-color'] = X_train['veil-color'] + '_' + X_train['gill-color']      
Time taken to run the model: 15.0s      
Fold=0, MCC=0.9816570096455165              
Fold=0, Accuracy=0.9909123021097589             

**Setting 4**:
X_train['stem-root_stem-color'] = X_train['stem-root'] + '_' + X_train['stem-color']    
X_train['stem-surface_gill-color'] = X_train['stem-surface'] + '_' + X_train['gill-color']  
X_train['veil-color_gill-color'] = X_train['veil-color'] + '_' + X_train['gill-color']  
X_train['spore-print-color_gill-color'] = X_train['spore-print-color'] + '_' + X_train['gill-color']    
Time taken to run the model: 16.0s  
Fold=0, MCC=0.9815588840453281  
Fold=0, Accuracy=0.99086413409974   

**Setting 5**:
X_train['cap-surface_cap-shape'] = X_train['cap-surface'] + '_' + X_train['cap-shape']
X_train['gill-attachment_gill-color'] = X_train['gill-attachment'] + '_' + X_train['gill-color']
X_train['gill-spacing_gill-color'] = X_train['gill-spacing'] + '_' + X_train['gill-color']
X_train['gill-color_veil-color'] = X_train['gill-color'] + '_' + X_train['veil-color']
X_train['stem-root_stem-color'] = X_train['stem-root'] + '_' + X_train['stem-color']
X_train['stem-surface_gill-color'] = X_train['stem-surface'] + '_' + X_train['gill-color']
X_train['veil-type_cap-shape'] = X_train['veil-type'] + '_' + X_train['cap-shape']
X_train['veil-color_gill-color'] = X_train['veil-color'] + '_' + X_train['gill-color']
X_train['spore-print-color_gill-color'] = X_train['spore-print-color'] + '_' + X_train['gill-color']
Time taken to run the model: 23.7s
Fold=0, MCC=0.9808775898704652
Fold=0, Accuracy=0.9905269580296072

**Setting 6**: Same as setting 5 but done imputation of missing values after feature engineering.       
Time taken to run the model: 28.0s      
Fold=0, MCC=0.9829532657974156      
Fold=0, Accuracy=0.9915545422433448     





In [None]:
X_train['cap-surface_cap-shape'] = X_train['cap-surface'] + '_' + X_train['cap-shape']
X_train['gill-attachment_gill-color'] = X_train['gill-attachment'] + '_' + X_train['gill-color']
X_train['gill-spacing_gill-color'] = X_train['gill-spacing'] + '_' + X_train['gill-color']
X_train['gill-color_veil-color'] = X_train['gill-color'] + '_' + X_train['veil-color']
X_train['stem-root_stem-color'] = X_train['stem-root'] + '_' + X_train['stem-color']
X_train['stem-surface_gill-color'] = X_train['stem-surface'] + '_' + X_train['gill-color']
X_train['veil-type_cap-shape'] = X_train['veil-type'] + '_' + X_train['cap-shape']
X_train['veil-color_gill-color'] = X_train['veil-color'] + '_' + X_train['gill-color']
X_train['spore-print-color_gill-color'] = X_train['spore-print-color'] + '_' + X_train['gill-color']


X_test['cap-surface_cap-shape'] = X_test['cap-surface'] + '_' + X_test['cap-shape']
X_test['gill-attachment_gill-color'] = X_test['gill-attachment'] + '_' + X_test['gill-color']
X_test['gill-spacing_gill-color'] = X_test['gill-spacing'] + '_' + X_test['gill-color']
X_test['gill-color_veil-color'] = X_test['gill-color'] + '_' + X_test['veil-color']
X_test['stem-root_stem-color'] = X_test['stem-root'] + '_' + X_test['stem-color']
X_test['stem-surface_gill-color'] = X_test['stem-surface'] + '_' + X_test['gill-color']
X_test['veil-type_cap-shape'] = X_test['veil-type'] + '_' + X_test['cap-shape']
X_test['veil-color_gill-color'] = X_test['veil-color'] + '_' + X_test['gill-color']
X_test['spore-print-color_gill-color'] = X_test['spore-print-color'] + '_' + X_test['gill-color']