In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model

In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('/Users/tarunreddy/Desktop/Projects/Mushroom_Kaggle/clean_mush.csv')
df.shape


(3111859, 118)

In [3]:
# Assuming your DataFrame is named df
print(df.columns.tolist())  # Convert to a list and print

['class', 'cap-diameter', 'does-bruise-or-bleed', 'stem-height', 'stem-width', 'has-ring', 'cap-shape_b', 'cap-shape_c', 'cap-shape_f', 'cap-shape_o', 'cap-shape_p', 'cap-shape_s', 'cap-shape_x', 'cap-surface_d', 'cap-surface_e', 'cap-surface_g', 'cap-surface_h', 'cap-surface_i', 'cap-surface_k', 'cap-surface_l', 'cap-surface_s', 'cap-surface_t', 'cap-surface_unknown', 'cap-surface_w', 'cap-surface_y', 'cap-color_b', 'cap-color_e', 'cap-color_g', 'cap-color_k', 'cap-color_l', 'cap-color_n', 'cap-color_o', 'cap-color_p', 'cap-color_r', 'cap-color_u', 'cap-color_w', 'cap-color_y', 'gill-attachment_a', 'gill-attachment_d', 'gill-attachment_e', 'gill-attachment_p', 'gill-attachment_s', 'gill-attachment_unknown', 'gill-attachment_x', 'gill-spacing_c', 'gill-spacing_d', 'gill-color_b', 'gill-color_e', 'gill-color_f', 'gill-color_g', 'gill-color_k', 'gill-color_n', 'gill-color_o', 'gill-color_p', 'gill-color_r', 'gill-color_u', 'gill-color_w', 'gill-color_y', 'stem-root_b', 'stem-root_c', 'st

In [4]:
# Split the data
X = df.iloc[:, 1:]  # Features (columns 2 to 118)
y = df.iloc[:, 0]   # Target (first column)

# Check the shapes to confirm the split
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (3111859, 117)
Target shape: (3111859,)


In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check the shapes of the splits
print("Training Features shape:", X_train.shape)
print("Validation Features shape:", X_val.shape)
print("Training Target shape:", y_train.shape)
print("Validation Target shape:", y_val.shape)

Training Features shape: (2489487, 117)
Validation Features shape: (622372, 117)
Training Target shape: (2489487,)
Validation Target shape: (622372,)


In [6]:
# Initialize the scaler
scaler = StandardScaler()

# Assuming the numerical columns are the last 3 in the original data
# If you know the exact indices, you can specify them directly.
numerical_cols = ['cap-diameter', 'stem-height', 'stem-width']  # Adjust if needed

# Fit the scaler on the training data and transform both train and validation sets
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])

# Check the transformed data
print("Transformed Training Features:\n", X_train.head())
print("Transformed Validation Features:\n", X_val.head())

Transformed Training Features:
          cap-diameter  does-bruise-or-bleed  stem-height  stem-width  \
2175463      2.033525                   1.0     1.104270    2.249906   
2500832     -0.041329                   0.0    -0.913793   -0.260785   
1482972      0.425715                   0.0     0.019122    0.104825   
2453502      0.500871                   0.0    -0.300957   -0.481941   
1151101      1.145069                   1.0     0.612440    1.608171   

         has-ring  cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_o  \
2175463       0.0          0.0          0.0          0.0          0.0   
2500832       0.0          0.0          0.0          1.0          0.0   
1482972       1.0          0.0          0.0          0.0          0.0   
2453502       0.0          0.0          0.0          0.0          0.0   
1151101       0.0          0.0          0.0          0.0          0.0   

         cap-shape_p  ...  habitat_g  habitat_h  habitat_l  habitat_m  \
2175463          0.0  .

In [None]:
# Initialize the model
model = Sequential()

# Input layer and first hidden layer
model.add(Dense(256, input_dim=117, activation='relu'))
model.add(BatchNormalization())  # Optional
model.add(Dropout(0.3))  # Optional

# Second hidden layer
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())  # Optional
model.add(Dropout(0.3))  # Optional

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())  # Optional
model.add(Dropout(0.3))  # Optional

model.add(Dense(1, activation='sigmoid'))

In [None]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

# Summary of the model
model.summary()

In [None]:
# Callbacks for early stopping and reducing learning rate
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)
]


In [None]:
# Fit the model on the training data with verbose output
history = model.fit(X_train, y_train, 
                    validation_data=(X_val, y_val),
                    epochs=50,  # Adjust this number as needed
                    batch_size=1024,  # Experiment with batch sizes
                    callbacks=callbacks,
                    verbose=1)  # This will print the loss and metrics for each epoch


In [None]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()


## Testing

In [21]:
# Load the CSV file into a DataFrame
df_test = pd.read_csv('/Users/tarunreddy/Desktop/Projects/Mushroom_Kaggle/clean_mush_test.csv')
df_test.shape

(2077964, 118)

In [22]:
X_test = df_test.iloc[:, 1:]  # Features (columns 2 to 118)
id = df_test.iloc[:, 0]   # Target (first column)

# Check the shapes to confirm the split
print("Features shape:", X_test.shape)
print("Target shape:", id.shape)

Features shape: (2077964, 117)
Target shape: (2077964,)


In [23]:
columns_are_equal = (X.columns.tolist() == X_test.columns.tolist())

if columns_are_equal:
    print("Both DataFrames have the same columns in the same order.")
else:
    print("The DataFrames do not have the same columns in the same order.")


Both DataFrames have the same columns in the same order.


In [24]:
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Check the transformed data
print("Transformed Training Features:\n", X_test.head())

Transformed Training Features:
    cap-diameter does-bruise-or-bleed  stem-height  stem-width has-ring  \
0      0.664605                    t     1.877145    0.772127        t   
1      0.197561                    f    -1.971617   -0.042186        f   
2     -1.117676                    f    -0.055043   -1.015015        f   
3     -0.723105                    f    -0.523452   -0.328538        t   
4      0.001618                    f     0.159645    0.334929        t   

   cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_o  cap-shape_p  ...  \
0          0.0          0.0          0.0          0.0          0.0  ...   
1          0.0          0.0          0.0          1.0          0.0  ...   
2          1.0          0.0          0.0          0.0          0.0  ...   
3          0.0          0.0          0.0          0.0          0.0  ...   
4          0.0          0.0          0.0          0.0          0.0  ...   

   habitat_g  habitat_h  habitat_l  habitat_m  habitat_p  habitat_u  \
0

In [29]:
best_model = load_model('best_model.h5')

In [None]:
# Make predictions
predictions = best_model.predict(X_test)

In [None]:
# Convert predictions to class labels ('p' or 'e')
class_labels = ['p' if pred >= 0.5 else 'e' for pred in predictions]

# Create a new DataFrame with the id and class labels
results_df = pd.DataFrame({
    'id': id,
    'class': class_labels
})

# Save the DataFrame to a CSV file
results_df.to_csv('predictions_1.csv', index=False)

print("CSV file with predictions created successfully.")

In [None]:
print(results_df.head())