In [14]:
# Vikas Singh Narwariya
# Roll no.- 231030065
# pandas: Library for handling and analyzing data in tables.
# numpy: Library for numerical operations and working with arrays.
# sklearn (train_test_split): Used to split data into training and testing sets.
# sklearn (StandardScaler, LabelEncoder): StandardScaler normalizes data; LabelEncoder converts labels to numbers.
# sklearn (classification_report, accuracy_score): Used to evaluate the performance of the model (report accuracy, etc.).
# keras (Sequential): Sequential allows you to build models layer by layer.
# keras (Dense, Dropout, BatchNormalization, LeakyReLU): 
# Dense: fully connected layer, Dropout: prevents overfitting, BatchNormalization: speeds up training, LeakyReLU: activation function.
# keras (ModelCheckpoint): Saves the best model weights during training if accuracy improves.
# re: Library for working with regular expressions. It is a tool that helps you search for specific words, numbers, or patterns inside text.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from keras.callbacks import ModelCheckpoint
import re

# Helper function to enable natural sorting
# The function breaks a string into digits and non-digit parts
# Take an example, for example, "image10" becomes ['image', 10] for proper sorting
def natural_sort_key(text):
    return [int(chunk) if chunk.isdigit() else chunk for chunk in re.split(r'(\d+)', text)]

# Load dataset and prepare for training
# pd.read_csv() is used to read the CSV file containing features and target labels
# This dataset contains features extracted from images for classification (ravelling vs non-ravelling)
df = pd.read_csv('images_for_training.csv')

# Seperate features and target labels
# I use .iloc to slice the DataFrame:
# features: All columns except the last one because in last column target labels are mentioned. 
# target: The last column (the classification label)
features = df.iloc[:, :-1].values  # First 34 columns (features)
target = df.iloc[:, -1].values    # Last column (target labels)

# Encode target labels to binary (0 or 1)
# LabelEncoder converts categorical labels like 'Raveling' and 'Non_raveling' into numeric labels (1 and 0)
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(target)  # Encodes labels into 0 (Non_raveling) and 1 (Raveling)

# Split the dataset into training and testing subsets
# train_test_split divides the dataset into two parts:
# Training data (80%) for training the model
# Testing data (20%) for evaluating the model performance
X_train, X_test, y_train, y_test = train_test_split(features, encoded_target, test_size=0.2, random_state=42, shuffle=True)

# Standardize features to make them have mean = 0 and standard deviation = 1
# This improves the training speed and accuracy of the neural network
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit the scaler to the training data and scale it
X_test_scaled = scaler.transform(X_test)  # Use the same scaler on test data to transform it

# Initialize the Sequential model
# Sequential is a linear stack of layers for building a neural network model
model = Sequential()

# Add input layer and first hidden layer
# Input layer: This is the first layer where the model takes in the data. In this case, the input size is 34, meaning there are 34 features being fed into the network.
# Dense layer: Every neuron in this layer is connected to all the neurons in the previous and next layers. This layer has 128 neurons.
# LeakyReLU activation: This activation function is similar to ReLU but allows small negative values to flow through, which helps prevent "dead neurons" (neurons that stop learning).
# BatchNormalization: This normalizes the inputs to the layer, which helps the network converge faster and more smoothly during training.
# Dropout: During training, 30% of the neurons are randomly "dropped" to prevent the model from overfitting. 
# In case of overfitting model become too specific to the training data and not generalize well.
            
model.add(Dense(128, input_shape=(X_train_scaled.shape[1],)))  # 34 input features
model.add(LeakyReLU(alpha=0.2))  # Activation function
model.add(BatchNormalization())  # Normalizes layer output for stable training
model.add(Dropout(0.3))  # Dropout 30% of neurons to prevent overfitting

# Add second hidden layer
# Dense layer with 64 neurons, followed by LeakyReLU, BatchNormalization, and Dropout
model.add(Dense(64))  # Fully connected layer with 64 neurons
model.add(LeakyReLU(alpha=0.2))  # Activation function
model.add(BatchNormalization())  # Normalization to speed up training
model.add(Dropout(0.3))  # Dropout 30% of neurons to avoid overfitting

# Add third hidden layer
# Dense layer with 32 neurons, LeakyReLU for activation, BatchNormalization, and Dropout
model.add(Dense(32))  # Fully connected layer with 32 neurons
model.add(LeakyReLU(alpha=0.2))  # Leaky ReLU activation function
model.add(BatchNormalization())  # Normalize the outputs of the layer
model.add(Dropout(0.2))  # Dropout 20% of neurons to avoid overfitting

# Add output layer for binary classification
# Output layer (Dense(1)): This layer has 1 neuron because it's a binary classification, meaning the output will be either 0 or 1
# Sigmoid activation: This activation function converts the output into a probability between 0 and 1, helping decide the final class (0 or 1)
model.add(Dense(1, activation='sigmoid'))  # Output layer with 1 neuron (for binary classification)

# Loss ('binary_crossentropy'): This function measures the difference between the predicted probabilities and the actual labels in binary classification, helping the model improve with each step.
# Optimizer ('adam'): Adam is an algorithm that adjusts the learning rate as training progresses, helping the model learn efficiently and quickly.
# Metrics ('accuracy'): Accuracy measures how often the model's predictions match the actual labels, giving an idea of how well the model is performing during training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# ModelCheckpoint: This tool saves the model’s weights to know what it has learned during training.
# Best accuracy: It only saves the weights if the model reaches the highest accuracy seen so far, ensuring you keep the best version of the model.
checkpoint = ModelCheckpoint('best_model_weights.keras', monitor='accuracy', save_best_only=True, mode='max', verbose=1)

# Train the model
# fit() trains the model on the training data
# epochs=150: The model will go through the entire training data 150 times
# batch_size=16: Processes 16 samples at a time before updating the weights
# callbacks=[checkpoint]: Saves the best model weights during training
model.fit(X_train_scaled, y_train, epochs=150, batch_size=16, verbose=1, callbacks=[checkpoint])

# Load the best model weights for evaluation
# After training, the best-performing model weights are loaded
model.load_weights('best_model_weights.keras')

# Make predictions on the test set: Use the trained model to predict outcomes for the unseen test data.
# model.predict() outputs probabilities: The model gives a probability score between 0 and 1.
# Use > 0.5 to convert probabilities to binary labels (0 or 1): If the probability is greater than 0.5, label it as 1; otherwise, label it as 0.
predictions = (model.predict(X_test_scaled) > 0.5).astype("int32")

# Display evaluation metrics
# classification_report() prints precision, recall, and F1-score for both classes (0 and 1)
# accuracy_score() calculates the accuracy of the model on the test data
print(classification_report(y_test, predictions))  # Print detailed classification report
print("Accuracy:", accuracy_score(y_test, predictions))  # Print the overall accuracy

# Load external test dataset
# This dataset contains unseen test images, and we use it to make final predictions
external_test_df = pd.read_csv('images_for_testing.csv')

# Filenames are strings and perform natural sorting
# Sorting the external test dataset by filenames in a human-friendly order (e.g., image2, image10)
external_test_df['filename'] = external_test_df['filename'].astype(str)  # Ensure all filenames are treated as strings
sorted_test_df = external_test_df.sort_values(by='filename', key=lambda col: col.map(natural_sort_key))  # Sort by filenames

# Extract and scale the test features (assuming features start from the second column)
# We are excluding the filename column and scaling the features using the same scaler fitted earlier
external_test_features = sorted_test_df.iloc[:, 1:].values  # Extract features from the second column onward
external_test_scaled = scaler.transform(external_test_features)  # Scale the test data using the same scaler

# Predict on the external test set
# model.predict() gives probabilities, which we convert to binary labels (0 or 1)
external_test_predictions = (model.predict(external_test_scaled) > 0.5).astype("int32")  # Predict binary labels

# Prepare DataFrame for output
# Create a DataFrame with filenames and predicted classes for the external test set
output_df = pd.DataFrame({
    'filename': sorted_test_df['filename'].values,  # Use the sorted filenames
    'predicted_class': external_test_predictions.flatten()  # Flatten predictions to 1D
})

# Map the binary labels back to their original class names
# 1 -> 'Raveling' and 0 -> 'Non_raveling'
output_df['predicted_class'] = output_df['predicted_class'].map({1: 'Raveling', 0: 'Non_raveling'})  # Map 0/1 to class names

# Save predictions to a CSV file
# The final predictions are saved to 'final_output_predictions.csv'
output_file_path = 'final_output_predictions_vs_2.csv'
output_df.to_csv(output_file_path, index=False)  # Save the DataFrame to a CSV file without index

print(f"Predictions saved to {output_file_path}")  # Inform the user that the predictions have been saved


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/150
[1m33/35[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 8ms/step - accuracy: 0.7277 - loss: 0.5233
Epoch 1: accuracy improved from -inf to 0.75357, saving model to best_model_weights.keras
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.7299 - loss: 0.5211
Epoch 2/150
[1m31/35[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 6ms/step - accuracy: 0.8002 - loss: 0.4220
Epoch 2: accuracy improved from 0.75357 to 0.78393, saving model to best_model_weights.keras
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7982 - loss: 0.4246
Epoch 3/150
[1m31/35[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 7ms/step - accuracy: 0.8032 - loss: 0.4200
Epoch 3: accuracy improved from 0.78393 to 0.81071, saving model to best_model_weights.keras
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8040 - loss: 0.4182
Epoch 4/150
[1m34/35[0m [32m━━━━━━━━━━━━