In [None]:
# Imported libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore')


# Dataset (kagglehub)
# Download the IMDB movie dataset through kagglehub, list what was fetched,
# and load the CSV into the `data` DataFrame used by the rest of this cell.
import kagglehub

path = kagglehub.dataset_download("PromptCloudHQ/imdb-data")
print(f"Path to dataset files: {path}")

files = os.listdir(path)
print(f"Files in dataset directory: {files}")

# The CSV name is fixed here; check the directory listing printed above if
# the dataset layout ever changes.
csv_file_path = os.path.join(path, 'IMDB-Movie-Data.csv')
data = pd.read_csv(csv_file_path)
print(data.columns)

# Preview the title, genre string and description of the row labelled 1.
for _preview_column in ('Title', 'Genre', 'Description'):
    print(data[_preview_column][1])

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources requested by this notebook, in the original order.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Shared helpers read by preprocess_text(): a lemmatizer and the English
# stop-word list as a set, for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
print(stop_words)

def preprocess_text(text, verbose=False):
    """Normalise one movie description before tokenisation.

    The text is lower-cased and split on whitespace; punctuation attached to
    the edges of each word is stripped; stop words and tokens that are not
    purely alphanumeric are dropped; the remaining tokens are lemmatized as
    nouns and re-joined with single spaces.

    Args:
        text: Raw description. Non-string values (for example a NaN coming
            from a missing DataFrame cell) yield an empty string.
        verbose: When True, print the intermediate token lists. Off by
            default so that applying this to a whole column does not emit
            two debug lines per row.

    Returns:
        The cleaned description as a single space-separated string.
    """
    import string  # local import keeps this block self-contained

    if not isinstance(text, str):
        return ''

    # A plain whitespace split leaves punctuation glued to words ("galaxy,",
    # "world."). Those tokens would fail isalnum() below and be discarded
    # entirely, so strip edge punctuation first.
    tokens = [token.strip(string.punctuation) for token in text.lower().split()]
    if verbose:
        print("Original tokens:", tokens)

    # Empty strings left after stripping fail isalnum() and are dropped here.
    # Tokens with inner punctuation (e.g. hyphenated words) are still dropped.
    filtered_tokens = [
        token for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    if verbose:
        print("Filtered tokens:", filtered_tokens)

    lemmatized = [lemmatizer.lemmatize(token, pos='n') for token in filtered_tokens]
    return ' '.join(lemmatized)

# Store a cleaned copy of every description next to the raw text.
data['clean_desc'] = data['Description'].map(preprocess_text)

# Multi-label Labelling
# Each movie's comma-separated genre string becomes a list of genre names,
# which MultiLabelBinarizer then encodes as one multi-hot row per movie.
from sklearn.preprocessing import MultiLabelBinarizer

genres = data['Genre'].map(lambda genre_field: genre_field.split(','))
print(genres)

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(genres)
print(y[0])          # encoded labels of the first movie
print(mlb.classes_)  # genre name behind each label column

# Tokenize Descriptions
# Fit a Keras Tokenizer on the cleaned descriptions, convert each one to a
# sequence of integer word ids, and pad/truncate every sequence to 100 ids.
text_description = data['clean_desc']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_description)
word_index = tokenizer.word_index  # word -> integer id mapping

sequences = tokenizer.texts_to_sequences(text_description)
# 'pre' is the pad_sequences default for both options; spelled out so it is
# visible that padding and truncation happen at the start of each sequence.
X = pad_sequences(sequences, maxlen=100, padding='pre', truncating='pre')

# Download Glove Embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
print('Found %s word vectors.' % len(embedding_index))

embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

# Train/Validation/Test Split
# Step 1 sets aside 700 rows for training. Step 2 splits the remaining rows
# so that 200 form the test set and whatever is left is the validation set.
# Both steps share one seed so the partition is reproducible.
_split_seed = 42
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, train_size=700, random_state=_split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_rest, y_rest, test_size=200, random_state=_split_seed
)

# RNN Model
# Frozen pre-trained embedding -> SimpleRNN -> dense head with one sigmoid
# unit per genre (multi-label output).
print('\n----- RNN Model -----')
print('The Recurrent Neural Network (RNN) model uses a SimpleRNN layer to analyze movie descriptions '
      'based on temporal and sequential data.')
rnn_model = Sequential([
    # Sequence length and label count come from the data instead of the
    # literals 100 / 20, so they follow the padding and binarizer steps.
    Embedding(input_dim=len(word_index)+1, output_dim=embedding_dim, weights=[embedding_matrix],
              input_length=X_train.shape[1], trainable=False),
    SimpleRNN(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_train.shape[1], activation='sigmoid')
])
# Keras resolves the 'accuracy' shortcut from the target/output shapes; with
# multi-hot targets and several output units that does not select per-label
# binary accuracy (see the Model.compile docs). Request it explicitly, keeping
# the name 'accuracy' so the history keys 'accuracy'/'val_accuracy' stay put.
rnn_model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)
rnn_history = rnn_model.fit(X_train, y_train, epochs=20, validation_data=(X_val, y_val))

# LSTM Model
# Frozen pre-trained embedding -> LSTM -> dense head with one sigmoid unit
# per genre (multi-label output).
print('\n----- LSTM Model -----')
print('The Long Short-Term Memory (LSTM) model uses an LSTM layer to analyze movie descriptions '
      'based on feedback connections and memory cells.')
lstm_model = Sequential([
    # Sequence length and label count come from the data instead of the
    # literals 100 / 20, so they follow the padding and binarizer steps.
    Embedding(input_dim=len(word_index)+1, output_dim=embedding_dim, weights=[embedding_matrix],
              input_length=X_train.shape[1], trainable=False),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_train.shape[1], activation='sigmoid')
])
# Keras resolves the 'accuracy' shortcut from the target/output shapes; with
# multi-hot targets and several output units that does not select per-label
# binary accuracy (see the Model.compile docs). Request it explicitly, keeping
# the name 'accuracy' so the history keys 'accuracy'/'val_accuracy' stay put.
lstm_model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)
lstm_history = lstm_model.fit(X_train, y_train, epochs=20, validation_data=(X_val, y_val))

# Evaluation on Test Set
# Print whatever Model.evaluate returns for each model on the held-out test
# split (per the Keras docs: the loss followed by the compiled metrics).
for _name, _model in (("RNN", rnn_model), ("LSTM", lstm_model)):
    print(f"{_name} Test Performance:", _model.evaluate(X_test, y_test))

# Accuracy, Precision, Recall for RNN and LSTM
def evaluate_multilabel_model(model, X_test, y_test, model_name="Model", threshold=0.5):
    """Print and return accuracy, micro-precision and micro-recall for a model.

    Predicted probabilities are binarised at `threshold` and compared with the
    multi-hot ground truth.

    Args:
        model: Object exposing `predict(X, verbose=0)` that returns one
            probability per label for every sample.
        X_test: Inputs passed to `model.predict`.
        y_test: Multi-hot ground-truth label matrix.
        model_name: Label used in the printed header.
        threshold: Probability at or above which a label counts as predicted.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        dict with the keys 'accuracy', 'precision' and 'recall'.
    """
    y_pred_prob = model.predict(X_test, verbose=0)
    y_pred = (np.asarray(y_pred_prob) >= threshold).astype(int)

    # For multi-label input sklearn's accuracy_score is subset accuracy (every
    # label of a sample must match). Precision and recall are micro-averaged
    # over all labels, and zero_division=0 reports 0 when nothing is predicted.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='micro', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='micro', zero_division=0),
    }

    print(f"\n{model_name} Evaluation Metrics:")
    print(f"Accuracy:  {metrics['accuracy']:.10f}")
    print(f"Precision: {metrics['precision']:.10f}")
    print(f"Recall:    {metrics['recall']:.10f}")
    return metrics

# Report the thresholded test metrics for both recurrent models, RNN first.
for _name, _model in (("RNN", rnn_model), ("LSTM", lstm_model)):
    evaluate_multilabel_model(_model, X_test, y_test, model_name=_name)

# Plot Training History
def plot_history(history, title):
    """Show training vs. validation accuracy and loss curves side by side.

    Args:
        history: Object whose `.history` dict holds the per-epoch lists
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        title: Prefix for both subplot titles (e.g. the model name).
    """
    plt.figure(figsize=(12, 5))
    record = history.history

    # (subplot position, history key, label used in title/axis/legend)
    panels = (
        (1, 'accuracy', 'Accuracy'),
        (2, 'loss', 'Loss'),
    )
    for position, key, label in panels:
        train_values = record[key]
        val_values = record[f'val_{key}']
        epochs = range(1, len(train_values) + 1)  # epochs are numbered from 1

        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, label=f'Train {label}')
        plt.plot(epochs, val_values, label=f'Validation {label}')
        plt.title(f'{title} {label}')
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.xticks(epochs)
        plt.grid(True)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Draw the training curves of both recurrent models, RNN first.
for _history, _title in ((rnn_history, "RNN"), (lstm_history, "LSTM")):
    plot_history(_history, _title)

# Summary of Part 1 in Assignment 3 - Deep Learning Applications

# For this project, my goal was to implement two deep learning models (an RNN and an LSTM) that assign
# IMDB movies from Kaggle to genres based on their short descriptions. After the text was preprocessed
# and lemmatized, each description was tokenized with the Keras Tokenizer and its words were mapped to
# pre-trained GloVe embeddings. Because a movie can belong to several genres, the genre labels were encoded as
# multi-hot vectors using a MultiLabelBinarizer. The two models were built around a SimpleRNN layer and an LSTM layer
# respectively, along with embedding, dense, and dropout layers. Both models were trained for 20 epochs and evaluated on
# accuracy, precision, and recall. The RNN had slightly better accuracy and recall, while the LSTM had slightly better precision.
# Both models struggled with accuracy, but both successfully demonstrated multi-label classification.

In [None]:
# Imported libraries
import kagglehub
import os
# Download the Intel image-classification dataset and show where it landed.
path = kagglehub.dataset_download("puneet6060/intel-image-classification")
print(f"Path to dataset files: {path}")

import cv2
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Folders inside the downloaded dataset that hold the training and test images.
train_directory = os.path.join(path, 'seg_train/seg_train')
test_directory = os.path.join(path, 'seg_test/seg_test')

# Image Generators
# One generator object rescales pixel values by 1/255; both directory
# iterators share the same image size, batch size and one-hot label mode.
data = ImageDataGenerator(rescale=1./255)
_flow_options = dict(target_size=(150,150), batch_size=128, class_mode='categorical')
train_generator = data.flow_from_directory(train_directory, **_flow_options)
test_generator = data.flow_from_directory(test_directory, **_flow_options)

# Model 1 - 3 Conv Layers, 3 Max Pooling Layers
# Three Conv2D + MaxPooling2D stages, then dropout and a dense head with one
# softmax unit per image class.
cnn_pt1 = Sequential([
    Input(shape=(150,150,3)),
    Conv2D(32, (3,3), activation='relu'),
    MaxPooling2D(),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(),
    Conv2D(128, (3,3), activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Output width follows the classes found on disk rather than a literal 6.
    Dense(len(train_generator.class_indices), activation='softmax')
])
print('\n----- Model 1 -----')
cnn_pt1.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE: the test generator doubles as the validation data here, so the
# validation numbers are computed on the same images as the final test scores.
# The returned History is kept so the training curves can be inspected later.
cnn_pt1_history = cnn_pt1.fit(train_generator, validation_data=test_generator, epochs=5)

# Model 2 - 6 Conv Layers, 3 Max Pooling Layers
# Three stages of two stacked Conv2D layers followed by MaxPooling2D, then
# dropout and a dense head with one softmax unit per image class.
cnn_pt2 = Sequential([
    Input(shape=(150,150,3)),
    Conv2D(32, (3,3), activation='relu'),
    Conv2D(32, (3,3), activation='relu'),
    MaxPooling2D(),
    Conv2D(64, (3,3), activation='relu'),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(),
    Conv2D(128, (3,3), activation='relu'),
    Conv2D(128, (3,3), activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Output width follows the classes found on disk rather than a literal 6.
    Dense(len(train_generator.class_indices), activation='softmax')
])
print('\n----- Model 2 -----')
cnn_pt2.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE: the test generator doubles as the validation data here, so the
# validation numbers are computed on the same images as the final test scores.
# The returned History is kept so the training curves can be inspected later.
cnn_pt2_history = cnn_pt2.fit(train_generator, validation_data=test_generator, epochs=5)

# Evaluated Accuracy
# Both models are compiled with a single 'accuracy' metric, so evaluate()
# returns [loss, accuracy]. Unpack the pair so the line labelled "Accuracy"
# shows the accuracy itself rather than the whole list.
for _name, _model in (("Model 1", cnn_pt1), ("Model 2", cnn_pt2)):
    _loss, _accuracy = _model.evaluate(test_generator)
    print(f"{_name} Accuracy: {_accuracy:.4f} (loss: {_loss:.4f})")

# Per-Class Classification Report
def evaluate_and_report(model, generator, model_name):
    """Print and return a per-class classification report for `model`.

    Args:
        model: Object exposing `predict(x_batch, verbose=0)` that returns
            class probabilities per sample.
        generator: Indexable batch source (`len(generator)` batches, each
            `generator[i]` yielding `(x_batch, one_hot_labels)`) with a
            `class_indices` mapping of class name -> label index.
        model_name: Label used in the printed header.

    Returns:
        pandas.DataFrame holding the report (one row per class plus the
        summary rows produced by sklearn).
    """
    # Take the class names from the generator that was passed in (previously
    # the global `test_generator` was read regardless of the argument), and
    # order them by label index so names line up with the argmax ids.
    class_indices = generator.class_indices
    class_names = sorted(class_indices, key=class_indices.get)

    y_true = []
    y_pred = []
    # Index-based loop on purpose: it visits exactly len(generator) batches.
    # NOTE(review): iterating a Keras directory iterator directly is assumed
    # not to stop on its own - confirm before switching to `for batch in ...`.
    for i in range(len(generator)):
        x_batch, y_batch = generator[i]
        pred = model.predict(x_batch, verbose=0)
        y_true.extend(np.argmax(y_batch, axis=1))
        y_pred.extend(np.argmax(pred, axis=1))

    # Explicit `labels` keeps the report aligned with `class_names` even if a
    # class never appears in y_true/y_pred; zero_division=0 reports 0 for such
    # classes.
    report = classification_report(
        y_true,
        y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
        output_dict=True,
        zero_division=0,
    )
    df_report = pd.DataFrame(report).transpose()
    print(f"\n--- {model_name} Classification Report ---")
    print(df_report)
    return df_report

# Evaluate and Report Models
# Produce the per-class report for each CNN on the test generator, in order.
for _name, _model in (("Model 1", cnn_pt1), ("Model 2", cnn_pt2)):
    evaluate_and_report(_model, test_generator, _name)

# Visualize Predictions
# Take one batch from the test generator and show its first two images with
# the true class and each model's predicted class in the title.
x_test, y_test = next(test_generator)
class_names = list(test_generator.class_indices.keys())
pred1 = cnn_pt1.predict(x_test[:2], verbose=0)
pred2 = cnn_pt2.predict(x_test[:2], verbose=0)

# Turn the one-hot / probability rows into class ids once, up front.
_true_ids = np.argmax(y_test[:2], axis=1)
_pred_ids1 = np.argmax(pred1, axis=1)
_pred_ids2 = np.argmax(pred2, axis=1)

for i in range(2):
    true_label = class_names[_true_ids[i]]
    pred_label1 = class_names[_pred_ids1[i]]
    pred_label2 = class_names[_pred_ids2[i]]

    plt.figure(figsize=(8,4))
    plt.imshow(x_test[i])
    plt.title(f"Actual: {true_label}\nModel 1 Prediction: {pred_label1} | Model 2 Prediction: {pred_label2}", fontsize=12)
    plt.tight_layout()
    plt.axis('off')
    plt.show()

# Summary of Part 2 in Assignment 3 - Deep Learning Applications

# For this project, my goal was to implement Convolutional Neural Networks (CNNs) that classify natural-scene images from the
# Intel image-classification dataset (downloaded via kagglehub) into 6 classes. Image generators were created for the training
# and test directories, with the test set also serving as the validation data, and two CNN models were built. The first CNN model
# was implemented using 3 Conv layers and 3 Max Pooling layers, while the second model was implemented using 6 Conv layers and
# 3 Max Pooling layers. Both models included dropout and dense layers before the output layer. Each model was trained for 5 epochs
# on rescaled image data. Model 2 demonstrated stronger performance than Model 1 in correctly classifying images, most likely
# because of its additional Conv layers.