### Summary #####
 1. In this code , a video classification model has been created on UCF101 dataset. It has total 101 classes.
 2. Due to resource Constraint, For initial stage model has been trained on 10 classes using ResNet50 model.
 3. We get an "pretrained_weights_c10.h5" file in the first stage of training
 4. Now we want to add 5 more classes to these pretrained weights. A new dataset has been added named "Custom_dataset_New_5_classes"
 5. We load the pre-trained model trained on the existing 10 classes.
 6. We remove the original output layer of the pre-trained model since it only corresponds to the 10 classes.
 7. We freeze the layers of the pre-trained model to retain their weights.
 8. We add a new output layer with 15 units to accommodate the additional 5 classes.
 9. We train the modified model only on the new dataset containing the 5 new classes.
 10.Finally, we save the model, which now recognizes all classes.



In [2]:
# Importing Important Libraries

import os
import cv2
import numpy as np
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import glorot_uniform

In [3]:
os.chdir(r'/content/drive/MyDrive/Proglint_Assessment_2')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Function to extract frames from videos

def extract_frames(video_path, num_frames=16, resize=(224, 224)):
    frames = []
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=np.int16)

    for i in range(total_frames):
        ret, frame = cap.read()
        if ret is False:
            break
        if i in frame_indices:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, resize)
            frames.append(frame)

    cap.release()
    return frames

##Loading the dataset ##
Size of this dataset is 2 GB

In [19]:
# Path to the dataset directory

dataset_dir='/content/drive/MyDrive/Proglint_Assessment/New_dataset_10_classes'

# List to store frames and labels

frames = []
labels = []

In [20]:
# Loop through each class directory

for class_name in os.listdir(dataset_dir):
    class_dir = os.path.join(dataset_dir, class_name)

    # Loop through each video in the class directory

    for video_name in os.listdir(class_dir):
        video_path = os.path.join(class_dir, video_name)
        extracted_frames = extract_frames(video_path)
        frames.extend(extracted_frames)
        labels.extend([class_name] * len(extracted_frames))

# Convert frames and labels to numpy arrays
frames = np.array(frames)
labels = np.array(labels)

In [21]:
# Perform one-hot encoding on the labels
label_binarizer = LabelBinarizer()
labels_encoded = label_binarizer.fit_transform(labels)


In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(frames, labels_encoded, test_size=0.1, random_state=42)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5753, 224, 224, 3)
(5753, 10)
(1439, 224, 224, 3)
(1439, 10)


In [23]:
# Load pre-trained ResNet50 model

from tensorflow.keras import regularizers
base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# Build the model using regularization L2 Method

model = Sequential([
    base_model,
    Flatten(),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),  # L2 regularization
    Dropout(0.5),
    Dense(10, activation='softmax')  # Number of classes
])
# Compile the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])


## Initial Training on 10 classes##

> Got our first pretrained weights file from here



In [24]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

# Train the model with callbacks
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32, callbacks=[early_stopping, reduce_lr])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')
# Save the weights
model.save_weights('pretrained_weights_c10_new.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Test Loss: 0.0333438441157341
Test Accuracy: 1.0


In [29]:
# We will Load the saved weights now obtained after training

model.load_weights('pretrained_weights_c10_new.h5')

# Get the classes from the label binarizer
classes = label_binarizer.classes_

# # Print the classes
print("Classes in the dataset:")
for i, class_name in enumerate(classes):     # There are intially 10 classes whose information is giving below
    print(f"{i}: {class_name}")

Classes in the dataset:
0: ApplyLipstick
1: Archery
2: BabyCrawling
3: BalanceBeam
4: BandMarching
5: BaseballPitch
6: Basketball
7: BasketballDunk
8: BenchPress
9: Biking


In [30]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Predict classes for the test data
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_true_classes, y_pred_classes)

# Print confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
class_names = ["class_0", "class_1", "class_2", "class_3", "class_4", "class_5", "class_6", "class_7", "class_8", "class_9"]
report = classification_report(y_true_classes, y_pred_classes, target_names=class_names)

print("\nClassification Report:")
print(report)


Confusion Matrix:
[[82  0  0  0  0  0  0  0  0  0]
 [ 0 64  0  0  0  0  0  0  0  0]
 [ 0  0 69  0  0  0  0  0  0  0]
 [ 0  0  0 64  0  0  0  0  0  0]
 [ 0  0  0  0 68  0  0  0  0  0]
 [ 0  0  0  0  0 79  0  0  0  0]
 [ 0  0  0  0  0  0 68  0  0  0]
 [ 0  0  0  0  0  0  0 88  0  0]
 [ 0  0  0  0  0  0  0  0 73  0]
 [ 0  0  0  0  0  0  0  0  0 65]]

Classification Report:
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        82
     class_1       1.00      1.00      1.00        64
     class_2       1.00      1.00      1.00        69
     class_3       1.00      1.00      1.00        64
     class_4       1.00      1.00      1.00        68
     class_5       1.00      1.00      1.00        79
     class_6       1.00      1.00      1.00        68
     class_7       1.00      1.00      1.00        88
     class_8       1.00      1.00      1.00        73
     class_9       1.00      1.00      1.00        65

    accuracy                   

 # Inferencing to the unseen video (Total 4 results presented)#

In [15]:
# First Inferencing

unseen_video_path='/content/v_Basketball_g01_c01.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")


The predicted class for the unseen video is: class_6


In [31]:
#Second Inferencing

unseen_video_path='/content/v_BabyCrawling_g01_c03.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")



The predicted class for the unseen video is: class_2


In [32]:
# Third Inferencing

unseen_video_path='/content/v_ApplyLipstick_g01_c01.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")



The predicted class for the unseen video is: class_0


In [33]:
# Fourth Inferencing

unseen_video_path='/content/v_Biking_g01_c01.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")



The predicted class for the unseen video is: class_9


#Train custom model with new  5 classes using pretrained weights of previous trained model
#Dataset Size= 1Gb#


In [None]:
# we will now freeze the layers of the pre-trained model that were trained on the initial 10 classes,
# and then add new layers to handle the new classes.
# add new layers to handle the new classes, and then train the entire model on the combined dataset (initial 10 classes + new 5 classes).
# This way, the model retains the knowledge learned from the initial classes while also adapting to the new classes.

In [None]:
# Here I am Loading  the saved weights obtained after training the model on 10 classes previously

model.load_weights('pretrained_weights_c10_new.h5')  # Load the weights of previously trained model on 10 classes

In [None]:
# Remove the original output layer to replace it with our new classes
model.layers.pop()

# Freezing the layers of the pretrained model. This will freeze the already trained model on 10 classes .

for layer in model.layers:
    layer.trainable = False

In [22]:
# Function to extract frames from videos
def extract_frames(video_path, num_frames=16, resize=(224, 224)):
    frames = []
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=np.int16)

    for i in range(total_frames):
        ret, frame = cap.read()
        if ret is False:
            break
        if i in frame_indices:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, resize)
            frames.append(frame)

    cap.release()
    return frames

In [24]:
# Path to the dataset directory
new_dataset_dir = '/content/drive/MyDrive/Proglint_Assessment/Custom_dataset_New_5_classes'

# List to store frames and labels
frames = []
labels = []

In [25]:
# Loop through each class directory
for class_name in os.listdir(new_dataset_dir):
    class_dir = os.path.join(new_dataset_dir, class_name)
    # Loop through each video in the class directory
    for video_name in os.listdir(class_dir):
        video_path = os.path.join(class_dir, video_name)
        extracted_frames = extract_frames(video_path)
        frames.extend(extracted_frames)
        labels.extend([class_name] * len(extracted_frames))

# Convert frames and labels to numpy arrays
frames = np.array(frames)
labels = np.array(labels)
# Perform one-hot encoding on the labels
label_binarizer = LabelBinarizer()
labels_encoded = label_binarizer.fit_transform(labels)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(frames, labels_encoded, test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(958, 224, 224, 3)
(958, 5)
(240, 224, 224, 3)
(240, 5)


## Again training of the 1GB dataset only by freezing the previous classes and including new 5 classes  ##

Here we will add a new dense layer of 5 classes and concatenate it with previous layer to get all 15 classes

In [None]:
# Add new Dense layer for the new classes
new_output = Dense(5, activation='softmax')(model.layers[-1].output)  # Assuming the last layer is Dense

# Concatenate original output with new output
new_output_concatenated = Dense(15, activation='softmax')(model.layers[-2].output)

# Create the new model
model = Model(inputs=model.input, outputs=new_output_concatenated)

# Compile the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [31]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

# Train the model with callbacks
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32, callbacks=[early_stopping, reduce_lr])

# history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=7, batch_size=32)
# #

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')
# Save the weights
model.save_weights('pretrained_weights_c15_new.h5')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.2214791178703308
Test Accuracy: 1.0


## Here Loading the final weights file

The output layer of previously trained model will concatinated with the existing weights


In [37]:
# # Load the finally saved weights obtained after getting the new weight file

model.load_weights('pretrained_weights_c15_new.h5')

# # Get the classes from the label binarizer
classes = label_binarizer.classes_
#
# # # Print the classes
print("Classes in the dataset:")
for i, class_name in enumerate(classes):     # Now we will get all the 10 previous classes and 5 more new classes
    print(f"{i}: {class_name}")

Classes in the dataset:
0: ApplyLipstick
1: Archery
2: BabyCrawling
3: BalanceBeam
4: BandMarching
5: BaseballPitch
6: Basketball
7: BasketballDunk
8: BenchPress
9: Biking
10: TennisSwing
11: ThrowDiscus
12: TrampolineJumping
13: Typing
14: UnevenBars


# Inferencing on all 15 classes using new weights ###
# 5 example shown ##

In [45]:
# First Inferencing

unseen_video_path='/content/v_Archery_g01_c02.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9', 10: 'class_10', 11: 'class_11',12: 'class_12',13: 'class_13',14: 'class_14',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")


The predicted class for the unseen video is: class_1


In [46]:
# Second Inferencing

unseen_video_path='/content/v_BalanceBeam_g04_c02.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9', 10: 'class_10', 11: 'class_11',12: 'class_12',13: 'class_13',14: 'class_14',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")


The predicted class for the unseen video is: class_3


In [47]:
# Third Inferencing

unseen_video_path='/content/v_Typing_g01_c02.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9', 10: 'class_10', 11: 'class_11',12: 'class_12',13: 'class_13',14: 'class_14',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")


The predicted class for the unseen video is: class_13


In [48]:
# Fourth Inferencing

unseen_video_path='/content/v_UnevenBars_g04_c03.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9', 10: 'class_10', 11: 'class_11',12: 'class_12',13: 'class_13',14: 'class_14',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")


The predicted class for the unseen video is: class_14


In [49]:
# Fifth Inferencing

unseen_video_path='/content/v_TennisSwing_g01_c02.avi'
# Extract frames from the unseen video
unseen_frames = extract_frames(unseen_video_path)

# Convert frames to numpy array and preprocess
unseen_frames = np.array(unseen_frames) / 255.0  # Normalize pixel values

# Make predictions
predictions = model.predict(unseen_frames)

# Aggregate predictions across frames
final_prediction = np.argmax(np.sum(predictions, axis=0))

# # Map prediction index to class label
label_mapping = {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6',7: 'class_7', 8: 'class_8', 9: 'class_9', 10: 'class_10', 11: 'class_11',12: 'class_12',13: 'class_13',14: 'class_14',}
predicted_class = label_mapping[final_prediction]

print(f"The predicted class for the unseen video is: {predicted_class}")


The predicted class for the unseen video is: class_10
