In [2]:
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [3]:
DATASET_PATH = "/kaggle/input/ucf101-videos"

actions = sorted(os.listdir(DATASET_PATH))
NUM_CLASSES = len(actions)
label_map = {action: idx for idx, action in enumerate(actions)}

print("Number of action classes:", NUM_CLASSES)


Number of action classes: 4


In [4]:
IMG_SIZE = 224
FRAMES_PER_VIDEO = 10  # frames per video for LSTM

def extract_frames(video_path, num_frames=FRAMES_PER_VIDEO):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(total_frames // num_frames, 1)
    
    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        frame = preprocess_input(frame)
        frames.append(frame)
    cap.release()
    
    # pad if fewer frames
    while len(frames) < num_frames:
        frames.append(frames[-1])
        
    return np.array(frames)


In [11]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.utils import to_categorical

# Parameters
MAX_VIDEOS_PER_CLASS = 20  # safe for Kaggle GPU
FRAMES_PER_VIDEO = 10
IMG_SIZE = 224
DATASET_PATH = "/kaggle/input/ucf101-videos"
VIDEOS_FOLDER = f"{DATASET_PATH}/train"  # videos inside train/
CSV_FILE = f"{DATASET_PATH}/train.csv"

# ----------------- Safe extract_frames -----------------
def extract_frames(video_path, num_frames=FRAMES_PER_VIDEO):
    cap = cv2.VideoCapture(video_path)
    frames = []
    
    if not cap.isOpened():
        print("Cannot open video:", video_path)
        return np.zeros((num_frames, IMG_SIZE, IMG_SIZE, 3))
    
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames == 0:
        cap.release()
        print("Empty video:", video_path)
        return np.zeros((num_frames, IMG_SIZE, IMG_SIZE, 3))
    
    step = max(total_frames // num_frames, 1)
    
    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        frame = preprocess_input(frame)
        frames.append(frame)
    
    cap.release()
    
    # pad frames if fewer than num_frames
    while len(frames) < num_frames:
        frames.append(frames[-1] if frames else np.zeros((IMG_SIZE, IMG_SIZE, 3)))
    
    return np.array(frames)

# ----------------- Load CSV -----------------
# Read CSV safely (some Kaggle CSVs may not have headers)
df = pd.read_csv(CSV_FILE, header=None, names=['video', 'label'])
print("First rows of CSV:\n", df.head())

# Limit videos per class
df = df.groupby('label').head(MAX_VIDEOS_PER_CLASS)

# Build label map
labels_unique = df['label'].unique()
label_map = {label: idx for idx, label in enumerate(labels_unique)}
NUM_CLASSES = len(label_map)

# ----------------- Build X, y -----------------
X, y = [], []

for idx, row in df.iterrows():
    vid_path = os.path.join(VIDEOS_FOLDER, row['video'])
    
    if not os.path.exists(vid_path):
        print("Missing video:", vid_path)
        frames = np.zeros((FRAMES_PER_VIDEO, IMG_SIZE, IMG_SIZE, 3))
    else:
        frames = extract_frames(vid_path, num_frames=FRAMES_PER_VIDEO)
    
    X.append(frames)
    y.append(label_map[row['label']])

X = np.array(X)  # shape: (num_videos, FRAMES_PER_VIDEO, 224,224,3)
y = to_categorical(y, NUM_CLASSES)

print("Dataset shape:", X.shape, y.shape)


First rows of CSV:
                        video        label
0                 video_name          tag
1  v_CricketShot_g08_c01.avi  CricketShot
2  v_CricketShot_g08_c02.avi  CricketShot
3  v_CricketShot_g08_c03.avi  CricketShot
4  v_CricketShot_g08_c04.avi  CricketShot
Missing video: /kaggle/input/ucf101-videos/train/video_name
Dataset shape: (101, 10, 224, 224, 3) (101, 6)


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


In [14]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation for frames
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.15,
    horizontal_flip=True,
    brightness_range=[0.8,1.2]
)

val_datagen = ImageDataGenerator()  # no augmentation for validation


In [15]:
def sequence_generator(X, y, batch_size=4, augmentor=None):
    n = len(X)
    while True:
        idx = np.random.permutation(n)
        for i in range(0, n, batch_size):
            batch_idx = idx[i:i+batch_size]
            batch_X = []
            for vid in X[batch_idx]:
                frames = []
                for frame in vid:
                    if augmentor:
                        frame = augmentor.random_transform(frame)
                    frames.append(frame)
                batch_X.append(np.array(frames))
            yield np.array(batch_X), y[batch_idx]

# Generators
train_gen = sequence_generator(X_train, y_train, augmentor=train_datagen)
val_gen = sequence_generator(X_val, y_val, augmentor=None)


In [16]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

cnn_base = MobileNetV2(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)

# Freeze all except last 15 layers
for layer in cnn_base.layers[:-15]:
    layer.trainable = False
for layer in cnn_base.layers[-15:]:
    layer.trainable = True


I0000 00:00:1767544973.188284    2092 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [17]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, TimeDistributed
from tensorflow.keras.optimizers import Adam

inputs = Input(shape=(FRAMES_PER_VIDEO, IMG_SIZE, IMG_SIZE, 3))
x = TimeDistributed(cnn_base)(inputs)  # Apply CNN to each frame
x = LSTM(128)(x)
x = Dropout(0.5)(x)
outputs = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(optimizer=Adam(1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [18]:
steps_per_epoch = len(X_train) // 4
val_steps = len(X_val) // 4

history = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=30,  # adjust as needed
    validation_data=val_gen,
    validation_steps=val_steps
)


Epoch 1/30


I0000 00:00:1767545034.893135    2661 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 1s/step - accuracy: 0.1607 - loss: 1.8670 - val_accuracy: 0.0833 - val_loss: 1.7599
Epoch 2/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 570ms/step - accuracy: 0.1062 - loss: 1.7789 - val_accuracy: 0.3333 - val_loss: 1.5421
Epoch 3/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 605ms/step - accuracy: 0.2336 - loss: 1.6227 - val_accuracy: 0.2500 - val_loss: 1.4610
Epoch 4/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 598ms/step - accuracy: 0.3820 - loss: 1.4054 - val_accuracy: 0.4167 - val_loss: 1.3139
Epoch 5/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 592ms/step - accuracy: 0.3614 - loss: 1.4304 - val_accuracy: 0.5833 - val_loss: 1.1971
Epoch 6/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 595ms/step - accuracy: 0.4638 - loss: 1.3018 - val_accuracy: 0.8333 - val_loss: 1.0840
Epoch 7/30
[1m17/17[0m [32m━━━━━━━

In [19]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step - accuracy: 0.9375 - loss: 0.2400
Test Accuracy: 0.9375


In [20]:
# Save the entire model to a single HDF5 file
MODEL_SAVE_PATH = "/kaggle/working/ucf101_mobilenet_lstm.h5"
model.save(MODEL_SAVE_PATH)

print("Model saved to:", MODEL_SAVE_PATH)




Model saved to: /kaggle/working/ucf101_mobilenet_lstm.h5


In [21]:
from tensorflow.keras.models import load_model

# Load HDF5 model
MODEL_PATH = "/kaggle/working/ucf101_mobilenet_lstm.h5"
model = load_model(MODEL_PATH)
print("Model loaded successfully!")




Model loaded successfully!


In [25]:
video_path = "/kaggle/input/ucf101-videos/test/v_Punch_g02_c01.avi"

# Extract frames
frames = extract_frames(video_path, num_frames=FRAMES_PER_VIDEO)

# Add batch dimension
frames = np.expand_dims(frames, axis=0)  # shape: (1, FRAMES_PER_VIDEO, 224,224,3)

# Predict
pred = model.predict(frames)
pred_class_idx = np.argmax(pred, axis=1)[0]
pred_label = list(label_map.keys())[list(label_map.values()).index(pred_class_idx)]

print("Predicted action:", pred_label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Predicted action: Punch
