In [2]:
# ============================================================
# HUMAN ACTION RECOGNITION – VIDEO DATASET (UCF-style)
# Videos -> MediaPipe Pose -> 30-frame Sequences -> CNN + BiLSTM
# Modes: build -> train -> realtime
# Author: Alok Rathour
# ============================================================

# -------------------- MODE --------------------
# "build"    : Read videos using train.csv -> create har_video_dataset.npz
# "train"    : Train CNN + BiLSTM on pose sequences
# "realtime" : Run live webcam detection
# Selects which branch of the RUN section at the bottom of this cell executes.
MODE = "build"   # change to: build / train / realtime

# -------------------- PATHS --------------------
# Clip paths read from the CSV are joined onto ROOT by build_from_videos().
ROOT = "."                # CDAC Project folder
TRAIN_CSV = "train.csv"   # has: clip_name, clip_path, label
# Written by "build" mode and read back by "train" mode.
DATA_PATH = "har_video_dataset.npz"

# Frames per pose sequence; also the size of the frame buffer in "realtime" mode.
SEQUENCE_LENGTH = 30
MAX_SEQS_PER_VIDEO = 10    # limit per video to keep dataset size manageable
# Training-loop settings used by the "train" branch.
EPOCHS = 30
BATCH_SIZE = 8

# -------------------- IMPORTS --------------------
import os
import cv2
import numpy as np
import pandas as pd
from collections import Counter, deque

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import mediapipe as mp

# -------------------- BASIC SETUP --------------------
# Landmarks per frame: extract_pose() returns this many zero points when nothing
# is detected, and CNN_BiLSTM derives its LSTM input size from it.
NUM_JOINTS = 33
# Best-model checkpoint written by "train" mode and loaded by "realtime" mode.
CHECKPOINT_PATH = "har_video_model.pth"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", DEVICE)

# -------------------- POSE (MEDIAPIPE) --------------------
mp_pose = mp.solutions.pose
# One module-level detector shared by every extract_pose() call in this cell.
# NOTE(review): it is created in every MODE (including "train", which never
# calls extract_pose) and is never closed - confirm that is intended.
# NOTE(review): static_image_mode=False presumably lets MediaPipe track across
# consecutive frames, so state may carry over between videos in "build" mode - verify.
pose_detector = mp_pose.Pose(static_image_mode=False, model_complexity=1)

def extract_pose(frame):
    """Run the shared MediaPipe detector on one frame and return its 2-D landmarks.

    Parameters
    ----------
    frame : image array
        Converted with ``cv2.COLOR_BGR2RGB`` before detection, so it is
        expected to be in OpenCV's BGR channel order.

    Returns
    -------
    np.ndarray
        float32 array with one ``(x, y)`` row per detected landmark. When no
        pose is found, ``NUM_JOINTS`` rows of zeros are returned instead.
    """
    result = pose_detector.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    landmarks = result.pose_landmarks
    if not landmarks:
        # No person detected: emit an all-zero placeholder of the same layout.
        return np.array([[0.0, 0.0]] * NUM_JOINTS, dtype=np.float32)
    return np.array([[lm.x, lm.y] for lm in landmarks.landmark], dtype=np.float32)

# -------------------- DATASET CLASS --------------------
class HARDataset(Dataset):
    """Map-style dataset pairing pose sequences with integer class labels.

    Items are converted to tensors on access: the sequence becomes float32
    with a singleton axis inserted at position 1, and the label becomes a
    long scalar.
    """

    def __init__(self, X, y):
        # Only references are stored; tensor conversion happens in __getitem__.
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        sequence = torch.tensor(self.X[i], dtype=torch.float32)
        label = torch.tensor(self.y[i], dtype=torch.long)
        # The inserted axis is the one CNN_BiLSTM.forward unpacks as C.
        return sequence.unsqueeze(1), label

def load_npz(path):
    """Load the dataset archive written by build_from_videos().

    Parameters
    ----------
    path : str or path-like
        Location of the ``.npz`` file holding ``X``, ``y`` and ``class_names``.

    Returns
    -------
    tuple of (list, list, list)
        ``X`` as a list of per-sequence arrays, ``y`` as a list of Python
        ints and ``class_names`` as a list of Python strings.

    Raises
    ------
    KeyError
        If one of the three arrays is missing from the archive.
    ValueError
        If the archive contains pickled (object) arrays.
    """
    # build_from_videos() stores only numeric and fixed-width string arrays, so
    # pickle support is not needed; refusing it avoids executing code from a
    # tampered file. The context manager closes the underlying zip handle.
    with np.load(path, allow_pickle=False) as d:
        X = list(d["X"])
        # Plain Python scalars (not np.int64 / np.str_) so that anything built
        # from them, e.g. the saved checkpoint, carries no NumPy objects.
        y = [int(v) for v in d["y"]]
        class_names = [str(c) for c in d["class_names"]]
    return X, y, class_names

# -------------------- MODEL (STABLE) --------------------
class CNN_BiLSTM(nn.Module):
    """Per-frame 2-D CNN followed by a bidirectional LSTM and an MLP classifier.

    ``forward`` expects a 5-D tensor unpacked as ``(B, T, C, H, W)`` and
    returns one row of ``ncls`` logits per batch element.
    """

    def __init__(self, ncls):
        super().__init__()
        hidden = 64
        frame_encoder = [
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
        ]
        self.cnn = nn.Sequential(*frame_encoder)
        # 32 output channels x joint axis halved twice by the (2,1) poolings;
        # the trailing 2 is presumably the (x, y) width, which pooling leaves alone.
        self.feature = 32 * (NUM_JOINTS // 4) * 2
        self.lstm = nn.LSTM(self.feature, hidden, num_layers=1, batch_first=True, bidirectional=True)
        # The LSTM emits 2 * hidden features per step (both directions concatenated).
        self.head = nn.Sequential(
            nn.Linear(2 * hidden, hidden), nn.ReLU(), nn.Dropout(0.5), nn.Linear(hidden, ncls)
        )

    def forward(self, x):
        batch, steps, chans, height, width = x.shape
        # Fold time into the batch axis so the CNN sees one frame per row.
        frames = x.view(batch * steps, chans, height, width)
        per_frame = self.cnn(frames).view(batch, steps, -1)
        seq_out, _ = self.lstm(per_frame)
        # Classify from the output at the final time step.
        # NOTE(review): at that step the backward direction has only processed
        # the last frame; changing the readout would alter what existing
        # checkpoints compute, so it is left as is.
        last_step = seq_out[:, -1, :]
        return self.head(last_step)

# -------------------- BUILD FROM VIDEOS --------------------
def build_from_videos():
    """Turn the videos listed in TRAIN_CSV into fixed-length pose sequences and save them.

    Reads TRAIN_CSV (the ``clip_path`` and ``label`` columns are used), runs
    extract_pose() on the frames of every video found under ROOT, cuts each
    video's pose list into non-overlapping windows of SEQUENCE_LENGTH frames
    (at most MAX_SEQS_PER_VIDEO per video) and writes ``X``, ``y`` and
    ``class_names`` to DATA_PATH with ``np.savez``.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If no sequence could be extracted; DATA_PATH is left untouched then,
        so an existing dataset is not overwritten with an empty one.
    """
    df = pd.read_csv(TRAIN_CSV)
    df["label"] = df["label"].astype(str).str.strip()

    class_names = sorted(df["label"].unique())
    label_map = {name: idx for idx, name in enumerate(class_names)}

    print("Detected classes:", class_names)
    print("Total classes:", len(class_names))

    # Only the leading windows of a video are kept, so frames past this point
    # would be pose-estimated and then thrown away; stop decoding there.
    # max(..., 1) keeps at least one window per video for any setting.
    windows_per_video = max(MAX_SEQS_PER_VIDEO, 1)
    max_frames = SEQUENCE_LENGTH * windows_per_video

    X, y = [], []
    missing = unreadable = 0

    for clip_path, label_name in zip(df["clip_path"], df["label"]):
        # str() tolerates empty CSV cells, which pandas reads as float NaN.
        video_path = os.path.join(ROOT, str(clip_path).lstrip("/"))
        if not os.path.exists(video_path):
            missing += 1
            continue

        # NOTE(review): extract_pose() uses one shared detector, so any tracking
        # state it keeps may carry over from the previous video - verify.
        cap = cv2.VideoCapture(video_path)
        poses = []
        try:
            if not cap.isOpened():
                unreadable += 1
                continue
            while len(poses) < max_frames:
                ok, fr = cap.read()
                if not ok:
                    break
                poses.append(extract_pose(fr))
        finally:
            # Release the decoder even if pose extraction raises.
            cap.release()

        label = label_map[label_name]
        n_windows = min(len(poses) // SEQUENCE_LENGTH, windows_per_video)
        for w in range(n_windows):
            start = w * SEQUENCE_LENGTH
            X.append(np.array(poses[start:start + SEQUENCE_LENGTH]))
            y.append(label)

    if missing or unreadable:
        # Report what was skipped instead of dropping videos silently.
        print(f"Skipped videos -> missing: {missing}, unreadable: {unreadable}")

    if not X:
        raise RuntimeError(
            f"No pose sequences were extracted from {TRAIN_CSV}; {DATA_PATH} was not written"
        )

    X = np.array(X)
    y = np.array(y)

    np.savez(DATA_PATH, X=X, y=y, class_names=np.array(class_names))
    print("Dataset created:", DATA_PATH)
    print("X:", X.shape, "y:", y.shape)

# -------------------- TRAINING HELPERS --------------------
def accuracy(p, y):
    """Return, as a Python float, the fraction of rows in ``p`` whose arg-max along dim 1 equals ``y``."""
    predicted = p.argmax(1)
    hits = predicted == y
    return hits.float().mean().item()

def balanced_loader(ds, bs):
    """Build a DataLoader that samples items inversely to their class frequency.

    Parameters
    ----------
    ds : dataset exposing its labels as ``ds.y``
    bs : batch size passed to the DataLoader

    Returns
    -------
    DataLoader
        Uses a WeightedRandomSampler that draws ``len(ds.y)`` indices per
        epoch with replacement, each weighted by ``1 / count(label)``.
    """
    labels = ds.y
    class_counts = Counter(labels)
    print("Class Dist:", class_counts)
    per_sample_weight = [1.0 / class_counts[label] for label in labels]
    return DataLoader(
        ds,
        batch_size=bs,
        sampler=WeightedRandomSampler(per_sample_weight, len(per_sample_weight), replacement=True),
    )

# -------------------- RUN --------------------
# Dispatch on MODE: build the dataset, train the model, or run the webcam demo.
if MODE == "build":
    build_from_videos()

elif MODE == "train":
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(DATA_PATH)

    X, y, class_names = load_npz(DATA_PATH)
    # Plain Python strings keep NumPy scalar objects out of the checkpoint, so
    # it stays loadable by torch.load() in its restricted weights_only mode.
    class_names = [str(c) for c in class_names]
    NUM_CLASSES = len(class_names)

    # Fixed seed so the train/val split and the run are repeatable.
    SEED = 42
    torch.manual_seed(SEED)
    idx = np.random.default_rng(SEED).permutation(len(X))
    split = int(0.8 * len(X))
    tr_idx, va_idx = idx[:split], idx[split:]
    if len(tr_idx) == 0 or len(va_idx) == 0:
        raise ValueError(
            f"Need at least 2 sequences for a train/val split, found {len(X)} in {DATA_PATH}"
        )
    # NOTE(review): the archive stores no video ids, so this split is per
    # sequence; windows cut from one video can land in both train and val.

    tr = HARDataset([X[i] for i in tr_idx], [y[i] for i in tr_idx])
    va = HARDataset([X[i] for i in va_idx], [y[i] for i in va_idx])

    model = CNN_BiLSTM(NUM_CLASSES).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()

    tr_ld = balanced_loader(tr, BATCH_SIZE)
    va_ld = DataLoader(va, BATCH_SIZE, shuffle=False)

    best = 0.0
    for e in range(EPOCHS):
        # ---- training pass ----
        model.train()
        train_loss = train_acc = 0.0
        n_train = 0
        for xb, yb in tr_ld:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            n = yb.size(0)
            train_loss += loss.item() * n
            train_acc += accuracy(out, yb) * n
            n_train += n
        train_loss /= n_train
        train_acc /= n_train

        # ---- validation pass ----
        # Separate metric names: the original accumulator reused `va` and
        # overwrote the validation dataset variable.
        model.eval()
        val_loss = val_acc = 0.0
        n_val = 0
        with torch.no_grad():
            for xb, yb in va_ld:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                out = model(xb)
                n = yb.size(0)
                val_loss += criterion(out, yb).item() * n
                val_acc += accuracy(out, yb) * n
                n_val += n
        val_loss /= n_val
        val_acc /= n_val

        print(f"E{e+1} TL{train_loss:.3f} TA{train_acc:.3f} | VL{val_loss:.3f} VA{val_acc:.3f}")
        if val_acc > best:
            # Keep only the checkpoint with the best validation accuracy so far.
            best = val_acc
            torch.save({"model": model.state_dict(), "classes": class_names}, CHECKPOINT_PATH)
            print("Saved best")

elif MODE == "realtime":
    if not os.path.exists(CHECKPOINT_PATH):
        raise FileNotFoundError(CHECKPOINT_PATH)

    # NOTE: torch.load unpickles the file; only load checkpoints you produced yourself.
    ck = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    class_names = [str(c) for c in ck["classes"]]
    NUM_CLASSES = len(class_names)

    model = CNN_BiLSTM(NUM_CLASSES).to(DEVICE)
    model.load_state_dict(ck["model"])
    model.eval()

    cap = cv2.VideoCapture(0)
    # Sliding window holding the most recent SEQUENCE_LENGTH poses.
    buf = deque(maxlen=SEQUENCE_LENGTH)

    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open webcam (device 0)")

        while True:
            ok, fr = cap.read()
            if not ok:
                break
            buf.append(extract_pose(fr))

            txt = "Collecting"
            if len(buf) == SEQUENCE_LENGTH:
                # (T, J, 2) -> (1, T, 1, J, 2), the layout HARDataset feeds the model.
                x = torch.tensor(np.array(buf), dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(DEVICE)
                with torch.no_grad():
                    out = model(x)
                    p = torch.softmax(out, 1)
                    c, i = p.max(1)
                txt = f"{class_names[int(i.item())]} ({float(c.item()):.2f})"

            cv2.putText(fr, txt, (20,40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
            cv2.imshow("HAR", fr)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Free the camera and close the window even if inference raises.
        cap.release()
        cv2.destroyAllWindows()

else:
    raise ValueError(f"Unknown MODE: {MODE!r} (expected 'build', 'train' or 'realtime')")


Running on: cuda
Detected classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress', 'Biking', 'Billiards', 'BlowDryHair', 'BlowingCandles', 'BodyWeightSquats', 'Bowling', 'BoxingPunchingBag', 'BoxingSpeedBag', 'BreastStroke', 'BrushingTeeth', 'CleanAndJerk', 'CliffDiving', 'CricketBowling', 'CricketShot', 'CuttingInKitchen', 'Diving', 'Drumming', 'Fencing', 'FieldHockeyPenalty', 'FloorGymnastics', 'FrisbeeCatch', 'FrontCrawl', 'GolfSwing', 'Haircut', 'HammerThrow', 'Hammering', 'HandstandPushups', 'HandstandWalking', 'HeadMassage', 'HighJump', 'HorseRace', 'HorseRiding', 'HulaHoop', 'IceDancing', 'JavelinThrow', 'JugglingBalls', 'JumpRope', 'JumpingJack', 'Kayaking', 'Knitting', 'LongJump', 'Lunges', 'MilitaryParade', 'Mixing', 'MoppingFloor', 'Nunchucks', 'ParallelBars', 'PizzaTossing', 'PlayingCello', 'PlayingDaf', 'PlayingDhol', 'PlayingFlute', 'PlayingGuitar', 'PlayingPiano',



Dataset created: har_video_dataset.npz
X: (54991, 30, 33, 2) y: (54991,)


In [2]:
# ============================================================
# HUMAN ACTION RECOGNITION – VIDEO DATASET (UCF-style)
# Videos -> MediaPipe Pose -> 30-frame Sequences -> CNN + BiLSTM
# Modes: build -> train -> realtime
# Author: Alok Rathour
# ============================================================

# -------------------- MODE --------------------
# "build"    : Read videos using train.csv -> create har_video_dataset.npz
# "train"    : Train CNN + BiLSTM on pose sequences
# "realtime" : Run live webcam detection
# Selects which branch of the RUN section at the bottom of this cell executes.
MODE = "train"   # change to: build / train / realtime

# -------------------- PATHS --------------------
# Clip paths read from the CSV are joined onto ROOT by build_from_videos().
ROOT = "."                # CDAC Project folder
TRAIN_CSV = "train.csv"   # has: clip_name, clip_path, label
# Written by "build" mode and read back by "train" mode.
DATA_PATH = "har_video_dataset.npz"

# Frames per pose sequence; also the size of the frame buffer in "realtime" mode.
SEQUENCE_LENGTH = 30
MAX_SEQS_PER_VIDEO = 10    # limit per video to keep dataset size manageable
# Training-loop settings used by the "train" branch.
EPOCHS = 30
BATCH_SIZE = 8

# -------------------- IMPORTS --------------------
import os
import cv2
import numpy as np
import pandas as pd
from collections import Counter, deque

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import mediapipe as mp

# -------------------- BASIC SETUP --------------------
# Landmarks per frame: extract_pose() returns this many zero points when nothing
# is detected, and CNN_BiLSTM derives its LSTM input size from it.
NUM_JOINTS = 33
# Best-model checkpoint written by "train" mode and loaded by "realtime" mode.
CHECKPOINT_PATH = "har_video_model.pth"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", DEVICE)

# -------------------- POSE (MEDIAPIPE) --------------------
mp_pose = mp.solutions.pose
# One module-level detector shared by every extract_pose() call in this cell.
# NOTE(review): it is created in every MODE (including "train", which never
# calls extract_pose) and is never closed - confirm that is intended.
# NOTE(review): static_image_mode=False presumably lets MediaPipe track across
# consecutive frames, so state may carry over between videos in "build" mode - verify.
pose_detector = mp_pose.Pose(static_image_mode=False, model_complexity=1)

def extract_pose(frame):
    """Run the shared MediaPipe detector on one frame and return its 2-D landmarks.

    Parameters
    ----------
    frame : image array
        Converted with ``cv2.COLOR_BGR2RGB`` before detection, so it is
        expected to be in OpenCV's BGR channel order.

    Returns
    -------
    np.ndarray
        float32 array with one ``(x, y)`` row per detected landmark. When no
        pose is found, ``NUM_JOINTS`` rows of zeros are returned instead.
    """
    result = pose_detector.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    landmarks = result.pose_landmarks
    if not landmarks:
        # No person detected: emit an all-zero placeholder of the same layout.
        return np.array([[0.0, 0.0]] * NUM_JOINTS, dtype=np.float32)
    return np.array([[lm.x, lm.y] for lm in landmarks.landmark], dtype=np.float32)

# -------------------- DATASET CLASS --------------------
class HARDataset(Dataset):
    """Map-style dataset pairing pose sequences with integer class labels.

    Items are converted to tensors on access: the sequence becomes float32
    with a singleton axis inserted at position 1, and the label becomes a
    long scalar.
    """

    def __init__(self, X, y):
        # Only references are stored; tensor conversion happens in __getitem__.
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        sequence = torch.tensor(self.X[i], dtype=torch.float32)
        label = torch.tensor(self.y[i], dtype=torch.long)
        # The inserted axis is the one CNN_BiLSTM.forward unpacks as C.
        return sequence.unsqueeze(1), label

def load_npz(path):
    """Load the dataset archive written by build_from_videos().

    Parameters
    ----------
    path : str or path-like
        Location of the ``.npz`` file holding ``X``, ``y`` and ``class_names``.

    Returns
    -------
    tuple of (list, list, list)
        ``X`` as a list of per-sequence arrays, ``y`` as a list of Python
        ints and ``class_names`` as a list of Python strings.

    Raises
    ------
    KeyError
        If one of the three arrays is missing from the archive.
    ValueError
        If the archive contains pickled (object) arrays.
    """
    # build_from_videos() stores only numeric and fixed-width string arrays, so
    # pickle support is not needed; refusing it avoids executing code from a
    # tampered file. The context manager closes the underlying zip handle.
    with np.load(path, allow_pickle=False) as d:
        X = list(d["X"])
        # Plain Python scalars (not np.int64 / np.str_) so that anything built
        # from them, e.g. the saved checkpoint, carries no NumPy objects.
        y = [int(v) for v in d["y"]]
        class_names = [str(c) for c in d["class_names"]]
    return X, y, class_names

# -------------------- MODEL (STABLE) --------------------
class CNN_BiLSTM(nn.Module):
    """Per-frame 2-D CNN followed by a bidirectional LSTM and an MLP classifier.

    ``forward`` expects a 5-D tensor unpacked as ``(B, T, C, H, W)`` and
    returns one row of ``ncls`` logits per batch element.
    """

    def __init__(self, ncls):
        super().__init__()
        hidden = 64
        frame_encoder = [
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
        ]
        self.cnn = nn.Sequential(*frame_encoder)
        # 32 output channels x joint axis halved twice by the (2,1) poolings;
        # the trailing 2 is presumably the (x, y) width, which pooling leaves alone.
        self.feature = 32 * (NUM_JOINTS // 4) * 2
        self.lstm = nn.LSTM(self.feature, hidden, num_layers=1, batch_first=True, bidirectional=True)
        # The LSTM emits 2 * hidden features per step (both directions concatenated).
        self.head = nn.Sequential(
            nn.Linear(2 * hidden, hidden), nn.ReLU(), nn.Dropout(0.5), nn.Linear(hidden, ncls)
        )

    def forward(self, x):
        batch, steps, chans, height, width = x.shape
        # Fold time into the batch axis so the CNN sees one frame per row.
        frames = x.view(batch * steps, chans, height, width)
        per_frame = self.cnn(frames).view(batch, steps, -1)
        seq_out, _ = self.lstm(per_frame)
        # Classify from the output at the final time step.
        # NOTE(review): at that step the backward direction has only processed
        # the last frame; changing the readout would alter what existing
        # checkpoints compute, so it is left as is.
        last_step = seq_out[:, -1, :]
        return self.head(last_step)

# -------------------- BUILD FROM VIDEOS --------------------
def build_from_videos():
    """Turn the videos listed in TRAIN_CSV into fixed-length pose sequences and save them.

    Reads TRAIN_CSV (the ``clip_path`` and ``label`` columns are used), runs
    extract_pose() on the frames of every video found under ROOT, cuts each
    video's pose list into non-overlapping windows of SEQUENCE_LENGTH frames
    (at most MAX_SEQS_PER_VIDEO per video) and writes ``X``, ``y`` and
    ``class_names`` to DATA_PATH with ``np.savez``.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If no sequence could be extracted; DATA_PATH is left untouched then,
        so an existing dataset is not overwritten with an empty one.
    """
    df = pd.read_csv(TRAIN_CSV)
    df["label"] = df["label"].astype(str).str.strip()

    class_names = sorted(df["label"].unique())
    label_map = {name: idx for idx, name in enumerate(class_names)}

    print("Detected classes:", class_names)
    print("Total classes:", len(class_names))

    # Only the leading windows of a video are kept, so frames past this point
    # would be pose-estimated and then thrown away; stop decoding there.
    # max(..., 1) keeps at least one window per video for any setting.
    windows_per_video = max(MAX_SEQS_PER_VIDEO, 1)
    max_frames = SEQUENCE_LENGTH * windows_per_video

    X, y = [], []
    missing = unreadable = 0

    for clip_path, label_name in zip(df["clip_path"], df["label"]):
        # str() tolerates empty CSV cells, which pandas reads as float NaN.
        video_path = os.path.join(ROOT, str(clip_path).lstrip("/"))
        if not os.path.exists(video_path):
            missing += 1
            continue

        # NOTE(review): extract_pose() uses one shared detector, so any tracking
        # state it keeps may carry over from the previous video - verify.
        cap = cv2.VideoCapture(video_path)
        poses = []
        try:
            if not cap.isOpened():
                unreadable += 1
                continue
            while len(poses) < max_frames:
                ok, fr = cap.read()
                if not ok:
                    break
                poses.append(extract_pose(fr))
        finally:
            # Release the decoder even if pose extraction raises.
            cap.release()

        label = label_map[label_name]
        n_windows = min(len(poses) // SEQUENCE_LENGTH, windows_per_video)
        for w in range(n_windows):
            start = w * SEQUENCE_LENGTH
            X.append(np.array(poses[start:start + SEQUENCE_LENGTH]))
            y.append(label)

    if missing or unreadable:
        # Report what was skipped instead of dropping videos silently.
        print(f"Skipped videos -> missing: {missing}, unreadable: {unreadable}")

    if not X:
        raise RuntimeError(
            f"No pose sequences were extracted from {TRAIN_CSV}; {DATA_PATH} was not written"
        )

    X = np.array(X)
    y = np.array(y)

    np.savez(DATA_PATH, X=X, y=y, class_names=np.array(class_names))
    print("Dataset created:", DATA_PATH)
    print("X:", X.shape, "y:", y.shape)

# -------------------- TRAINING HELPERS --------------------
def accuracy(p, y):
    """Return, as a Python float, the fraction of rows in ``p`` whose arg-max along dim 1 equals ``y``."""
    predicted = p.argmax(1)
    hits = predicted == y
    return hits.float().mean().item()

def balanced_loader(ds, bs):
    """Build a DataLoader that samples items inversely to their class frequency.

    Parameters
    ----------
    ds : dataset exposing its labels as ``ds.y``
    bs : batch size passed to the DataLoader

    Returns
    -------
    DataLoader
        Uses a WeightedRandomSampler that draws ``len(ds.y)`` indices per
        epoch with replacement, each weighted by ``1 / count(label)``.
    """
    labels = ds.y
    class_counts = Counter(labels)
    print("Class Dist:", class_counts)
    per_sample_weight = [1.0 / class_counts[label] for label in labels]
    return DataLoader(
        ds,
        batch_size=bs,
        sampler=WeightedRandomSampler(per_sample_weight, len(per_sample_weight), replacement=True),
    )

# -------------------- RUN --------------------
# Dispatch on MODE: build the dataset, train the model, or run the webcam demo.
if MODE == "build":
    build_from_videos()

elif MODE == "train":
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(DATA_PATH)

    X, y, class_names = load_npz(DATA_PATH)
    # Plain Python strings keep NumPy scalar objects out of the checkpoint, so
    # it stays loadable by torch.load() in its restricted weights_only mode.
    class_names = [str(c) for c in class_names]
    NUM_CLASSES = len(class_names)

    # Fixed seed so the train/val split and the run are repeatable.
    SEED = 42
    torch.manual_seed(SEED)
    idx = np.random.default_rng(SEED).permutation(len(X))
    split = int(0.8 * len(X))
    tr_idx, va_idx = idx[:split], idx[split:]
    if len(tr_idx) == 0 or len(va_idx) == 0:
        raise ValueError(
            f"Need at least 2 sequences for a train/val split, found {len(X)} in {DATA_PATH}"
        )
    # NOTE(review): the archive stores no video ids, so this split is per
    # sequence; windows cut from one video can land in both train and val.

    tr = HARDataset([X[i] for i in tr_idx], [y[i] for i in tr_idx])
    va = HARDataset([X[i] for i in va_idx], [y[i] for i in va_idx])

    model = CNN_BiLSTM(NUM_CLASSES).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()

    tr_ld = balanced_loader(tr, BATCH_SIZE)
    va_ld = DataLoader(va, BATCH_SIZE, shuffle=False)

    best = 0.0
    for e in range(EPOCHS):
        # ---- training pass ----
        model.train()
        train_loss = train_acc = 0.0
        n_train = 0
        for xb, yb in tr_ld:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            n = yb.size(0)
            train_loss += loss.item() * n
            train_acc += accuracy(out, yb) * n
            n_train += n
        train_loss /= n_train
        train_acc /= n_train

        # ---- validation pass ----
        # Separate metric names: the original accumulator reused `va` and
        # overwrote the validation dataset variable.
        model.eval()
        val_loss = val_acc = 0.0
        n_val = 0
        with torch.no_grad():
            for xb, yb in va_ld:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                out = model(xb)
                n = yb.size(0)
                val_loss += criterion(out, yb).item() * n
                val_acc += accuracy(out, yb) * n
                n_val += n
        val_loss /= n_val
        val_acc /= n_val

        print(f"E{e+1} TL{train_loss:.3f} TA{train_acc:.3f} | VL{val_loss:.3f} VA{val_acc:.3f}")
        if val_acc > best:
            # Keep only the checkpoint with the best validation accuracy so far.
            best = val_acc
            torch.save({"model": model.state_dict(), "classes": class_names}, CHECKPOINT_PATH)
            print("Saved best")

elif MODE == "realtime":
    if not os.path.exists(CHECKPOINT_PATH):
        raise FileNotFoundError(CHECKPOINT_PATH)

    # NOTE: torch.load unpickles the file; only load checkpoints you produced yourself.
    ck = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    class_names = [str(c) for c in ck["classes"]]
    NUM_CLASSES = len(class_names)

    model = CNN_BiLSTM(NUM_CLASSES).to(DEVICE)
    model.load_state_dict(ck["model"])
    model.eval()

    cap = cv2.VideoCapture(0)
    # Sliding window holding the most recent SEQUENCE_LENGTH poses.
    buf = deque(maxlen=SEQUENCE_LENGTH)

    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open webcam (device 0)")

        while True:
            ok, fr = cap.read()
            if not ok:
                break
            buf.append(extract_pose(fr))

            txt = "Collecting"
            if len(buf) == SEQUENCE_LENGTH:
                # (T, J, 2) -> (1, T, 1, J, 2), the layout HARDataset feeds the model.
                x = torch.tensor(np.array(buf), dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(DEVICE)
                with torch.no_grad():
                    out = model(x)
                    p = torch.softmax(out, 1)
                    c, i = p.max(1)
                txt = f"{class_names[int(i.item())]} ({float(c.item()):.2f})"

            cv2.putText(fr, txt, (20,40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
            cv2.imshow("HAR", fr)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Free the camera and close the window even if inference raises.
        cap.release()
        cv2.destroyAllWindows()

else:
    raise ValueError(f"Unknown MODE: {MODE!r} (expected 'build', 'train' or 'realtime')")

Running on: cuda
E1 TL3.478 TA0.132 | VL16.473 VA0.000
E2 TL2.885 TA0.241 | VL19.984 VA0.000
Saved best
E3 TL2.550 TA0.325 | VL23.386 VA0.002
Saved best
E4 TL2.349 TA0.374 | VL26.041 VA0.002
E5 TL2.160 TA0.420 | VL28.090 VA0.003
Saved best
E6 TL2.017 TA0.458 | VL28.873 VA0.002
E7 TL1.896 TA0.490 | VL30.948 VA0.004
Saved best
E8 TL1.818 TA0.514 | VL31.837 VA0.001
E9 TL1.741 TA0.532 | VL32.131 VA0.002
E10 TL1.646 TA0.557 | VL34.428 VA0.001
E11 TL1.572 TA0.577 | VL32.965 VA0.002
E12 TL1.526 TA0.590 | VL33.038 VA0.002
E13 TL1.464 TA0.606 | VL34.534 VA0.003
E14 TL1.426 TA0.614 | VL35.147 VA0.003
E15 TL1.369 TA0.627 | VL35.689 VA0.001
E16 TL1.322 TA0.641 | VL36.279 VA0.001
E17 TL1.290 TA0.653 | VL35.566 VA0.002
E18 TL1.262 TA0.657 | VL36.250 VA0.002
E19 TL1.226 TA0.668 | VL38.021 VA0.002
E20 TL1.197 TA0.675 | VL39.487 VA0.001
E21 TL1.157 TA0.687 | VL37.658 VA0.002
E22 TL1.132 TA0.691 | VL39.156 VA0.001
E23 TL1.102 TA0.701 | VL38.584 VA0.001
E24 TL1.076 TA0.705 | VL39.434 VA0.001
E25 TL1.056 

In [3]:
# ============================================================
# HUMAN ACTION RECOGNITION – VIDEO DATASET (UCF-style)
# Videos -> MediaPipe Pose -> 30-frame Sequences -> CNN + BiLSTM
# Modes: build -> train -> realtime
# Author: Alok Rathour
# ============================================================

# -------------------- MODE --------------------
# "build"    : Read videos using train.csv -> create har_video_dataset.npz
# "train"    : Train CNN + BiLSTM on pose sequences
# "realtime" : Run live webcam detection
# Selects which branch of the RUN section at the bottom of this cell executes.
MODE = "realtime"   # change to: build / train / realtime

# -------------------- PATHS --------------------
# Clip paths read from the CSV are joined onto ROOT by build_from_videos().
ROOT = "."                # CDAC Project folder
TRAIN_CSV = "train.csv"   # has: clip_name, clip_path, label
# Written by "build" mode and read back by "train" mode.
DATA_PATH = "har_video_dataset.npz"

# Frames per pose sequence; also the size of the frame buffer in "realtime" mode.
SEQUENCE_LENGTH = 30
MAX_SEQS_PER_VIDEO = 10    # limit per video to keep dataset size manageable
# Training-loop settings used by the "train" branch.
EPOCHS = 30
BATCH_SIZE = 8

# -------------------- IMPORTS --------------------
import os
import cv2
import numpy as np
import pandas as pd
from collections import Counter, deque

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import mediapipe as mp

# -------------------- BASIC SETUP --------------------
# Landmarks per frame: extract_pose() returns this many zero points when nothing
# is detected, and CNN_BiLSTM derives its LSTM input size from it.
NUM_JOINTS = 33
# Best-model checkpoint written by "train" mode and loaded by "realtime" mode.
CHECKPOINT_PATH = "har_video_model.pth"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", DEVICE)

# -------------------- POSE (MEDIAPIPE) --------------------
mp_pose = mp.solutions.pose
# One module-level detector shared by every extract_pose() call in this cell.
# NOTE(review): it is created in every MODE (including "train", which never
# calls extract_pose) and is never closed - confirm that is intended.
# NOTE(review): static_image_mode=False presumably lets MediaPipe track across
# consecutive frames, so state may carry over between videos in "build" mode - verify.
pose_detector = mp_pose.Pose(static_image_mode=False, model_complexity=1)

def extract_pose(frame):
    """Run the shared MediaPipe detector on one frame and return its 2-D landmarks.

    Parameters
    ----------
    frame : image array
        Converted with ``cv2.COLOR_BGR2RGB`` before detection, so it is
        expected to be in OpenCV's BGR channel order.

    Returns
    -------
    np.ndarray
        float32 array with one ``(x, y)`` row per detected landmark. When no
        pose is found, ``NUM_JOINTS`` rows of zeros are returned instead.
    """
    result = pose_detector.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    landmarks = result.pose_landmarks
    if not landmarks:
        # No person detected: emit an all-zero placeholder of the same layout.
        return np.array([[0.0, 0.0]] * NUM_JOINTS, dtype=np.float32)
    return np.array([[lm.x, lm.y] for lm in landmarks.landmark], dtype=np.float32)

# -------------------- DATASET CLASS --------------------
class HARDataset(Dataset):
    """Map-style dataset pairing pose sequences with integer class labels.

    Items are converted to tensors on access: the sequence becomes float32
    with a singleton axis inserted at position 1, and the label becomes a
    long scalar.
    """

    def __init__(self, X, y):
        # Only references are stored; tensor conversion happens in __getitem__.
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        sequence = torch.tensor(self.X[i], dtype=torch.float32)
        label = torch.tensor(self.y[i], dtype=torch.long)
        # The inserted axis is the one CNN_BiLSTM.forward unpacks as C.
        return sequence.unsqueeze(1), label

def load_npz(path):
    """Load the dataset archive written by build_from_videos().

    Parameters
    ----------
    path : str or path-like
        Location of the ``.npz`` file holding ``X``, ``y`` and ``class_names``.

    Returns
    -------
    tuple of (list, list, list)
        ``X`` as a list of per-sequence arrays, ``y`` as a list of Python
        ints and ``class_names`` as a list of Python strings.

    Raises
    ------
    KeyError
        If one of the three arrays is missing from the archive.
    ValueError
        If the archive contains pickled (object) arrays.
    """
    # build_from_videos() stores only numeric and fixed-width string arrays, so
    # pickle support is not needed; refusing it avoids executing code from a
    # tampered file. The context manager closes the underlying zip handle.
    with np.load(path, allow_pickle=False) as d:
        X = list(d["X"])
        # Plain Python scalars (not np.int64 / np.str_) so that anything built
        # from them, e.g. the saved checkpoint, carries no NumPy objects.
        y = [int(v) for v in d["y"]]
        class_names = [str(c) for c in d["class_names"]]
    return X, y, class_names

# -------------------- MODEL (STABLE) --------------------
class CNN_BiLSTM(nn.Module):
    """Per-frame 2-D CNN followed by a bidirectional LSTM and an MLP classifier.

    ``forward`` expects a 5-D tensor unpacked as ``(B, T, C, H, W)`` and
    returns one row of ``ncls`` logits per batch element.
    """

    def __init__(self, ncls):
        super().__init__()
        hidden = 64
        frame_encoder = [
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
        ]
        self.cnn = nn.Sequential(*frame_encoder)
        # 32 output channels x joint axis halved twice by the (2,1) poolings;
        # the trailing 2 is presumably the (x, y) width, which pooling leaves alone.
        self.feature = 32 * (NUM_JOINTS // 4) * 2
        self.lstm = nn.LSTM(self.feature, hidden, num_layers=1, batch_first=True, bidirectional=True)
        # The LSTM emits 2 * hidden features per step (both directions concatenated).
        self.head = nn.Sequential(
            nn.Linear(2 * hidden, hidden), nn.ReLU(), nn.Dropout(0.5), nn.Linear(hidden, ncls)
        )

    def forward(self, x):
        batch, steps, chans, height, width = x.shape
        # Fold time into the batch axis so the CNN sees one frame per row.
        frames = x.view(batch * steps, chans, height, width)
        per_frame = self.cnn(frames).view(batch, steps, -1)
        seq_out, _ = self.lstm(per_frame)
        # Classify from the output at the final time step.
        # NOTE(review): at that step the backward direction has only processed
        # the last frame; changing the readout would alter what existing
        # checkpoints compute, so it is left as is.
        last_step = seq_out[:, -1, :]
        return self.head(last_step)

# -------------------- BUILD FROM VIDEOS --------------------
def build_from_videos():
    """Turn the videos listed in TRAIN_CSV into fixed-length pose sequences and save them.

    Reads TRAIN_CSV (the ``clip_path`` and ``label`` columns are used), runs
    extract_pose() on the frames of every video found under ROOT, cuts each
    video's pose list into non-overlapping windows of SEQUENCE_LENGTH frames
    (at most MAX_SEQS_PER_VIDEO per video) and writes ``X``, ``y`` and
    ``class_names`` to DATA_PATH with ``np.savez``.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If no sequence could be extracted; DATA_PATH is left untouched then,
        so an existing dataset is not overwritten with an empty one.
    """
    df = pd.read_csv(TRAIN_CSV)
    df["label"] = df["label"].astype(str).str.strip()

    class_names = sorted(df["label"].unique())
    label_map = {name: idx for idx, name in enumerate(class_names)}

    print("Detected classes:", class_names)
    print("Total classes:", len(class_names))

    # Only the leading windows of a video are kept, so frames past this point
    # would be pose-estimated and then thrown away; stop decoding there.
    # max(..., 1) keeps at least one window per video for any setting.
    windows_per_video = max(MAX_SEQS_PER_VIDEO, 1)
    max_frames = SEQUENCE_LENGTH * windows_per_video

    X, y = [], []
    missing = unreadable = 0

    for clip_path, label_name in zip(df["clip_path"], df["label"]):
        # str() tolerates empty CSV cells, which pandas reads as float NaN.
        video_path = os.path.join(ROOT, str(clip_path).lstrip("/"))
        if not os.path.exists(video_path):
            missing += 1
            continue

        # NOTE(review): extract_pose() uses one shared detector, so any tracking
        # state it keeps may carry over from the previous video - verify.
        cap = cv2.VideoCapture(video_path)
        poses = []
        try:
            if not cap.isOpened():
                unreadable += 1
                continue
            while len(poses) < max_frames:
                ok, fr = cap.read()
                if not ok:
                    break
                poses.append(extract_pose(fr))
        finally:
            # Release the decoder even if pose extraction raises.
            cap.release()

        label = label_map[label_name]
        n_windows = min(len(poses) // SEQUENCE_LENGTH, windows_per_video)
        for w in range(n_windows):
            start = w * SEQUENCE_LENGTH
            X.append(np.array(poses[start:start + SEQUENCE_LENGTH]))
            y.append(label)

    if missing or unreadable:
        # Report what was skipped instead of dropping videos silently.
        print(f"Skipped videos -> missing: {missing}, unreadable: {unreadable}")

    if not X:
        raise RuntimeError(
            f"No pose sequences were extracted from {TRAIN_CSV}; {DATA_PATH} was not written"
        )

    X = np.array(X)
    y = np.array(y)

    np.savez(DATA_PATH, X=X, y=y, class_names=np.array(class_names))
    print("Dataset created:", DATA_PATH)
    print("X:", X.shape, "y:", y.shape)

# -------------------- TRAINING HELPERS --------------------
def accuracy(p, y):
    """Return, as a Python float, the fraction of rows in ``p`` whose arg-max along dim 1 equals ``y``."""
    predicted = p.argmax(1)
    hits = predicted == y
    return hits.float().mean().item()

def balanced_loader(ds, bs):
    """Build a DataLoader that samples items inversely to their class frequency.

    Parameters
    ----------
    ds : dataset exposing its labels as ``ds.y``
    bs : batch size passed to the DataLoader

    Returns
    -------
    DataLoader
        Uses a WeightedRandomSampler that draws ``len(ds.y)`` indices per
        epoch with replacement, each weighted by ``1 / count(label)``.
    """
    labels = ds.y
    class_counts = Counter(labels)
    print("Class Dist:", class_counts)
    per_sample_weight = [1.0 / class_counts[label] for label in labels]
    return DataLoader(
        ds,
        batch_size=bs,
        sampler=WeightedRandomSampler(per_sample_weight, len(per_sample_weight), replacement=True),
    )

# -------------------- RUN --------------------
# Dispatch on MODE: build the dataset, train the model, or run the webcam demo.
if MODE == "build":
    build_from_videos()

elif MODE == "train":
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(DATA_PATH)

    X, y, class_names = load_npz(DATA_PATH)
    # Plain Python strings keep NumPy scalar objects out of the checkpoint, so
    # it stays loadable by torch.load() in its restricted weights_only mode.
    class_names = [str(c) for c in class_names]
    NUM_CLASSES = len(class_names)

    # Fixed seed so the train/val split and the run are repeatable.
    SEED = 42
    torch.manual_seed(SEED)
    idx = np.random.default_rng(SEED).permutation(len(X))
    split = int(0.8 * len(X))
    tr_idx, va_idx = idx[:split], idx[split:]
    if len(tr_idx) == 0 or len(va_idx) == 0:
        raise ValueError(
            f"Need at least 2 sequences for a train/val split, found {len(X)} in {DATA_PATH}"
        )
    # NOTE(review): the archive stores no video ids, so this split is per
    # sequence; windows cut from one video can land in both train and val.

    tr = HARDataset([X[i] for i in tr_idx], [y[i] for i in tr_idx])
    va = HARDataset([X[i] for i in va_idx], [y[i] for i in va_idx])

    model = CNN_BiLSTM(NUM_CLASSES).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()

    tr_ld = balanced_loader(tr, BATCH_SIZE)
    va_ld = DataLoader(va, BATCH_SIZE, shuffle=False)

    best = 0.0
    for e in range(EPOCHS):
        # ---- training pass ----
        model.train()
        train_loss = train_acc = 0.0
        n_train = 0
        for xb, yb in tr_ld:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            n = yb.size(0)
            train_loss += loss.item() * n
            train_acc += accuracy(out, yb) * n
            n_train += n
        train_loss /= n_train
        train_acc /= n_train

        # ---- validation pass ----
        # Separate metric names: the original accumulator reused `va` and
        # overwrote the validation dataset variable.
        model.eval()
        val_loss = val_acc = 0.0
        n_val = 0
        with torch.no_grad():
            for xb, yb in va_ld:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                out = model(xb)
                n = yb.size(0)
                val_loss += criterion(out, yb).item() * n
                val_acc += accuracy(out, yb) * n
                n_val += n
        val_loss /= n_val
        val_acc /= n_val

        print(f"E{e+1} TL{train_loss:.3f} TA{train_acc:.3f} | VL{val_loss:.3f} VA{val_acc:.3f}")
        if val_acc > best:
            # Keep only the checkpoint with the best validation accuracy so far.
            best = val_acc
            torch.save({"model": model.state_dict(), "classes": class_names}, CHECKPOINT_PATH)
            print("Saved best")

elif MODE == "realtime":
    if not os.path.exists(CHECKPOINT_PATH):
        raise FileNotFoundError(CHECKPOINT_PATH)

    # NOTE: torch.load unpickles the file; only load checkpoints you produced yourself.
    ck = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    class_names = [str(c) for c in ck["classes"]]
    NUM_CLASSES = len(class_names)

    model = CNN_BiLSTM(NUM_CLASSES).to(DEVICE)
    model.load_state_dict(ck["model"])
    model.eval()

    cap = cv2.VideoCapture(0)
    # Sliding window holding the most recent SEQUENCE_LENGTH poses.
    buf = deque(maxlen=SEQUENCE_LENGTH)

    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open webcam (device 0)")

        while True:
            ok, fr = cap.read()
            if not ok:
                break
            buf.append(extract_pose(fr))

            txt = "Collecting"
            if len(buf) == SEQUENCE_LENGTH:
                # (T, J, 2) -> (1, T, 1, J, 2), the layout HARDataset feeds the model.
                x = torch.tensor(np.array(buf), dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(DEVICE)
                with torch.no_grad():
                    out = model(x)
                    p = torch.softmax(out, 1)
                    c, i = p.max(1)
                txt = f"{class_names[int(i.item())]} ({float(c.item()):.2f})"

            cv2.putText(fr, txt, (20,40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
            cv2.imshow("HAR", fr)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Free the camera and close the window even if inference raises.
        cap.release()
        cv2.destroyAllWindows()

else:
    raise ValueError(f"Unknown MODE: {MODE!r} (expected 'build', 'train' or 'realtime')")

Running on: cuda




In [2]:
import sqlite3
import pandas as pd

# Read every row of the action_logs table from actions.db into a DataFrame.
con = sqlite3.connect("actions.db")
try:
    df = pd.read_sql("SELECT * FROM action_logs", con)
finally:
    # Close the connection even when the query fails (e.g. the table is missing).
    con.close()

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,id,ts,action,confidence
0,1,2026-01-22 09:10:25,BabyCrawling,0.965336
1,2,2026-01-22 09:10:26,BabyCrawling,0.90554
2,3,2026-01-22 09:10:27,BabyCrawling,0.943782
3,4,2026-01-22 09:10:44,PushUps,0.916348
4,5,2026-01-22 09:10:44,PushUps,0.809909
