In [81]:
import os, cv2, random, argparse
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
# Also import the F1 score metric
from sklearn.metrics import f1_score
import pandas as pd
from helpers import corresponding_label_to_video, get_rtf_text
from model import EventClassifier
from collections import deque
# Clip length: sample_or_pad returns exactly this many frames, and evaluate
# runs the model each time its frame buffer reaches this size.
NUM_FRAMES_INPUT = 16  # Number of frames in one clip passed to the model


In [82]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Class name -> output index of the classifier.
class_to_idx = {'No event': 0, 'Fire': 1, 'Smoke': 2}
model = EventClassifier(num_labels=len(class_to_idx)).to(device)
TYPE  = 'baseline_aug_diff'  # Experiment folder name under output/ (selects checkpoint and results dir)
ROOT_DIR = f'E:/2025_ICIAP_FIRE/output/{TYPE}'
OUTPUT_DIR = f'E:/2025_ICIAP_FIRE/output/{TYPE}/results'
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
# and keeps the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)
PATH_TO_MODEL = os.path.join(ROOT_DIR, 'event_classifier.pth')

# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load(PATH_TO_MODEL, map_location=device))
CONTROLLER_FRAMES = 4  # Passed to evaluate() as fps_out: frames kept per second of video
MAX_FRAMES = 20000  # Maximum number of frames to process per video
PATH_TO_CSV = os.path.join(OUTPUT_DIR, f'results_{CONTROLLER_FRAMES}.csv')
# Column layout of the per-clip prediction table written to PATH_TO_CSV.
header = ['video_name', 'current_second', 'No_event', 'Fire', 'Smoke']
df = pd.DataFrame(columns=header)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text Model total parameters: 124739328 trainable parameters: 297216
Video Encoder total parameters: 86227200 trainable parameters: 0


In [83]:
def load_and_split(videos_path, labels_path, seed, split_ratio=0.8):
    """Pair videos with labels, shuffle them reproducibly and split in two.

    Args:
        videos_path: Directory handed to ``corresponding_label_to_video``.
        labels_path: Directory handed to ``corresponding_label_to_video``.
        seed: Seed for the shuffle.
        split_ratio: Fraction of the pairs placed in the first (training) part.

    Returns:
        ``(train_videos, train_labels, val_videos, val_labels)`` as tuples.
        All four are empty tuples when no video/label pair is found.

    Side effects:
        Seeds the module-level ``random`` generator, so later ``random`` calls
        in the same process are deterministic as well.
    """
    # presumably two parallel sequences (videos, labels) -- verify against helpers
    vids, lbls = corresponding_label_to_video(videos_path, labels_path)
    combined = list(zip(vids, lbls))

    # The global RNG is seeded on purpose (kept from the original): other code
    # in this notebook draws from `random` and relies on this seed.
    random.seed(seed)

    if not combined:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # an opaque ValueError; return an explicit empty split instead.
        return (), (), (), ()

    random.shuffle(combined)
    vids, lbls = zip(*combined)
    idx = int(len(vids) * split_ratio)
    return vids[:idx], lbls[:idx], vids[idx:], lbls[idx:]


def sample_or_pad(frames, num_frames=None):
    """Return a list of exactly ``num_frames`` frames.

    Args:
        frames: Sequence of frames (any iterable; it is copied into a list).
        num_frames: Desired clip length. Defaults to ``NUM_FRAMES_INPUT``.

    Returns:
        * ``len(frames) >= num_frames``: ``num_frames`` frames drawn without
          replacement via ``random.sample``.
        * ``0 < len(frames) < num_frames``: the frames followed by repeats of
          the last frame.
        * no frames: ``num_frames`` independent black 224x224x3 uint8 images.

    NOTE(review): ``random.sample`` returns items in selection order, not in
    their original order, so the temporal order of the frames is not
    preserved in the sampling branch -- confirm this matches how the model
    was trained before changing it.
    """
    if num_frames is None:
        num_frames = NUM_FRAMES_INPUT

    # Copy to a list so tuples/deques work with `+` below and the caller's
    # buffer is never mutated.
    frames = list(frames)

    if len(frames) >= num_frames:
        return random.sample(frames, num_frames)
    if frames:
        return frames + [frames[-1]] * (num_frames - len(frames))
    # Build separate arrays: `[arr] * n` would alias one buffer n times, so an
    # in-place edit of one blank frame would silently change all of them.
    return [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(num_frames)]

In [84]:
# Dataset roots: the video files and (presumably) their ground-truth label files.
path_to_videos = 'E:/2025_ICIAP_FIRE/dataset'
path_to_labels = 'E:/2025_ICIAP_FIRE/GT'
# Fixed seed (42) with the default split ratio of load_and_split.
(train_v, train_l, val_v, val_l) = load_and_split(
    path_to_videos,
    path_to_labels,
    seed=42,
)

Found 353 video files and 353 label files.


In [85]:
def _classify_clip(model, frames, prev_kvs):
    """Run the model on one clip; return ``(flags, new_prev_kvs)``.

    ``flags`` is the flattened list of 0/1 decisions obtained by thresholding
    the sigmoid of the model output at 0.5 (each class independently).
    """
    # NOTE(review): the buffer is flushed as soon as it holds NUM_FRAMES_INPUT
    # frames, so sample_or_pad's random.sample branch returns those frames in
    # random order -- confirm this is intended for the model.
    clip = [cv2.resize(f, (224, 224)) for f in sample_or_pad(frames)]
    logits, prev_kvs = model(clip, old_past_key_values=prev_kvs)
    # reshape(-1) makes the indexing below work whether the model returns a
    # (num_classes,) or a (1, num_classes) tensor.
    flags = (torch.sigmoid(logits) > 0.5).int().reshape(-1).cpu().tolist()
    return flags, prev_kvs


def _result_row(vid, frames_read, fps_in, flags):
    """Build one CSV row for a clip that ends after ``frames_read`` frames."""
    return {
        'video_name': vid,
        # Timestamp of the end of the clip, rounded to the nearest second.
        'current_second': int(round(frames_read / fps_in, 0)),
        'No_event': flags[0],
        'Fire': flags[1],
        'Smoke': flags[2],
    }


def evaluate(model, val_videos, val_labels, class_to_idx, fps_out, device, df,
             csv_path=None, max_frames=None):
    """Run the model over every video and record per-clip predictions.

    Each video is read frame by frame and subsampled to ``fps_out`` frames per
    second. Every time ``NUM_FRAMES_INPUT`` subsampled frames are buffered
    they are classified as one clip; frames left over at the end of the video
    are padded by ``sample_or_pad`` and classified as a final clip.

    Args:
        model: Callable ``model(frames, old_past_key_values=...)`` returning
            ``(logits, new_past_key_values)``.
        val_videos: Video file paths.
        val_labels: Label file paths, parallel to ``val_videos``. Labels are
            only parsed to warn about unknown class names; ground truth is
            not written to the results.
        class_to_idx: Mapping of known class names to indices.
        fps_out: Target sampling rate in frames per second.
        device: Unused; kept for interface compatibility.
        df: DataFrame the new rows are appended to (not modified in place).
        csv_path: Where results are checkpointed. Defaults to ``PATH_TO_CSV``.
        max_frames: Per-video cap on frames read. Defaults to ``MAX_FRAMES``.

    Returns:
        A DataFrame with the rows of ``df`` plus one row per classified clip
        (``video_name, current_second, No_event, Fire, Smoke``).

    Side effects:
        Rewrites ``csv_path`` after each video that produced rows, so partial
        results survive an interruption.
    """
    if csv_path is None:
        csv_path = PATH_TO_CSV
    if max_frames is None:
        max_frames = MAX_FRAMES

    model.eval()
    with torch.no_grad():
        for vid, lbl in tqdm(zip(val_videos, val_labels), total=len(val_videos), desc="Validating"):
            rows = []  # rows of this video only; appended to df once, in `finally`
            cap = cv2.VideoCapture(vid)
            try:
                if not cap.isOpened():
                    continue

                # Surface label problems even though the ground truth itself
                # is not stored in the results.
                _, cls_event = get_rtf_text(lbl)
                for cls in cls_event:
                    if cls not in class_to_idx:
                        print(f"Warning: Class '{cls}' not found in class_to_idx. Using 'No event' instead.")

                fps_in = cap.get(cv2.CAP_PROP_FPS)
                # `not (x > 0)` also catches NaN; an invalid FPS would otherwise
                # cause a division by zero below.
                if not (fps_in > 0):
                    print(f"Warning: invalid FPS ({fps_in}) for video {vid}. Skipping.")
                    continue

                frame_buf, prev_kvs = [], None
                kept = 0         # frames kept so far after subsampling
                frames_read = 0  # frames read from the file so far

                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        break
                    # Checked after the end-of-stream test so a video with
                    # exactly max_frames frames is not reported as truncated.
                    if frames_read >= max_frames:
                        print(f"Reached maximum frame count ({max_frames}) for video {vid}. Stopping.")
                        break

                    # Keep this frame if the output stream is due a new frame
                    # at this point of the input timeline.
                    if int(frames_read / fps_in * fps_out) >= kept:
                        kept += 1
                        frame_buf.append(frame)
                    frames_read += 1

                    if len(frame_buf) < NUM_FRAMES_INPUT:
                        continue

                    flags, prev_kvs = _classify_clip(model, frame_buf, prev_kvs)
                    frame_buf = []
                    rows.append(_result_row(vid, frames_read, fps_in, flags))

                # Classify the frames left over at the end of the video.
                if frame_buf:
                    flags, prev_kvs = _classify_clip(model, frame_buf, prev_kvs)
                    rows.append(_result_row(vid, frames_read, fps_in, flags))
            finally:
                cap.release()
                if rows:
                    # One concat and one CSV write per video instead of one per
                    # clip; still runs on error so partial results are on disk.
                    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
                    df.to_csv(csv_path, index=False)
    return df
    


    

In [86]:

    
# Start from a fresh, empty table each time this cell runs: passing the
# previous `df` back in would append duplicate rows on every re-run.
df = evaluate(model, val_v, val_l, class_to_idx, CONTROLLER_FRAMES, device,
              pd.DataFrame(columns=header))
# Final write so the CSV exists even when no clip was classified.
df.to_csv(PATH_TO_CSV, index=False)
print(f"Results saved to {PATH_TO_CSV}")

Validating:   4%|▍         | 3/71 [00:41<15:37, 13.79s/it]

Reached maximum frame count (20000) for video E:/2025_ICIAP_FIRE/dataset\1\Video192.mp4. Stopping.


Validating: 100%|██████████| 71/71 [09:24<00:00,  7.95s/it]

Results saved to E:/2025_ICIAP_FIRE/output/baseline_aug_diff/results\results_4.csv



