In [8]:
!pwd

/home/yuyue01


In [9]:
ls

[0m[01;34mdataset[0m/             model.ipynb                     val.csv
[01;34mextracted_frames[0m/    test.csv                        val_dataset.pkl
[01;34mextracted_frames_2[0m/  test_statistics.csv             yolov3.cfg
[01;34mfeatures[0m/            test_statistics_multilayer.csv  yolov3.weights
[01;34mfeatures_dinovitb[0m/   train.csv                       yy_test.ipynb
[01;34mfeatures_dinovits[0m/   train_dataset.pkl


In [10]:
import csv
import cv2
import os
import pandas as pd
import numpy as np

In [12]:
# Collect the bucket-relative path of every video listed in the three split files.
# Each row is stringified and cut at the first space, which keeps only the path token;
# dropping its first four "/"-separated pieces removes the leading prefix together with
# the "['" that str(row) puts in front of it.
videos = []
for split_csv in ("test.csv", "train.csv", "val.csv"):
    with open(split_csv, 'r') as handle:
        videos.extend(
            "/".join(str(row).split(" ")[0].split("/")[4:])
            for row in csv.reader(handle)
        )
print(videos[-2:])

['_REVIEWED/bangladesh_videos/%00_Good_GW_Gamplay/taniaislam56%40gmail.com/1563001715663/GuessWhat.mp4', '_REVIEWED/bangladesh_videos/%00_Good_GW_Gamplay/taniaislam56%40gmail.com/1563001844485/GuessWhat.mp4']


In [13]:
# Number of video paths collected from the three split files.
len(videos)

319

In [None]:
# Download every collected video from the S3 bucket into dataset/guesswhat/, keeping the
# key's folder layout. Plain Python calls replace the `!mkdir` / `!aws` shell lines so the
# keys (which come straight from the CSV files) are passed as single arguments and are
# never re-interpreted by a shell.
import subprocess

for i, video in enumerate(videos, start=1):
    local_file = "dataset/guesswhat/" + video
    os.makedirs(os.path.dirname(local_file), exist_ok=True)
    # Whatever the CLI writes to stdout still goes to download.txt, which is
    # overwritten on every iteration exactly as the `>` redirect did.
    with open("download.txt", "w") as log_file:
        result = subprocess.run(
            ["aws", "s3api", "get-object",
             "--bucket", "headsup-du1r3b78fy",
             "--key", video,
             local_file],
            stdout=log_file,
            stderr=subprocess.PIPE,
            text=True,
        )
    # Report failed downloads instead of letting them pass unnoticed.
    if result.returncode != 0:
        print(f"-- Error: download failed for {video}: {result.stderr.strip()}")
    if i % 10 == 0:
        print(i)

In [188]:
def extract_frames(video_path, output_folder, num_frames=32):
    """Save evenly spaced frames of a video as JPEG files.

    The sampled frame indices are ``int(length / num_frames * i)`` for
    ``i in range(num_frames)``, where ``length`` is the frame count reported by
    OpenCV. Sampled frames are written to ``output_folder`` as ``frame_00.jpg``,
    ``frame_01.jpg``, ... in the order they appear in the video.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory receiving the JPEG files; created when missing.
        num_frames: Number of sampling positions (default 32).

    Returns:
        None. Progress and error messages are printed.

    Note:
        When the video holds fewer than ``num_frames`` frames, several sampling
        positions collapse onto the same index, so fewer files are written.
    """
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"-- Error: Cannot open video file {video_path}")
            return

        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Also reject negative counts, not only an exact zero.
        if length <= 0:
            print(f"-- Error: Video file {video_path} contains no frames.")
            return

        print(f"-- Total frames in video: {length}")
        # A set gives constant-time membership tests inside the per-frame loop.
        frame_ids = {int(length / num_frames * i) for i in range(num_frames)}
        last_wanted = max(frame_ids, default=-1)

        count = 0
        frame_id = 0
        # Stop decoding once the last requested index has been passed; nothing
        # after it would be saved anyway.
        while count <= last_wanted:
            ret, frame = cap.read()
            if not ret:
                break
            if count in frame_ids:
                frame_filename = os.path.join(output_folder, f"frame_{frame_id:02d}.jpg")
                cv2.imwrite(frame_filename, frame)
                frame_id += 1
            count += 1
    finally:
        # Runs on every exit path, including the early returns above.
        cap.release()

    print(f"-- Finished extracting frames from {video_path} to {output_folder}")
    print()

# Example: sample the default 32 frames from one downloaded video.
video_path = 'dataset/guesswhat/JustTxtFileORJustMp4/%2B12265686668/1585521787706/GuessWhat.mp4'
output_folder = 'extracted_frames_32/JustTxtFileORJustMp4/%2B12265686668/1585521787706'
extract_frames(video_path, output_folder)


-- Total frames in video: 1726
-- Finished extracting frames from dataset/guesswhat/JustTxtFileORJustMp4/%2B12265686668/1585521787706/GuessWhat.mp4 to extracted_frames_32/JustTxtFileORJustMp4/%2B12265686668/1585521787706



In [18]:
# add human detection when sampling

def load_yolo_model(cfg_path, weights_path):
    """Load a YOLO network with OpenCV and resolve the names of its output layers.

    Args:
        cfg_path: Path of the network configuration file.
        weights_path: Path of the trained weights file.

    Returns:
        Tuple ``(net, output_layers)``: the network returned by
        ``cv2.dnn.readNet`` and the list of layer names to pass to ``net.forward``.
    """
    net = cv2.dnn.readNet(weights_path, cfg_path)
    layer_names = net.getLayerNames()
    # getUnconnectedOutLayers() gives 1-based layer indices, either as one-element
    # rows or as plain integers (both layouts were handled before via try/except).
    # Flattening covers both without catching IndexError, which would also have
    # hidden a genuinely out-of-range layer index.
    out_indices = np.asarray(net.getUnconnectedOutLayers()).ravel()
    output_layers = [layer_names[int(i) - 1] for i in out_indices]
    return net, output_layers

def detect_humans(frame, net, output_layers):
    """Report whether the network finds at least one person in a frame.

    Args:
        frame: Image array whose first two dimensions are (height, width).
        net: Network object offering ``setInput`` and ``forward``.
        output_layers: Names of the layers whose outputs are evaluated.

    Returns:
        True when at least one detection of class 0 ('person') with a score
        above 0.5 survives non-maximum suppression, otherwise False.
    """
    # Only height and width are needed; slicing also accepts single-channel frames.
    height, width = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    confidences = []
    boxes = []
    for out in outs:
        out = np.asarray(out)
        # Columns 5+ hold the per-class scores of each detection row.
        scores = out[:, 5:]
        best_class = scores.argmax(axis=1)
        best_score = scores[np.arange(len(out)), best_class]
        # Filter all rows at once; only the few surviving rows are visited in Python.
        keep = (best_class == 0) & (best_score > 0.5)  # class 0 is 'person'
        for detection, confidence in zip(out[keep], best_score[keep]):
            # Box values are fractions of the frame size: centre x/y, width, height.
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            boxes.append([x, y, w, h])
            confidences.append(float(confidence))

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
    return len(indexes) > 0

def extract_frames_with_human_detection(video_path, output_folder, initial_num_frames=128, final_num_frames=16, cfg_path='yolov3.cfg', weights_path='yolov3.weights'):
    """Save frames that contain a person, spread evenly over the video.

    The video is first sampled at ``initial_num_frames`` evenly spaced indices
    (``int(length / initial_num_frames * i)``). Each sampled frame is passed to
    ``detect_humans``; from the frames with a detection, ``final_num_frames``
    evenly spread ones are written to ``output_folder`` as ``frame_00.jpg``, ...

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory receiving the JPEG files; created when missing.
        initial_num_frames: Number of candidate frames examined by the detector.
        final_num_frames: Number of frames to keep.
        cfg_path: Network configuration file passed to ``load_yolo_model``.
        weights_path: Network weights file passed to ``load_yolo_model``.

    Returns:
        None. Progress, warnings and errors are printed. When fewer than
        ``final_num_frames`` frames contain a person, all of them are saved.
    """
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"-- Error: Cannot open video file {video_path}")
            return

        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if length <= 0:
            print(f"-- Error: Video file {video_path} contains no frames.")
            return

        print(f"-- Total frames in video: {length}")

        # Load the detector only once the video is known to be readable.
        net, output_layers = load_yolo_model(cfg_path, weights_path)

        # A set gives constant-time membership tests inside the per-frame loop.
        frame_ids = {int(length / initial_num_frames * i) for i in range(initial_num_frames)}
        last_wanted = max(frame_ids, default=-1)

        count = 0
        frame_id = 0
        detected_frames = []
        # Stop decoding once the last candidate index has been passed.
        while count <= last_wanted:
            ret, frame = cap.read()
            if not ret:
                break
            if count in frame_ids:
                if detect_humans(frame, net, output_layers):
                    detected_frames.append((frame_id, frame))
                frame_id += 1
            count += 1
    finally:
        # Runs on every exit path, including the early returns above.
        cap.release()

    # Select final_num_frames frames evenly spread among all detected frames.
    total_detected = len(detected_frames)
    if total_detected < final_num_frames:
        print(f"-- Warning: Only {total_detected} frames with human detections found, less than {final_num_frames} required.")
        selected_frames = detected_frames
    else:
        # Scaling the position (i * total // final) reaches the end of the list.
        # A fixed integer stride (total // final) collapses to 1 whenever
        # total < 2 * final and would then only ever pick the first frames.
        selected_frames = [
            detected_frames[i * total_detected // final_num_frames]
            for i in range(final_num_frames)
        ]

    for i, (_, frame) in enumerate(selected_frames):
        frame_filename = os.path.join(output_folder, f"frame_{i:02d}.jpg")
        cv2.imwrite(frame_filename, frame)

    print(f"-- Finished extracting and saving {len(selected_frames)} frames from {video_path} to {output_folder}")
    print()

# video_path = 'dataset/guesswhat/_REVIEWED/Testing_ProcessComplete/shimmer_96%40yahoo.com/1569790868469/GuessWhat.mp4'
# output_folder = 'extracted_frames_32/_REVIEWED/Testing_ProcessComplete/shimmer_96%40yahoo.com/1569790868469'
# extract_frames_with_human_detection(video_path, output_folder)


-- Total frames in video: 1733
-- Finished extracting and saving 16 frames from dataset/guesswhat/cumlupo%40gmail.com/1590047043296/GuessWhat.mp4 to extracted_frames_2/cumlupo%40gmail.com/1590047043296



In [154]:
def process_videos(csv_file, base_video_path, output_base_path):
    """Extract frames for every video listed in a split CSV file.

    Each non-blank row is expected to hold ``<video path> <label>`` (separated
    by whitespace) in its first CSV field. The first four "/"-separated pieces
    of the video path are dropped; the remainder is looked up under
    ``base_video_path`` and its directory part is mirrored under
    ``output_base_path``.

    Args:
        csv_file: Path of the split CSV file.
        base_video_path: Root folder holding the downloaded videos.
        output_base_path: Root folder receiving one frame folder per video.

    Returns:
        None. Progress messages are printed.
    """
    with open(csv_file, 'r') as f:
        for row in csv.reader(f):
            # csv.reader yields an empty list for a blank line.
            if not row or not row[0].strip():
                continue

            video_path = row[0].split()[0]
            relative_path = "/".join(video_path.split("/")[4:])
            full_video_path = os.path.join(base_video_path, relative_path)
            output_folder = os.path.join(output_base_path, "/".join(relative_path.split("/")[:-1]))

            # Check the input before touching the output tree, so a missing video
            # cannot leave behind an empty folder that later looks "already done".
            if not os.path.isfile(full_video_path):
                print("Video file does not exist:", full_video_path)
                continue

            # A folder counts as processed only when it already holds frames.
            if os.path.isdir(output_folder) and any(
                name.endswith(".jpg") for name in os.listdir(output_folder)
            ):
                print("Frames already extracted, skipping:", output_folder)
                continue

            print("Processing video:", full_video_path)
            print("Output folder:", output_folder)
            extract_frames(full_video_path, output_folder)
            # extract_frames_with_human_detection(full_video_path, output_folder)


In [None]:
# Slow, run-once step: sample frames from every video referenced by the three split files.
base_video_path = 'dataset/guesswhat'
output_base_path = 'extracted_frames_32'
for split_csv in ("train.csv", "test.csv", "val.csv"):
    process_videos(split_csv, base_video_path, output_base_path)

In [156]:
import torch
from torchvision import transforms
from PIL import Image

In [157]:
# Pre-processing applied to every frame before it reaches the foundation model:
# resize to 224x224, convert to a tensor, then normalise each channel.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]
_frame_transforms = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
preprocess = transforms.Compose(_frame_transforms)

def load_pretrained_dinov2_model(model_name='dino_vits16'):
    """Load a pretrained backbone through torch.hub and switch it to eval mode.

    Note that, despite the function name, the model is fetched from the
    'facebookresearch/dino:main' hub repository.

    Args:
        model_name: Hub entry point to load. Defaults to 'dino_vits16';
            'dino_vitb16' is the alternative that was previously kept as a
            commented-out line. NOTE(review): a different backbone presumably
            changes the per-frame feature width, so the classifier's
            ``input_dim`` would have to follow - confirm before switching.

    Returns:
        The loaded model, already set to evaluation mode.
    """
    model = torch.hub.load('facebookresearch/dino:main', model_name)
    model.eval()
    return model

In [158]:
# Load the backbone once; the feature-extraction cells below reuse this instance.
foundation_model = load_pretrained_dinov2_model()

Using cache found in /home/yuyue01/.cache/torch/hub/facebookresearch_dino_main


In [182]:
def extract_and_concatenate_features(model, frames_folder, expected_num_frames=32):
    """Run the model on every frame of a folder and join the outputs into one vector.

    The ``.jpg`` files of ``frames_folder`` are processed in sorted file-name
    order. Each frame is pre-processed with the module-level ``preprocess``,
    passed through ``model`` as a batch of one, and the per-frame outputs are
    concatenated along axis 0.

    Args:
        model: Callable mapping a pre-processed image batch to a feature tensor.
        frames_folder: Folder containing the extracted ``.jpg`` frames.
        expected_num_frames: Number of frames the folder must contain
            (default 32). Folders with any other count are rejected.

    Returns:
        A NumPy array with the concatenated features, or None when the folder
        does not hold exactly ``expected_num_frames`` frames or nothing was
        extracted.
    """
    frame_files = sorted(f for f in os.listdir(frames_folder) if f.endswith(".jpg"))

    if len(frame_files) != expected_num_frames:
        print(f"-- Error: Expected {expected_num_frames} frames, but found {len(frame_files)} frames in folder: {frames_folder}")
        return None

    feature_list = []
    with torch.no_grad():
        for frame_file in frame_files:
            frame_path = os.path.join(frames_folder, frame_file)
            # Close the image file as soon as the RGB copy exists instead of
            # leaving the open handle to the garbage collector.
            with Image.open(frame_path) as raw_image:
                image = raw_image.convert('RGB')
            image_tensor = preprocess(image).unsqueeze(0)  # Add batch dimension
            features = model(image_tensor)
            feature_list.append(features.squeeze(0).numpy())  # Remove batch dimension

    if not feature_list:  # Only reachable when expected_num_frames is 0
        print(f"No features extracted in folder: {frames_folder}")
        return None

    concatenated_features = np.concatenate(feature_list, axis=0)
    print("Finished extracting and concatenating features in folder:", frames_folder)
    return concatenated_features

In [183]:
def save_features(features, output_path):
    """Write a feature array to ``output_path`` with ``np.save``.

    Args:
        features: Array to store.
        output_path: Destination file; missing parent folders are created.

    Returns:
        None.
    """
    parent = os.path.dirname(output_path)
    # dirname is "" for a bare file name, and os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    np.save(output_path, features)

In [184]:
def process_all_frames(csv_file, base_extracted_frames_path, features_output_base_path, model):
    """Compute and store one ``features.npy`` per video listed in a split CSV.

    For each row the video's folder (its path minus the first four
    "/"-separated pieces and minus the file name) is looked up under
    ``base_extracted_frames_path``. When that folder exists, its frames are
    passed to ``extract_and_concatenate_features`` and a non-None result is
    handed to ``save_features`` for the mirrored location under
    ``features_output_base_path``. There is no skip-if-present check: features
    are recomputed and saved again on every run.
    """
    with open(csv_file, 'r') as handle:
        for row in csv.reader(handle):
            # The first space-separated token of the stringified row is the video
            # path; strip the "['" that str(row) puts in front of it.
            video_path = str(row).split(" ")[0].strip().strip("['")
            path_parts = video_path.split("/")[4:]
            video_dir = "/".join(path_parts[:-1])
            frames_folder = os.path.join(base_extracted_frames_path, video_dir)
            output_path = os.path.join(features_output_base_path, video_dir, "features.npy")

            if not os.path.isdir(frames_folder):
                print("Frames folder does not exist:", frames_folder)
                continue

            print("Processing frames in folder:", frames_folder)

            # Extract features from the frames using the foundation model.
            concatenated_features = extract_and_concatenate_features(model, frames_folder)
            if concatenated_features is None:
                print(f"Skipping saving for {frames_folder} due to no extracted features.")
            else:
                save_features(concatenated_features, output_path)
                print(f"Saved aggregated features to {output_path}")
            print()

In [None]:
# Slow, run-once step: turn the extracted frames of every listed video into a features.npy file.
base_extracted_frames_path = 'extracted_frames_32'
features_output_base_path = 'features_dinovits_32'
for split_csv in ("train.csv", "test.csv", "val.csv"):
    process_all_frames(split_csv, base_extracted_frames_path, features_output_base_path, foundation_model)

In [194]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [195]:
# Prepare data
class VideoFeatureDataset(Dataset):
    """Dataset pairing stored per-video feature files with integer labels.

    The dataset is built either from ``preloaded_data`` (a ``(paths, labels)``
    pair, used as-is) or from a split CSV file. In the CSV case each non-blank
    row must hold ``<video path> <label>`` (separated by whitespace) in its
    first field; the video's ``features.npy`` is looked up under
    ``features_base_path`` in the folder named by the video path minus its
    first four "/"-separated pieces and minus the file name. Videos without a
    features file are left out and counted in a printed warning.
    """

    def __init__(self, features_base_path, csv_file=None, preloaded_data=None):
        """Collect feature-file paths and labels.

        Args:
            features_base_path: Root folder of the stored ``features.npy`` files.
            csv_file: Split CSV to scan; ignored when ``preloaded_data`` is given.
            preloaded_data: Optional ``(paths, labels)`` pair to use directly.

        Raises:
            ValueError: If a non-blank CSV row has no label token.
        """
        self.features_base_path = features_base_path
        self.data = []    # paths of the features.npy files
        self.labels = []  # integer label for each entry of self.data
        if preloaded_data:
            self.data, self.labels = preloaded_data
        elif csv_file:
            missing = 0
            with open(csv_file, 'r') as f:
                for row in csv.reader(f):
                    # csv.reader yields an empty list for a blank line.
                    if not row or not row[0].strip():
                        continue
                    fields = row[0].split()
                    if len(fields) < 2:
                        raise ValueError(f"Malformed row in {csv_file}: {row!r}")
                    video_path, label = fields[0], fields[1]
                    relative_path = "/".join(video_path.split("/")[4:])
                    feature_file = os.path.join(features_base_path, "/".join(relative_path.split("/")[:-1]), "features.npy")
                    if os.path.isfile(feature_file):
                        self.data.append(feature_file)
                        self.labels.append(int(label))
                    else:
                        missing += 1
            # Make dropped samples visible instead of shrinking the split silently.
            if missing:
                print(f"-- Warning: {missing} videos listed in {csv_file} have no features file and were skipped.")

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` from disk.

        Returns:
            Tuple ``(features, label)``: the stored array as a float32 tensor and
            the label as a long tensor.
        """
        feature_file = self.data[idx]
        # The array is read from disk on every access; nothing is cached.
        features = np.load(feature_file)
        label = self.labels[idx]
        return torch.tensor(features, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

In [196]:
import pickle

def save_dataset(dataset, file_path):
    """Pickle the ``data`` and ``labels`` attributes of ``dataset`` to ``file_path``.

    The file holds a dict with the keys 'data' and 'labels'.
    """
    payload = dict(data=dataset.data, labels=dataset.labels)
    with open(file_path, 'wb') as handle:
        pickle.dump(payload, handle)

def load_dataset(file_path):
    """Read a file written by ``save_dataset`` and return ``(data, labels)``.

    NOTE(review): unpickling can execute arbitrary code, so only load files
    this notebook wrote itself.
    """
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    paths = payload['data']
    targets = payload['labels']
    return paths, targets

In [197]:
# Parameters
train_csv = 'train.csv'  # split file listing the training videos
val_csv = 'val.csv'  # split file listing the validation videos
features_base_path = 'features_dinovits_32'  # folder the feature-extraction step wrote its features.npy files into
batch_size = 16  # samples per DataLoader batch

In [198]:
# First run only: scan each split CSV for videos that have a features.npy file,
# then pickle the resulting path/label lists so later runs can skip the scan.
train_dataset = VideoFeatureDataset(features_base_path, train_csv)
save_dataset(train_dataset, 'train_dataset.pkl')

val_dataset = VideoFeatureDataset(features_base_path, val_csv)
save_dataset(val_dataset, 'val_dataset.pkl')

In [199]:
# Reload the pickled path/label lists and wrap them in datasets and loaders.
# A failed load is reported and re-raised. Swallowing it would let the code below
# hit a NameError, or silently reuse stale lists left in the kernel.
try:
    train_data, train_labels = load_dataset('train_dataset.pkl')
    val_data, val_labels = load_dataset('val_dataset.pkl')
except Exception as e:
    print("Error loading datasets:", e)
    raise

print("Train dataset loaded successfully:", len(train_data))
print("Validation dataset loaded successfully:", len(val_data))

# check the first few items
print("First few training items:", train_data[:3], train_labels[:3])
print("First few validation items:", val_data[:3], val_labels[:3])

# Create datasets using preloaded data
train_dataset = VideoFeatureDataset(features_base_path, preloaded_data=(train_data, train_labels))
val_dataset = VideoFeatureDataset(features_base_path, preloaded_data=(val_data, val_labels))

# Only the training batches are shuffled; validation keeps the stored order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Train dataset loaded successfully: 228
Validation dataset loaded successfully: 47
First few training items: ['features_dinovits_32/billypano@hotmail.com/1622504001/features.npy', 'features_dinovits_32/billypano@hotmail.com/1622504303/features.npy', 'features_dinovits_32/billypano@hotmail.com/1622504663/features.npy'] [0, 0, 0]
First few validation items: ['features_dinovits_32/_REVIEWED/Testing_ProcessComplete/fardhanaalam%40gmail.com/1560171503917/features.npy', 'features_dinovits_32/_REVIEWED/bangladesh_videos/%02_Flawed_Gameplay/%2B8801925832996/1563950970528/features.npy', 'features_dinovits_32/%2B16383224499/1607082139208/features.npy'] [0, 0, 0]


In [252]:
class SingleLayerClassifier(nn.Module):
    """Linear classifier: one fully connected layer from features to class logits."""

    def __init__(self, input_dim, num_classes):
        """Create the ``input_dim`` -> ``num_classes`` linear layer ``fc``."""
        super().__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return the raw logits produced by ``fc`` for ``x``."""
        return self.fc(x)

# Parameters
# input_dim = 6144
# 12288 = 32 * 384: presumably 32 frames per video times a 384-wide per-frame
# feature vector (6144 would then be the 16-frame variant) - TODO confirm
# against the shape of a saved features.npy.
input_dim = 12288
num_classes = 2 

# Instantiate the classifier
# NOTE(review): a later cell assigns `clf` again with an EnhancedClassifier,
# which replaces this instance.
clf = SingleLayerClassifier(input_dim, num_classes)

In [262]:
class EnhancedClassifier(nn.Module):
    """MLP classifier with three hidden stages followed by a linear output layer.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.5), with
    widths 256, 128 and 64. The output layer ``fc4`` maps to ``num_classes``
    raw logits.
    """

    def __init__(self, input_dim, num_classes):
        """Create the layers; attribute names and creation order are kept as they were."""
        super().__init__()
        # Stage 1: input_dim -> 256
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.5)

        # Stage 2: 256 -> 128
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.5)

        # Stage 3: 128 -> 64
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(p=0.5)

        # Output layer: 64 -> num_classes
        self.fc4 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Apply the three hidden stages in order and return the logits of ``fc4``."""
        hidden_stages = (
            (self.fc1, self.bn1, self.relu1, self.dropout1),
            (self.fc2, self.bn2, self.relu2, self.dropout2),
            (self.fc3, self.bn3, self.relu3, self.dropout3),
        )
        for linear, norm, activation, dropout in hidden_stages:
            x = dropout(activation(norm(linear(x))))
        return self.fc4(x)

# Parameters
# input_dim = 6144 
# 12288 = 32 * 384: presumably 32 frames per video times a 384-wide per-frame
# feature vector - TODO confirm against the shape of a saved features.npy.
input_dim = 12288
num_classes = 2 
# Instantiate the model
# NOTE(review): this assignment replaces any `clf` created by an earlier cell;
# the optimizer, training and evaluation cells below use this instance.
clf = EnhancedClassifier(input_dim, num_classes)

In [263]:
# Loss function and optimizer
# NOTE(review): batch_size is re-assigned here with the same value; the train and
# validation loaders built earlier are unaffected, only the later test loader reads it.
batch_size = 16
criterion = nn.CrossEntropyLoss()
# The optimizer is bound to the parameters of the current `clf`; re-run this cell
# whenever `clf` is re-created, otherwise training updates the old instance.
optimizer = optim.Adam(clf.parameters(), lr=0.001)
# StepLR multiplies the learning rate by gamma after every step_size calls to scheduler.step().
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1) 

In [264]:
from sklearn.metrics import precision_recall_fscore_support

In [265]:
# Training....
# Each epoch runs one pass over train_loader, then evaluates on val_loader and
# finally advances the learning-rate scheduler once.
num_epochs = 20

for epoch in range(num_epochs):
    clf.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = clf(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so that dividing by the dataset
        # size below gives a per-sample average (assumes criterion returns the
        # batch mean, i.e. the default reduction).
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

    # Validation
    clf.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    
    # Labels and predictions of the whole validation pass, collected for the
    # precision/recall/F1 computation below.
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = clf(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            # Predicted class = index of the largest logit along dimension 1.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
    
    val_loss /= len(val_loader.dataset)
    accuracy = 100 * correct / total
    # average='weighted': per-class scores averaged with class support as weights.
    precision, recall, f1_score, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
    print(f"Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")
    # Step the learning rate scheduler
    scheduler.step()

Epoch 1/20, Loss: 0.7597
Validation Loss: 0.6953, Accuracy: 53.19%, Precision: 0.5669, Recall: 0.5319, F1 Score: 0.5336
Epoch 2/20, Loss: 0.6930
Validation Loss: 0.6767, Accuracy: 55.32%, Precision: 0.5570, Recall: 0.5532, F1 Score: 0.5549
Epoch 3/20, Loss: 0.6163
Validation Loss: 0.6549, Accuracy: 57.45%, Precision: 0.5545, Recall: 0.5745, F1 Score: 0.5552
Epoch 4/20, Loss: 0.6313
Validation Loss: 0.6606, Accuracy: 57.45%, Precision: 0.5478, Recall: 0.5745, F1 Score: 0.5443
Epoch 5/20, Loss: 0.5501
Validation Loss: 0.6716, Accuracy: 53.19%, Precision: 0.4581, Recall: 0.5319, F1 Score: 0.4652
Epoch 6/20, Loss: 0.4967
Validation Loss: 0.6841, Accuracy: 53.19%, Precision: 0.4581, Recall: 0.5319, F1 Score: 0.4652
Epoch 7/20, Loss: 0.5181
Validation Loss: 0.6829, Accuracy: 57.45%, Precision: 0.5163, Recall: 0.5745, F1 Score: 0.4929
Epoch 8/20, Loss: 0.4808
Validation Loss: 0.6768, Accuracy: 46.81%, Precision: 0.4029, Recall: 0.4681, F1 Score: 0.4224
Epoch 9/20, Loss: 0.5072
Validation Loss

In [266]:
# Evaluate the clf on the test split and store one row per sample
# (feature path, true label, predicted label) in a CSV file.
test_csv = 'test.csv'
features_base_path = 'features_dinovits_32'
test_dataset = VideoFeatureDataset(features_base_path, test_csv)
# shuffle=False keeps the loader order equal to the order of test_dataset.data,
# which the path lookup below relies on.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

clf.eval()
test_loss = 0.0
correct = 0
total = 0

all_labels = []
all_predictions = []

test_statistics = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = clf(inputs)
        loss = criterion(outputs, labels)
        # Weight the batch loss by the batch size for a per-sample average below.
        test_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        # Before it is incremented, `total` is the dataset index of this batch's
        # first sample. Indexing test_dataset.data with the in-batch position alone
        # would record the first batch's paths for every later batch.
        batch_start = total
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())
        # Collect statistics
        for i in range(len(labels)):
            test_statistics.append([test_dataset.data[batch_start + i], labels[i].item(), predicted[i].item()])

test_loss /= len(test_loader.dataset)
accuracy = 100 * correct / total
precision, recall, f1_score, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
print(f"Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

# Save test statistics to a CSV file
output_csv = 'test_statistics_multilayer.csv'
with open(output_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Path", "True Label", "Predicted Label"])
    writer.writerows(test_statistics)

print(f"Test statistics saved to {output_csv}")


Test Loss: 0.6225, Accuracy: 68.18%, Precision: 0.7092, Recall: 0.6818, F1 Score: 0.6515
Test statistics saved to test_statistics_multilayer.csv
