In [None]:
# download data from aws

import csv

# read split csvs
# Every CSV row is converted to its list repr (e.g. "['<path> <label>']") and cut at
# the first space, so only the path token is kept. That token still carries the
# repr's leading "['", which stays attached to the first "/"-separated piece; dropping
# the first four pieces removes it together with the leading path components.
videos = []
for split_csv in ("test.csv", "train.csv", "val.csv"):
    with open(split_csv, 'r') as handle:
        for record in csv.reader(handle):
            first_token = str(record).split(" ")[0]
            videos.append("/".join(first_token.split("/")[4:]))

# use aws cli to download videos
# For every key collected in `videos`, create the matching local folder under
# dataset/guesswhat/ and fetch the object from the S3 bucket into it.
# `{path}` / `{video}` are IPython interpolations of the Python variables into the shell line.
# NOTE(review): the values are interpolated unquoted, so a key containing spaces or shell
# metacharacters would presumably break the command -- confirm the keys are safe.
for video in videos:
    # local directory = the key without its final component (the file name)
    path = "dataset/guesswhat/" + "/".join(video.split("/")[:-1])
    !mkdir -p {path}
    # stdout is redirected with `>`, so download.txt is overwritten on every iteration
    !aws s3api get-object --bucket headsup-du1r3b78fy --key {video} dataset/guesswhat/{video} > download.txt


# reformat split csvs
# Rewrite each split file in the working directory from its copy under split/:
# the first four "/"-separated pieces of every entry are replaced by the local
# "dataset/guesswhat/" prefix, and the rest of the entry is kept as is.
# NOTE(review): this overwrites the same test/train/val.csv files that the
# "read split csvs" step opens, so re-running that step afterwards would parse the
# already-reformatted paths -- confirm the intended run order.
for file in ["test.csv", "train.csv", "val.csv"]:
    # The source is opened first so that a missing split/<file> raises before the
    # destination is opened in 'w' mode (which truncates it).
    with open("split/" + file, 'r') as f2, open(file, 'w') as f1:
        for row in csv.reader(f2):
            if not row:
                # blank line in the source: there is no entry to rewrite
                continue
            # Re-join the parsed fields instead of slicing the list repr of the row.
            entry = ",".join(row)
            f1.write("dataset/guesswhat/" + "/".join(entry.split("/")[4:]) + "\n")
            

To fine-tune the video foundation models, I used the code provided in the <a href="https://github.com/OpenGVLab/InternVideo/tree/main/InternVideo1/Pretrain/VideoMAE">VideoMAE GitHub</a> repository and made a few changes to accommodate the "GuessWhat" dataset and the binary classification task.

<img src="images/1.png" alt="drawing" width="500"/>
<img src="images/2.png" alt="drawing" width="500"/>

The remaining changes consisted of minor, one-line edits scattered throughout the Python project. These modifications adjusted various settings, such as the number of frames, and removed or reworked code that was causing errors.

In [None]:
# sample script to fine-tune model
# change model configuration for different experiments
# NOTE(review): a shebang only takes effect when it is the very first line of the
# script file; below it sits after these comments, so it acts as a plain comment here.

#!/bin/bash

# Define paths and any environment variables first
base_path='/home/cathyhou'

# All three locations below are derived from base_path.
OUTPUT_DIR="$base_path/InternVideo/InternVideo/InternVideo1/Pretrain/VideoMAE/outputs/test_model/tta_ft_k400_ft_ssbd"
DATA_PATH="$base_path/splits"
MODEL_PATH="$base_path/models/tta_ft_k400_ft_ssbd.pth"

# NOTE(review): presumably enabled to make CUDA errors surface at the failing call
# while debugging -- confirm it is still wanted for regular runs.
export CUDA_LAUNCH_BLOCKING=1
# Environment variables
# MASTER_PORT is computed as 12000 + (RANDOM % 20001). RANDOM is a bash variable, so
# the value is only randomised when the script is actually run by bash.
export MASTER_PORT=$((12000 + RANDOM % 20001))
export OMP_NUM_THREADS=1
export DS_BUILD_OPS=1

# Execute the python script with necessary parameters using -u for unbuffered output
# --data_path receives $DATA_PATH, --finetune receives $MODEL_PATH, and both
# --log_dir and --output_dir receive $OUTPUT_DIR.
python -u run_class_linear.py \
    --model vit_base_patch16_224 \
    --data_set GW \
    --nb_classes 2 \
    --data_path "$DATA_PATH" \
    --finetune "$MODEL_PATH" \
    --log_dir "$OUTPUT_DIR" \
    --output_dir "$OUTPUT_DIR" \
    --batch_size 8 \
    --input_size 224 \
    --short_side_size 224 \
    --save_ckpt_freq 10 \
    --num_frames 16 \
    --sampling_rate 8 \
    --opt adamw \
    --lr 1e-3 \
    --layer_decay 0.90 \
    --num_workers 1 \
    --opt_betas 0.9 0.999 \
    --weight_decay 0.05 \
    --epochs 10 \
    --drop_path 0.35 \
    --auto_resume \
    --test_num_segment 2 \
    --test_num_crop 3 \
    --dist_eval --enable_deepspeed

In [2]:
# generate results

import csv
import numpy as np
from scipy.special import softmax
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

csv_path = "/home/cathyhou/splits/test.csv"
results_dir = "/home/cathyhou/InternVideo/InternVideo/InternVideo1/Pretrain/VideoMAE/results"

# make dict {shortened path: full path}
# The shortened key is built from the last two "/"-separated components of the full
# path, cut at the first "." (so the file extension is dropped).
videos = {}
with open(csv_path) as split_file:  # context manager so the handle is closed
    for line in split_file:
        fields = line.split()
        if not fields:
            # blank line: there is no path to index
            continue
        path = fields[0]  # first whitespace-separated token of the line
        videos["/".join(path.split("/")[-2:]).split(".")[0]] = path
       
# generate results csv 
def compute_video(lst):
    """Turn the collected scores of one video into a single results row.

    Args:
        lst: four-item sequence ``(index, video_id, data, label)``. The index is
            unpacked but not used. ``data`` is an iterable of equally sized score
            vectors, one per kept line of the model output.

    Returns:
        ``[full_path, label, pred]`` where ``full_path`` is looked up in the
        module-level ``videos`` dict by the stripped ``video_id`` and ``pred`` is
        the argmax index of the element-wise mean of the score vectors.

    Raises:
        KeyError: if the stripped ``video_id`` is not a key of ``videos``.
    """
    _, clip_key, clip_scores, true_label = lst
    mean_scores = np.mean(list(clip_scores), axis=0)
    predicted = np.argmax(mean_scores)
    return [videos[clip_key.strip()], true_label, predicted]

def generate_results(test_file, results_dir, results_filename): 
    """Aggregate raw per-clip model outputs into one prediction per video and save them as CSV.

    The first line of ``test_file`` is skipped. Every following non-blank line is
    parsed as ``<name>[<comma-separated scores>] <label> <chunk> <split>``: the text
    before ``[`` is the video name, the text between ``[`` and ``]`` is the score
    vector, and the space-separated tokens after ``]`` are label, chunk number and
    split number. Scores are passed through softmax; a repeated (chunk, split) pair
    for the same video is ignored.

    Args:
        test_file: path of the raw model output file.
        results_dir: directory of the CSV to write.
        results_filename: file name appended directly to ``results_dir`` (no
            separator is inserted, so it should start with "/").

    Side effects:
        Prints the number of distinct videos and writes a CSV with the header
        ``Path, True Label, Predicted Label`` followed by one row per video, as
        produced by ``compute_video``.

    Raises:
        IndexError: if a non-blank line does not follow the expected layout.
    """
    dict_feats = {}
    dict_label = {}
    dict_pos = {}

    # Read everything up front inside a context manager so the handle is closed.
    with open(test_file, 'r') as f:
        lines = f.readlines()[1:]

    for line in lines:
        line = line.strip()
        if not line:
            # e.g. a trailing empty line at the end of the file
            continue
        name = line.split('[')[0]
        tail = line.split(']')[1].split(' ')  # ['', label, chunk, split]
        label, chunk_nb, split_nb = tail[1], tail[2], tail[3]
        data = np.fromstring(line.split('[')[1].split(']')[0],
                             dtype=float,
                             sep=',')
        # A tuple key keeps (chunk, split) pairs distinct; concatenating the two
        # strings would make e.g. ("1", "12") and ("11", "2") look identical.
        position = (chunk_nb, split_nb)
        if name not in dict_feats:
            dict_feats[name] = []
            dict_pos[name] = set()
        if position in dict_pos[name]:
            continue
        dict_feats[name].append(softmax(data))
        dict_pos[name].add(position)
        dict_label[name] = label

    print(len(dict_feats))
    input_lst = [[i, name, dict_feats[name], dict_label[name]]
                 for i, name in enumerate(dict_feats)]

    # newline='' is what the csv module expects for files handed to csv.writer.
    with open(results_dir+results_filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Path", "True Label", "Predicted Label"])
        for item in input_lst:
            writer.writerow(compute_video(item))
            
# get scores
def get_scores(results_dir, results_filename):
    """Print accuracy, precision, recall and F1 for a results CSV.

    The file at ``results_dir + results_filename`` is read with the csv module;
    its first row is skipped as a header, and columns 1 and 2 of every remaining
    row are converted to ``int`` as true label and predicted label respectively.

    Args:
        results_dir: directory containing the results CSV.
        results_filename: file name appended directly to ``results_dir`` (no
            separator is inserted, so it should start with "/").

    Raises:
        ValueError: if a label or prediction field is not an integer.
        IndexError: if a non-empty row has fewer than three columns.
    """
    preds = []
    labels = []

    # csv.reader is used instead of splitting on "," so that fields which the csv
    # writer quoted (e.g. a path containing a comma) are parsed back correctly.
    with open(results_dir+results_filename, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # blank line
            preds.append(int(row[2]))
            labels.append(int(row[1]))

    print("accuracy: ", accuracy_score(labels, preds))
    print("precision: ", precision_score(labels, preds))
    print("recall: ", recall_score(labels, preds))
    print("f1 score: ", f1_score(labels, preds))

In [None]:
# Run the experiment script through the shell.
# NOTE(review): invoking the script with `sh` bypasses any `#!/bin/bash` line inside it;
# if exp1.sh relies on bash-only features (e.g. $RANDOM), confirm that `sh` is bash on
# this machine or run it with bash instead.
!sh exp1.sh

In [98]:
# Score the exp1 run: aggregate the raw output in test_file into a per-video CSV,
# then print the metrics computed from that CSV.
# results_filename starts with "/" because both functions build the output path as
# results_dir + results_filename without inserting a separator.
# results_dir is the module-level value set in the "generate results" cell.
results_filename = "/exp1_ssbd.csv"
test_file = "/home/cathyhou/InternVideo/InternVideo/InternVideo1/Pretrain/VideoMAE/outputs/exp1/tta_ft_k400_ft_ssbd/0.txt"

generate_results(test_file, results_dir, results_filename)
get_scores(results_dir, results_filename)

43
accuracy:  0.7674418604651163
precision:  0.8260869565217391
recall:  0.76
f1 score:  0.7916666666666666


In [3]:
# Score the exp4 run: aggregate the raw output in test_file into a per-video CSV,
# then print the metrics computed from that CSV.
# results_filename starts with "/" because both functions build the output path as
# results_dir + results_filename without inserting a separator.
# results_dir is the module-level value set in the "generate results" cell.
results_filename = "/exp4_intern.csv"
test_file = "/home/cathyhou/InternVideo/InternVideo/InternVideo1/Pretrain/VideoMAE/outputs/exp4/vit_b_hybrid_pt_800e/0.txt"

generate_results(test_file, results_dir, results_filename)
get_scores(results_dir, results_filename)

43
accuracy:  0.627906976744186
precision:  0.6153846153846154
recall:  0.96
f1 score:  0.75
