In [1]:
import json
import os
import random

import librosa
import numpy as np
import torch

from dataset.vggsound_clsmc import ReproduceVGGSoundCLSMC
# you can pass your own function if you have a different file structure
# my audio is named: _id + '_' + str(int(start_second)).zfill(6) + '.wav'
# e.g. audio_root/-Lmibx_Iu_E_000173.wav
# '-Lmibx_Iu_E' is the youtube id; 000173 is the start second
from dataset.vggsound_clsmc import get_video_path

# One fixed seed for every random number generator this notebook touches,
# so repeated runs draw the same random numbers.
SEED = 1234

for seed_fn in (
    torch.manual_seed,           # PyTorch default generator
    torch.cuda.manual_seed,      # current CUDA device
    torch.cuda.manual_seed_all,  # every CUDA device
    random.seed,                 # Python's built-in RNG
    np.random.seed,              # NumPy legacy global RNG
):
    seed_fn(SEED)

In [2]:
# Dataset locations and manifests.
audio_root = '/engram/naplab/shared/VGGSound/audio'
video_root = '/engram/naplab/shared/VGGSound/scratch/shared/beegfs/hchen/train_data/VGGSound_final/video'
test_json = 'manifests/VGGSound/good/good_test_uni.json'
label_distributions_json = 'manifests/VGGSound/label_distributions.json'

# Multiple-choice settings; both values are baked into the file names below.
neg_choices_by = 'distribution'
n_choice = 4

# Video sampling settings handed to the model at inference time.
FPS = 1
MAXP = 512 ** 2

# The Q/A file was generated in an earlier run; it is loaded (not regenerated)
# so every evaluation sees the same questions.
QA_json = f'manifests/VGGSound/QA/good_test_uni_MC{n_choice}_{neg_choices_by}.json'

# Where results are written. The directory name must stay in sync with QA_json.
results_json = (
    f'results/VGGSound/QA/good_test_uni_MC{n_choice}_{neg_choices_by}/'
    f'Qwen2-VL-7B-Instruct-FPS{FPS}-MP{MAXP}.json'
)

## Generate Q/A the first time; load it later for evaluation consistency

In [3]:
# Nothing is generated in this notebook: the Q/A file referenced by QA_json
# is reused from an earlier run, so this cell only prints a reminder.
print('Should be generated already in eval_qwen2audio_clsmc')

Should be generated already in eval_qwen2audio_clsmc


## Load data

In [4]:
# Build the evaluation set from the previously saved Q/A file so the
# questions and answer options match earlier runs.
# NOTE(review): presumably get_video_path maps each item to a file under
# video_root — confirm against dataset.vggsound_clsmc.
test_set = ReproduceVGGSoundCLSMC(
    qa_path=QA_json,
    data_root=video_root,
    get_path_fn=get_video_path
)

## Load model

In [5]:
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# Single checkpoint id shared by the processor and the model weights.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# The processor builds the chat prompt and packs text + video into tensors.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the weights in bfloat16 directly onto the GPU.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",  # left disabled in this run
)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
@torch.no_grad()
def infer(video_path, question, max_new_tokens=32, fps=1.0, max_pixels=512*512):
    """Ask the model one question about one video and return its text reply.

    Args:
        video_path: Video location, placed in the 'video' field of the user message.
        question: Text prompt placed after the video in the same user turn.
        max_new_tokens: Upper bound on the number of generated tokens.
        fps: Forwarded as the 'fps' field of the video entry.
        max_pixels: Forwarded as the 'max_pixels' field of the video entry.

    Returns:
        The decoded reply as a string: prompt tokens are stripped and special
        tokens are skipped.

    Raises:
        AssertionError: if the vision preprocessing returns image inputs,
            which is unexpected for a video-only message.
    """
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': [
            {'type': 'video', 'video': video_path, 'fps': fps, 'max_pixels': max_pixels},
            {'type': 'text', 'text': question},
        ]},
    ]

    # Render the chat template to text; tokenization happens in `processor` below.
    text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    image_inputs, video_inputs = process_vision_info(messages)

    # Identity check: `== None` may be overloaded by array-like objects.
    assert image_inputs is None, "video-only prompt unexpectedly produced image inputs"

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device rather than hard-coding "cuda".
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return output_text


## Metrics

In [7]:
import re

# Option letters for the multiple-choice questions: 'A', 'B', ... (n_choice of them).
choices = [chr(ord('A') + offset) for offset in range(n_choice)]

def accuracy(answer, solution, valid_choices=None):
    """Reduce a free-form model reply to one option letter and score it.

    The reply is upper-cased and stripped of everything except A-Z. If more
    (or fewer) than one letter remains, the first letter is used when it is a
    valid option, otherwise the last letter; if neither is valid the reply is
    printed and treated as unanswered ('').

    Args:
        answer: Raw text produced by the model.
        solution: The correct option letter (case-insensitive).
        valid_choices: Accepted option letters. Defaults to the notebook-level
            `choices` list when omitted.

    Returns:
        A tuple `(parsed_answer, is_correct)`.
    """
    if valid_choices is None:
        valid_choices = choices

    answer = answer.upper()
    solution = solution.upper()

    answer = re.sub(r'[^A-Z]', '', answer)  # only keep letters

    if len(answer) != 1:
        # `answer` may be empty here (e.g. the reply was "" or "1"); guard the
        # indexing so one unparseable reply cannot abort the whole evaluation.
        if answer and answer[0] in valid_choices:
            answer = answer[0]
        elif answer and answer[-1] in valid_choices:
            answer = answer[-1]
        else:
            print(answer)
            answer = ''

    return answer, answer == solution
    

## Evaluate!

In [None]:
from tqdm import tqdm

# Per-sample records plus running tallies for the live accuracy readout.
results = []
correct = 0
total = 0
tqdm_bar = tqdm(test_set)

for sample in tqdm_bar:
    # Query the model under bfloat16 autocast.
    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        raw_reply = infer(
            sample['path'],
            sample['question'],
            max_new_tokens=32,
            fps=FPS,
            max_pixels=MAXP,
        )

    # Reduce the reply to one option letter and score it.
    answer, correct_bool = accuracy(raw_reply, sample['solution'])

    # Store the outcome on the sample itself so it is saved with the results.
    sample['correct'] = correct_bool
    sample['answer'] = answer
    results.append(sample)

    total += 1
    if correct_bool:
        correct += 1

    running_acc = round(correct / total * 100, 2)
    tqdm_bar.set_description(f"Running Acc: {running_acc}%")


  0%|          | 0/14202 [00:00<?, ?it/s]qwen-vl-utils using decord to read video.
Running Acc: 81.67%:   0%|          | 60/14202 [01:19<5:02:01,  1.28s/it]

## Save results

In [None]:
# Create the output directory first: results_json points into a nested folder,
# and a missing folder would raise only after the long evaluation has finished.
results_dir = os.path.dirname(results_json)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Use a context manager so the file is flushed and closed deterministically.
with open(results_json, 'w') as results_file:
    json.dump(results, results_file, indent=4)