In [16]:
import os
import json
import torch
import random
import librosa
import numpy as np

from transformers import AutoProcessor
from hyperpyyaml import load_hyperpyyaml

from trainer.utils import load_lora
from dataset.vggsound_clsmc import ReproduceVGGSoundCLSMC
# you can pass your own function if you have a different file structure
# my audio is named: _id + '_' + str(int(start_second)).zfill(6) + '.wav'
# e.g. audio_root/-Lmibx_Iu_E_000173.wav
# '-Lmibx_Iu_E' is the youtube id; 000173 is the start second
from dataset.vggsound_clsmc import get_audio_path

SEED = 1234

# Seed every RNG this notebook touches, in one pass: torch (CPU, current GPU,
# all GPUs), Python's `random`, and NumPy's legacy global generator.
for _seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    random.seed,
    np.random.seed,
):
    _seed_fn(SEED)

In [2]:
# --- Data locations -------------------------------------------------------
audio_root = '/engram/naplab/shared/VGGSound/audio'
video_root = '/engram/naplab/shared/VGGSound/scratch/shared/beegfs/hchen/train_data/VGGSound_final/video'

# --- Manifests ------------------------------------------------------------
test_json = 'manifests/VGGSound/good/good_test_uni.json'
label_distributions_json = 'manifests/VGGSound/label_distributions.json'

# --- Multiple-choice setup ------------------------------------------------
n_choice = 4
neg_choices_by = 'distribution'

# Q/A file produced by an earlier run; it is reloaded (not regenerated) so
# that every evaluation is scored on exactly the same questions.
QA_json = f'manifests/VGGSound/QA/good_test_uni_MC{str(n_choice)}_{neg_choices_by}.json'

# Destination of the evaluation results; keep its directory name in sync with
# the QA_json file name.
# NOTE(review): this path carries no '.json' extension -- confirm that is intended.
results_json = f'results/VGGSound/QA/good_test_uni_MC{str(n_choice)}_{neg_choices_by}/qwen2audio_ALL_VGGSound_MC4_distribution_r16_ep1'

In [3]:
# Checkpoint directory; the 'lora.pt' file inside it is loaded into the model below.
ckpt_path = '/engram/naplab/users/xj2289/ckpts/project_synesthesia/Qwen2Audio/checkpoint-17000'
# HyperPyYAML file that is parsed (with evaluation overrides) to build the model.
hparams_file = 'hparams/sft/qwen2audio_all_vggsound_clsmc.yaml'

## Generate the Q/A file the first time only; load it afterwards for evaluation consistency.

In [4]:
# from dataset.vggsound_clsmc import RandomVGGSoundCLSMC

# test_set = RandomVGGSoundCLSMC(
#     manifest_path=test_json,
#     label_distributions_path=label_distributions_json,
#     neg_choices_by='distribution',
#     pos_choice_pos='random',
#     n_choice=n_choice,
# )

# QAs = [x for x in test_set]

# print(len(QAs))
# print(QAs[0])

# json.dump(QAs, open(QA_json, 'w'), indent=4)

## Load data

In [5]:
# Build the test set from the previously saved Q/A file rather than sampling
# new questions. `get_path_fn` is the hook for mapping an item to its audio
# file under `data_root` (swap it if your files are laid out differently).
test_set = ReproduceVGGSoundCLSMC(
    qa_path=QA_json,
    data_root=audio_root,
    get_path_fn=get_audio_path
)

## Load model

In [6]:
# Values that replace the corresponding entries of the YAML file for this run.
overrides = dict(experiment='eval', rank=16)

with open(hparams_file) as hparams_fh:
    hparams = load_hyperpyyaml(hparams_fh, overrides)

# The model object is taken from the parsed hyperparameters under 'lora_qwen'.
model = hparams['lora_qwen']
model.print_trainable_parameters()

# Processor (chat template, tokenizer, audio feature extractor) for Qwen2-Audio.
processor = AutoProcessor.from_pretrained('Qwen/Qwen2-Audio-7B-Instruct')

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 51,773,440 || all params: 8,448,868,352 || trainable%: 0.612785497927001




In [11]:
# Hand the checkpoint's 'lora.pt' to the project's load_lora helper. The return
# value is not used, so the helper presumably updates `model` in place -- TODO confirm.
load_lora(model, os.path.join(ckpt_path, 'lora.pt'))

In [12]:
# Move the model onto the GPU; `infer` sends its inputs to CUDA as well.
model = model.cuda()

In [18]:
@torch.no_grad()
def infer(wav_path, question, max_length=256):
    """Ask the model one question about one audio clip and return its reply.

    Args:
        wav_path: Path of the audio file; it is decoded with ``librosa.load``.
        question: Question text placed after the audio in the user turn.
        max_length: Forwarded unchanged to ``model.generate`` as ``max_length``.
            NOTE(review): in Hugging Face ``generate`` this limit counts the
            prompt tokens too; confirm that is the intended budget.

    Returns:
        The decoded text of the tokens generated after the prompt, with
        special tokens stripped.
    """
    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': [
            {'type': 'audio', 'audio_url': wav_path},
            {'type': 'text', 'text': question},
        ]},
    ]

    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

    # Use ONE sampling rate for both decoding and feature extraction, so the
    # rate reported to the processor always matches the rate of the waveform.
    sampling_rate = processor.feature_extractor.sampling_rate

    # Collect one waveform per audio entry of the conversation, each loaded
    # from that entry's own URL.
    audios = [
        librosa.load(ele['audio_url'], sr=sampling_rate)[0]
        for message in conversation
        if isinstance(message['content'], list)
        for ele in message['content']
        if ele['type'] == 'audio'
    ]

    inputs = processor(text=text, audios=audios, sampling_rate=sampling_rate, return_tensors='pt', padding=True)

    # Move every input tensor to the GPU, where the model lives.
    for k, v in inputs.items():
        inputs[k] = v.cuda()

    generate_ids = model.generate(**inputs, max_length=max_length)
    # generate() returns prompt + continuation; keep only the continuation.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]

    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    return response


## Metrics

In [19]:
import re

# Valid option letters: the first `n_choice` capitals starting at 'A'.
choices = [chr(ord('A') + offset) for offset in range(n_choice)]

def accuracy(answer, solution):
    """Reduce a free-form model reply to a single choice letter and score it.

    The reply is upper-cased and stripped of everything except the letters
    A-Z. If more than one letter remains, the first letter is used when it is
    a valid choice, otherwise the last letter when that is a valid choice;
    if neither is, the reply is printed and treated as unparseable ('').
    A reply with no letters at all is also treated as unparseable instead of
    raising ``IndexError``.

    Args:
        answer: Raw text produced by the model.
        solution: Ground-truth choice letter (compared case-insensitively).

    Returns:
        A ``(letter, is_correct)`` tuple, where ``letter`` is the extracted
        answer ('' when unparseable) and ``is_correct`` is whether it equals
        the upper-cased solution.
    """
    answer = answer.upper()
    solution = solution.upper()

    answer = re.sub(r'[^A-Z]', '', answer)  # only keep letters

    # Only disambiguate when there are several letters. An empty string must
    # skip this branch: indexing answer[0] on it would raise IndexError.
    if len(answer) > 1:
        if answer[0] in choices:  # [A, B, C, D]
            answer = answer[0]
        elif answer[-1] in choices:  # [A, B, C, D]
            answer = answer[-1]
        else:
            print(answer)
            answer = ''

    return answer, answer == solution
    

## Evaluate!

In [None]:
from tqdm import tqdm

# Running tallies that drive the accuracy shown on the progress bar.
total, correct = 0, 0
results = []
tqdm_bar = tqdm(test_set)

for x in tqdm_bar:
    # Query the model, then reduce its free-form reply to a choice letter.
    answer, correct_bool = accuracy(
        infer(x['path'], x['question'], max_length=256),
        x['solution'],
    )

    # Record the outcome on the sample itself and keep it for saving later.
    x['correct'] = correct_bool
    x['answer'] = answer
    results.append(x)

    total += 1
    correct += int(correct_bool)

    tqdm_bar.set_description(f"Running Acc: {str(round(correct/total*100, 2))}%")
    

  0%|          | 0/14202 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
Running Acc: 95.91%:   3%|▎         | 391/14202 [03:49<2:40:47,  1.43it/s]

## Save results

In [11]:
# Create the output directory first so the write cannot fail with
# FileNotFoundError after the full evaluation has already run.
os.makedirs(os.path.dirname(results_json) or '.', exist_ok=True)

# A context manager guarantees the file is flushed and closed.
with open(results_json, 'w') as results_file:
    json.dump(results, results_file, indent=4)

In [16]:
# Sanity check: display the concrete class of the loaded model object.
type(model)

transformers.models.qwen2_audio.modeling_qwen2_audio.Qwen2AudioForConditionalGeneration