In [2]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification
import torch
import torchaudio


feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

model = model.to('cuda:0')

def inference_top_k(fpath, k=5, vname='Sliding door', with_logit=False):
    data, sr = torchaudio.load(fpath)
    data = torchaudio.functional.resample(data, orig_freq=sr, new_freq=16000)
    data = data.squeeze()

    # audio file is decoded on the fly
    inputs = feature_extractor(data, sampling_rate=16000, return_tensors="pt")
    inputs = inputs.to('cuda:0')
    
    with torch.no_grad():
        logits = model(**inputs).logits

    logits = logits.squeeze()

    predicted_class_ids = torch.argsort(logits)[-k:]
    predicted_label = [model.config.id2label[_id.item()] for _id in predicted_class_ids]

    if with_logit:
        return predicted_label, sorted(logits)[-k:]
    else:
        return predicted_label

  from .autonotebook import tqdm as notebook_tqdm


In [59]:
tqdm_bar = tqdm(glob('/home/aicontest/DF/data/audio/test/*.ogg'))
for file in tqdm_bar:
    removed = moons_removal(file)

    file_name = file.split("/")[-1][:-3] + "wav"
    file_path = f"/home/aicontest/DF/data/audio/moon_test_data/{file_name}"
    torchaudio.save(file_path, removed, 16000)

100%|██████████| 50000/50000 [12:11<00:00, 68.39it/s]  


In [16]:
non_speech_list = []

with open("../Audio-Denoising/non_speech.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        file_path = line.split(' ')[0]
        file_name = file_path.split('/')[-1][:-4]
        non_speech_list.append(file_name)


In [17]:
import pandas as pd

submission_df = pd.read_csv("/home/aicontest/DF/result/uijin/submit_240703_173537.csv")
submission_df.loc[submission_df['id'].isin(non_speech_list), ['fake', 'real']] = 0
submission_df

Unnamed: 0,id,fake,real
0,TEST_00000,0.999667,0.999947
1,TEST_00001,0.999965,0.987061
2,TEST_00002,0.999998,0.097766
3,TEST_00003,0.000035,0.999998
4,TEST_00004,1.000000,0.000006
...,...,...,...
49995,TEST_49995,0.022392,0.966091
49996,TEST_49996,0.998367,0.008262
49997,TEST_49997,0.999999,0.965079
49998,TEST_49998,0.000326,0.999932


In [20]:
submission_df.to_csv("./masked_result.csv", index=False)

In [3]:
path = "/home/aicontest/DF/data/audio/test/TEST_00178.ogg"
labels, prob = inference_top_k(fpath=path, k=5, with_logit=True)

print(labels)
print(prob)
if 'Narration, monologue' in labels and 'Speech' in labels:
    print("This!")


['Inside, small room', 'Music', 'Speech', 'Siren', 'Civil defense siren']
[tensor(-4.8782, device='cuda:0'), tensor(-3.5884, device='cuda:0'), tensor(0.1119, device='cuda:0'), tensor(0.1708, device='cuda:0'), tensor(0.3438, device='cuda:0')]


In [53]:
from glob import glob
from tqdm import tqdm
from collections import defaultdict

d = defaultdict(int)

counts = 0

paths = glob("/home/aicontest/DF/data/audio/train/*.ogg")
avg = []
tqdm_bar = tqdm(paths)



with open("./test_only_speech_list.txt", "w") as tf:
    for path in tqdm_bar:
        labels, prob = inference_top_k(fpath=path, k=2, with_logit=True)

        for label in labels:
            d[label] += 1

        if labels[-1] == 'Speech' and prob[-1].item() > 1.4:
            tf.write(path + "\n")
            counts += 1

        tqdm_bar.set_postfix(only_speech=d)

  0%|          | 0/55438 [00:00<?, ?it/s]

  5%|▍         | 2567/55438 [02:28<51:06, 17.24it/s, only_speech=defaultdict(<class 'int'>, {'Male speech, man speaking': 761, 'Speech': 2565, 'Female speech, woman speaking': 379, 'Speech synthesizer': 580, 'Narration, monologue': 311, 'Sound effect': 68, 'Inside, small room': 34, 'Clicking': 7, 'White noise': 9, 'Heart sounds, heartbeat': 13, 'Hum': 17, 'Gasp': 19, 'Animal': 31, 'Mains hum': 2, 'Spray': 3, 'Conversation': 44, 'Raindrop': 1, 'Owl': 3, 'Zipper (clothing)': 29, 'Knock': 5, 'Music': 30, 'Chopping (food)': 4, 'Bird': 2, 'Chink, clink': 12, 'Oink': 6, 'Throat clearing': 24, 'Slap, smack': 30, 'Tick-tock': 5, 'Single-lens reflex camera': 30, 'Frog': 6, 'Biting': 2, 'Static': 6, 'Vehicle': 8, 'Chop': 1, 'Mouse': 8, 'Meow': 3, 'Finger snapping': 3, 'Crunch': 4, 'Door': 1, 'Typing': 1, 'Whistle': 2, 'Television': 14, 'Rain on surface': 2, 'Coo': 1, 'Rustling leaves': 2, 'Pink noise': 2, 'Telephone': 1, 'Writing': 1, 'Plop': 2, 'Cattle, bovinae': 2, 'Tick': 5, 'Water tap, fauce

KeyboardInterrupt: 