# Evaluating Yamnet Segmentation Model using data_w_noise_white0.03_nbn10_nbd0.5_w-beeping_w-talking dataset

In [1]:
import pandas as pd
import os
import ast

In [2]:
# Dataset folder to evaluate. The location is written relative to the user's
# home directory; "~" is expanded so the notebook is not tied to one account.
dataset_dir_pattern = "~/AC297r/CoughAnalyzer/data_w_noise_white0.03_nbn10_nbd0.5_w-beeping_w-talking/"
audio_directory = os.path.expanduser(dataset_dir_pattern)

Get the true cough labels for the audio files that contain coughs, i.e. the recordings that have an accompanying `.txt` annotation file.

In [3]:
def read_cough_intervals(label_path):
    """Parse one annotation file into a list of ``[start, end]`` float pairs.

    Every non-blank line is split on whitespace and its first two fields are
    converted to floats; any further fields on the line are ignored.

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: one of the first two fields is not a valid float.
    """
    intervals = []
    with open(label_path, 'r') as label_file:
        for line in label_file:
            fields = line.split()
            if fields:  # skip empty lines
                intervals.append([float(fields[0]), float(fields[1])])
    return intervals


# Collect one record per annotation file and build the DataFrame once at the
# end; enlarging a DataFrame row by row with `.loc` re-allocates it each time.
label_records = []
for root, dirs, files in os.walk(audio_directory):
    for file in files:
        if file.endswith('.txt'): #change to '.wav' to analyze the CoughSegmentation dataset
            label_records.append({
                # `file` is already a bare name, so dropping the extension
                # yields the id that the rest of the notebook keys on.
                'filename': os.path.splitext(file)[0],
                'intervals': read_cough_intervals(os.path.join(root, file)),
            })

df_w_cough_true = pd.DataFrame(label_records, columns=['filename', 'intervals'])

In [4]:
# Preview the ground-truth table: one row per annotation file, with the file
# stem in `filename` and the parsed [start, end] pairs in `intervals`.
df_w_cough_true

Unnamed: 0,filename,intervals
0,1dd3b212-e969-4ede-a9d9-f24b711e2028,"[[0.782855, 1.01326], [1.198501, 1.388324], [2..."
1,7ae1ffe1-2259-411f-8ead-6c107e01e824,"[[1.520353, 2.373774], [4.211789, 4.555528], [..."
2,a4cc4680-8bb6-4646-b9cf-d77a4e8ada21,"[[1.778078, 2.284206], [2.301887, 2.655514], [..."
3,620ded24-220f-4ada-b032-2b5c170b279a,"[[1.349212, 1.763799], [1.807962, 2.026071], [..."
4,99d322ed-c367-4d45-b6d1-f008b47f1af9,"[[0.54401, 1.868028], [1.868028, 2.378799]]"
...,...,...
195,f5a661dc-8161-4842-b1c3-cd7265896101,"[[0.571248, 1.402015], [1.939479, 2.701144], [..."
196,98fb6294-d339-4c83-8ff5-2bbcf82e35e0,"[[0.759539, 2.415583], [2.473171, 2.875249], [..."
197,6dff10f3-5df0-4b5a-be62-613afd6115dc,"[[3.812107, 4.398915], [5.05404, 5.788743], [5..."
198,25b750e5-8a76-4c13-9fd8-851e45d1b5ed,"[[2.450166, 3.222867], [4.159943, 4.705751], [..."


The cough intervals detected by YAMNet were computed with `yamnet_evaluation_cough_intevals.py`.

In [5]:
def _file_stem(path):
    """Return the last path component of `path` with its extension removed."""
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem


# Load the detected intervals and reduce each `filename` to its bare stem so
# it can be compared with the `filename` column of the ground-truth table.
yamnet_intervals_csv = 'data_w_noise_white0.03_nbn10_nbd0.5_w-beeping_w-talking_yamnet_cough_intervals.csv'
df_w_cough_yamnet = pd.read_csv(yamnet_intervals_csv)
df_w_cough_yamnet['filename'] = df_w_cough_yamnet['filename'].map(_file_stem)


In [6]:
# Preview the detections table; `intervals` is still the string read from the
# CSV at this point and is parsed later with `ast.literal_eval`.
df_w_cough_yamnet

Unnamed: 0,filename,intervals
0,29241394-8a28-46f5-8b62-a062b4564c5c,"[(98.39999999999999, 99.35999999999999)]"
1,09de6967-b295-4516-8a4d-4d95c9a7b02c,"[(2.88, 3.84), (4.32, 5.76), (6.24, 7.2)]"
2,21c16c1c-46fc-4b80-b941-65d7c6e87555,"[(0.48, 7.2)]"
3,0969d0c4-34ce-4e9a-8cf1-1b18403587e8,"[(2.88, 4.32)]"
4,008c1c9e-aeef-40c5-846c-24f1b964f884,"[(1.92, 3.36), (4.8, 5.76)]"
...,...,...
174,de543d13-541c-4ad7-bb3c-c5c302de3aaf,"[(1.92, 3.84), (4.32, 6.239999999999999)]"
175,dddb1a55-d976-40bc-ad41-cdc713623e03,"[(5.279999999999999, 6.72)]"
176,21aee478-6d13-45ea-be4d-4f29fd244798,"[(1.92, 2.88), (3.36, 4.32), (4.8, 6.239999999..."
177,6647b629-2246-48c9-83ad-c3ad4795c891,"[(2.88, 3.84)]"


In [9]:
# File-level comparison: which recordings does each source list as containing coughs?
yamnet_files = set(df_w_cough_yamnet["filename"])
true_files = set(df_w_cough_true["filename"])

file_w_cough_for_yamnet_and_true = list(yamnet_files & true_files)      # listed by both
file_w_cough_for_yamnet_not_true = list(yamnet_files - true_files)      # listed by yamnet only
file_w_cough_for_not_yamnet_but_true = list(true_files - yamnet_files)  # listed by ground truth only
file_yamnet_and_true_union = list(yamnet_files | true_files)            # listed by either

In [10]:
# Sizes, in order: in both tables, yamnet only, ground truth only, union of both.
len(file_w_cough_for_yamnet_and_true), len(file_w_cough_for_yamnet_not_true), len(file_w_cough_for_not_yamnet_but_true), len(file_yamnet_and_true_union)

(173, 6, 27, 206)

Since there are 200 rows in `df_w_cough_true`, 200 files in the directory actually contain coughs. Of these 200, YAMNet detected coughs in 173 (86.5%) and missed the remaining 27, even though coughs were present. In addition, there were 6 files in which YAMNet reported coughs although none were annotated.

In [49]:
def count_cough_matches(true_intervals, yamnet_intervals):
    """Score one file's detections against its annotated coughs.

    Two intervals overlap when neither ends before the other starts; intervals
    that merely share an endpoint count as overlapping.

    Args:
        true_intervals: iterable of (start, end) pairs for the annotated coughs.
        yamnet_intervals: sequence of (start, end) pairs detected by yamnet.

    Returns:
        (true_positives, false_positives, false_negatives) where
        - true_positives: annotated coughs overlapped by at least one detection,
        - false_negatives: annotated coughs overlapped by no detection,
        - false_positives: detections that overlap no annotated cough.
    """
    matched_detections = set()  # indices into yamnet_intervals
    caught = 0
    n_true = 0
    for true_start, true_end in true_intervals:
        n_true += 1
        overlapping = [
            idx
            for idx, (det_start, det_end) in enumerate(yamnet_intervals)
            if not (true_end < det_start or true_start > det_end)
        ]
        if overlapping:
            caught += 1
            # Mark *every* overlapping detection. Stopping at the first one
            # would leave later detections of the same cough unmarked and
            # they would be miscounted as false positives.
            matched_detections.update(overlapping)
    return caught, len(yamnet_intervals) - len(matched_detections), n_true - caught


def parse_yamnet_intervals(raw_intervals, filename):
    """Turn the CSV string for one file into a Python list of intervals.

    A value that cannot be parsed is reported and treated as "no detections"
    instead of being swallowed silently.
    """
    try:
        return ast.literal_eval(raw_intervals)
    except (ValueError, SyntaxError, TypeError) as err:
        print(f"warning: could not parse yamnet intervals for {filename!r} ({err}); treating as no detections")
        return []


# One lookup per table instead of a boolean-mask scan per file. The first row
# wins when a filename is duplicated, matching a `.values[0]` lookup.
true_by_file = (
    df_w_cough_true.drop_duplicates(subset='filename')
    .set_index('filename')['intervals']
    .to_dict()
)
yamnet_raw_by_file = (
    df_w_cough_yamnet.drop_duplicates(subset='filename')
    .set_index('filename')['intervals']
    .to_dict()
)

total_cough_count = 0
true_positive = 0
false_positive = 0
false_negative = 0
for filename in file_yamnet_and_true_union:
    # A file absent from one table simply has no intervals on that side.
    true_intervals = true_by_file.get(filename, [])
    if filename in yamnet_raw_by_file:
        yamnet_intervals = parse_yamnet_intervals(yamnet_raw_by_file[filename], filename)
    else:
        yamnet_intervals = []

    file_tp, file_fp, file_fn = count_cough_matches(true_intervals, yamnet_intervals)
    total_cough_count += len(true_intervals)
    true_positive += file_tp
    false_positive += file_fp
    false_negative += file_fn

# NOTE: true_positive / false_negative are counted per annotated cough while
# false_positive is counted per yamnet detection. Because all overlapping
# detections are now marked as matched, false_positive can be lower than a
# count obtained by matching each cough to its first overlapping detection only.

In [50]:
# Cough-level totals, in order: annotated coughs, true positives, false positives, false negatives.
total_cough_count, true_positive, false_positive, false_negative

(915, 615, 17, 300)

In [52]:
# Recall = TP / (TP + FN), taken from the live counters so the value cannot go
# stale when the evaluation is re-run; NaN when there are no annotated coughs.
true_positive / (true_positive + false_negative) if (true_positive + false_negative) else float('nan')

0.6721311475409836

$\text{precision} = \frac{\text{True Positive}}{\text{True Positive} + \text{False Positive}} = \frac{615}{615+17} \approx 97.3\%$

$\text{recall} = \frac{\text{True Positive}}{\text{True Positive} + \text{False Negative}} = \frac{615}{615+300} \approx 67.2\%$