# Evaluating Yamnet Segmentation Model using data_w_noise_white0.03_nbn10_nbd0.5_w-beeping_w-talking dataset

In [1]:
import pandas as pd
import os
import ast

In [2]:
# Dataset root; "~" is expanded to the current user's home directory.
audio_directory = os.path.expanduser(
    "~/AC297r/CoughAnalyzer/data_w_noise_white0.03_nbn10_nbd0.5_w-beeping_w-talking/"
)

Get the true cough labels for the audio files that contain coughs (the files that have an accompanying `.txt` file).

In [3]:
# Build the ground-truth table: one row per .txt file found under
# audio_directory, holding the file's stem (name without directory or
# extension) and the list of [start, end] pairs read from it.
#
# Rows are collected in a plain list and turned into a DataFrame once, instead
# of enlarging the frame with .loc[i] on every file.
cough_records = []

for root, dirs, files in os.walk(audio_directory):
    for file in files:
        if not file.endswith('.txt'):  # change to '.wav' to analyze the CoughSegmentation dataset
            continue
        label_path = os.path.join(root, file)
        interval_lst = []
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()  # whitespace-separated fields; [] for a blank line
                if not parts:  # skip empty lines
                    continue
                # Only the first two fields are used; both are parsed as floats.
                interval_lst.append([float(parts[0]), float(parts[1])])
        cough_records.append({
            'filename': os.path.splitext(file)[0],
            'intervals': interval_lst,
        })

# Passing `columns` keeps both columns present even when no .txt file was found.
df_w_cough_true = pd.DataFrame(cough_records, columns=['filename', 'intervals'])

In [4]:
# Display the ground-truth table (columns: filename, intervals).
df_w_cough_true

Unnamed: 0,filename,intervals
0,1dd3b212-e969-4ede-a9d9-f24b711e2028,"[[0.782855, 1.01326], [1.198501, 1.388324], [2..."
1,7ae1ffe1-2259-411f-8ead-6c107e01e824,"[[1.520353, 2.373774], [4.211789, 4.555528], [..."
2,a4cc4680-8bb6-4646-b9cf-d77a4e8ada21,"[[1.778078, 2.284206], [2.301887, 2.655514], [..."
3,620ded24-220f-4ada-b032-2b5c170b279a,"[[1.349212, 1.763799], [1.807962, 2.026071], [..."
4,99d322ed-c367-4d45-b6d1-f008b47f1af9,"[[0.54401, 1.868028], [1.868028, 2.378799]]"
...,...,...
195,f5a661dc-8161-4842-b1c3-cd7265896101,"[[0.571248, 1.402015], [1.939479, 2.701144], [..."
196,98fb6294-d339-4c83-8ff5-2bbcf82e35e0,"[[0.759539, 2.415583], [2.473171, 2.875249], [..."
197,6dff10f3-5df0-4b5a-be62-613afd6115dc,"[[3.812107, 4.398915], [5.05404, 5.788743], [5..."
198,25b750e5-8a76-4c13-9fd8-851e45d1b5ed,"[[2.450166, 3.222867], [4.159943, 4.705751], [..."


## Result when we have chosen top3 label cutoff in the yamnet model

The cough intervals detected by YAMNet were generated with `yamnet_evaluation_cough_intevals.py`.

In [6]:
# Load the YAMNet cough intervals for the top-3 cutoff, then reduce every
# entry of `filename` to its stem (no directory, no extension).
df_w_cough_yamnet = pd.read_csv(
    'data_w_noise_white0.03_nbn10_nbd0.5_w-beeping_w-talking_yamnet_cough_intervals_top3.csv'
)
df_w_cough_yamnet['filename'] = df_w_cough_yamnet['filename'].map(
    lambda path: os.path.splitext(os.path.basename(path))[0]
)

In [7]:
# Display the YAMNet prediction table (columns: filename, intervals).
df_w_cough_yamnet

Unnamed: 0,filename,intervals
0,29241394-8a28-46f5-8b62-a062b4564c5c,"[(98.39999999999999, 99.35999999999999)]"
1,09de6967-b295-4516-8a4d-4d95c9a7b02c,"[(2.88, 3.84), (4.32, 5.76), (6.24, 7.2)]"
2,21c16c1c-46fc-4b80-b941-65d7c6e87555,"[(0.48, 7.2)]"
3,0969d0c4-34ce-4e9a-8cf1-1b18403587e8,"[(2.88, 4.32)]"
4,008c1c9e-aeef-40c5-846c-24f1b964f884,"[(1.92, 3.36), (4.8, 5.76)]"
...,...,...
174,de543d13-541c-4ad7-bb3c-c5c302de3aaf,"[(1.92, 3.84), (4.32, 6.239999999999999)]"
175,dddb1a55-d976-40bc-ad41-cdc713623e03,"[(5.279999999999999, 6.72)]"
176,21aee478-6d13-45ea-be4d-4f29fd244798,"[(1.92, 2.88), (3.36, 4.32), (4.8, 6.239999999..."
177,6647b629-2246-48c9-83ad-c3ad4795c891,"[(2.88, 3.84)]"


In [8]:
# Compare, at file level, which stems appear in the YAMNet table (a) and in
# the ground-truth table (b).
file_w_cough_for_yamnet_and_true = list(
    set(df_w_cough_yamnet["filename"]) & set(df_w_cough_true["filename"])
)  # a ∩ b
file_w_cough_for_yamnet_not_true = list(
    set(df_w_cough_yamnet["filename"]).difference(set(df_w_cough_true["filename"]))
)  # a - b
file_w_cough_for_not_yamnet_but_true = list(
    set(df_w_cough_true["filename"]).difference(set(df_w_cough_yamnet["filename"]))
)  # b - a
file_yamnet_and_true_union = list(
    set(df_w_cough_yamnet["filename"]) | set(df_w_cough_true["filename"])
)  # a ∪ b

In [9]:
# Sizes, in order: in both tables, YAMNet only, ground truth only, union.
tuple(
    len(group)
    for group in (
        file_w_cough_for_yamnet_and_true,
        file_w_cough_for_yamnet_not_true,
        file_w_cough_for_not_yamnet_but_true,
        file_yamnet_and_true_union,
    )
)

(173, 6, 27, 206)

Since `df_w_cough_true` has 200 rows, 200 files in the directory actually contain coughs. Of these 200, YAMNet reported coughs in 173 (86.5%). In 6 files YAMNet reported coughs even though the file has no annotated cough. In the remaining 27 files YAMNet detected no cough even though coughs were present.

In [10]:
# Per-cough evaluation of the YAMNet predictions held in df_w_cough_yamnet,
# over every file that appears in either the ground-truth or the prediction table.
#
# Matching rule: an annotated cough counts as a true positive when it overlaps
# at least one predicted interval (touching end points count as overlap);
# otherwise it is a false negative. A predicted interval counts as a false
# positive when no annotated cough selected it as its first overlapping match.
#
# NOTE(review): because the inner loop stops at the first overlapping
# prediction, a second prediction that also overlaps the same annotated cough
# is counted as a false positive. Confirm that this is the intended metric.
total_cough_count = 0
true_positive = 0
false_positive = 0
false_negative = 0
for filename in file_yamnet_and_true_union:
    # Explicit "no row" checks replace the former bare `except:` clauses, so
    # unrelated errors (missing column, typo, Ctrl-C) are no longer swallowed.
    true_rows = df_w_cough_true.loc[df_w_cough_true['filename'] == filename, 'intervals']
    true_intervals = true_rows.iloc[0] if len(true_rows) > 0 else []

    yamnet_rows = df_w_cough_yamnet.loc[df_w_cough_yamnet['filename'] == filename, 'intervals']
    if len(yamnet_rows) > 0:
        try:
            # The CSV stores each interval list as text, e.g. "[(2.88, 3.84)]".
            yamnet_intervals = ast.literal_eval(yamnet_rows.iloc[0])
        except (ValueError, SyntaxError, TypeError):
            # Unparsable cell: treat the file as having no detections.
            yamnet_intervals = []
    else:
        yamnet_intervals = []

    len_true = len(true_intervals)
    len_yamnet = len(yamnet_intervals)
    if len_true == 0 and len_yamnet > 0:  # false positive
        false_positive += len_yamnet
        continue
    if len_true > 0 and len_yamnet == 0:  # false negative
        false_negative += len_true
        total_cough_count += len_true
        continue

    true_positive_yamnet_coughs = set()
    for true_a, true_b in true_intervals:
        total_cough_count += 1
        for yamnet in yamnet_intervals:
            yamnet_a, yamnet_b = yamnet
            if not (true_b < yamnet_a or true_a > yamnet_b):  # check if there's overlap
                true_positive += 1
                true_positive_yamnet_coughs.add(yamnet)
                break
        else:
            # No predicted interval overlapped this annotated cough.
            false_negative += 1

    for yamnet in yamnet_intervals:
        if yamnet not in true_positive_yamnet_coughs:
            false_positive += 1  # predicted interval that no annotated cough selected

In [11]:
# Show the per-cough totals accumulated by the evaluation loop.
total_cough_count, true_positive, false_positive, false_negative

(915, 615, 17, 300)

In [12]:
# Recall for the top-3 cutoff, computed from the counters above rather than
# from hand-copied numbers so the value cannot go stale on a re-run.
true_positive / (true_positive + false_negative)

0.6721311475409836

$\text{precision} = \frac{\text{True Positive}}{\text{True Positive} + \text{False Positive}} = \frac{615}{615+17} \approx 97.3\%$

$\text{recall} = \frac{\text{True Positive}}{\text{True Positive} + \text{False Negative}} = \frac{615}{615+300} \approx 67.2\%$

## Hyperparameter tune the top_n

Because we are using a pretrained model, no training takes place, so a full train/validation/test split is unnecessary. We only need a validation set and a test set: the validation set, a small portion of the data, is used to choose the best value of the hyperparameter `top_n`, and the test set is then used to report the results of the model with that `top_n`.

Split all audio files into a validation set (20%) and a test set (80%).

In [5]:
# Stem (file name without extension) of every .wav file found anywhere under
# audio_directory, in os.walk order.
all_audio_filenames = [
    os.path.splitext(wav_name)[0]
    for _root, _dirs, wav_names in os.walk(audio_directory)
    for wav_name in wav_names
    if wav_name.endswith('.wav')
]

In [6]:
# File-level label, aligned with all_audio_filenames:
# 1 = the stem appears in the ground-truth table (has coughs), 0 = it does not.
all_audio_label = [
    1 if stem in df_w_cough_true['filename'].values else 0
    for stem in all_audio_filenames
]

In [7]:
from sklearn.model_selection import train_test_split

# train_test_split returns (first part, second part); with test_size=0.8 the
# first 20% is used as the validation set and the remaining 80% as the test
# set. The split is stratified on the cough / no-cough label and seeded.
(
    val_audio_filenames,
    test_audio_filenames,
    val_audio_label,
    test_audio_label,
) = train_test_split(
    all_audio_filenames,
    all_audio_label,
    test_size=0.8,
    random_state=42,
    stratify=all_audio_label,
)

Let's see which `top_n` performs best on the validation set.

In [8]:
def _parse_yamnet_intervals(raw):
    """Parse the text form of a prediction list, e.g. "[(0.48, 1.44)]".

    Returns an empty list when the cell cannot be parsed (for example a
    missing value), so that file is treated as having no detections.
    """
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return []


def _count_interval_matches(true_intervals, yamnet_intervals):
    """Return (true_positive, false_positive, false_negative) for one file.

    An annotated cough is a true positive when it overlaps at least one
    predicted interval (touching end points count as overlap), otherwise a
    false negative. A predicted interval is a false positive when no annotated
    cough selected it as its first overlapping match.

    NOTE(review): since matching stops at the first overlapping prediction, a
    second prediction overlapping the same cough is counted as a false
    positive. Confirm that this is the intended metric.
    """
    true_positive = 0
    false_negative = 0
    matched_predictions = set()
    for true_a, true_b in true_intervals:
        for yamnet in yamnet_intervals:
            yamnet_a, yamnet_b = yamnet
            if not (true_b < yamnet_a or true_a > yamnet_b):  # check if there's overlap
                true_positive += 1
                # tuple() keeps the key hashable even if an interval is a list.
                matched_predictions.add(tuple(yamnet))
                break
        else:
            false_negative += 1  # no prediction overlapped this annotated cough
    false_positive = sum(
        1 for yamnet in yamnet_intervals if tuple(yamnet) not in matched_predictions
    )
    return true_positive, false_positive, false_negative


def evaluate_model(audio_filenames, top_n_values=range(1, 11)):
    """Score the YAMNet predictions for several `top_n` cutoffs on a set of files.

    For each `top_n`, reads the matching
    'data_w_noise_..._yamnet_cough_intervals_top{top_n}.csv' file from the
    working directory, restricts both predictions and ground truth
    (the global `df_w_cough_true`) to `audio_filenames`, and counts per-cough
    true positives, false positives and false negatives.

    Parameters
    ----------
    audio_filenames : list-like of str
        File stems (no directory, no extension) to evaluate on.
    top_n_values : iterable of int, optional
        Cutoffs to evaluate; defaults to 1..10.

    Returns
    -------
    pandas.DataFrame
        One row per `top_n` (indexed by `top_n - 1`) with columns top_n,
        total_cough_count, true_positive, false_positive, false_negative,
        precision and recall. Precision / recall are NaN when their
        denominator is zero.
    """
    df_w_cough_true_temp = df_w_cough_true[df_w_cough_true['filename'].isin(audio_filenames)]
    # filename -> intervals; setdefault keeps the first row if a stem repeats.
    true_by_file = {}
    for name, intervals in zip(df_w_cough_true_temp['filename'], df_w_cough_true_temp['intervals']):
        true_by_file.setdefault(name, intervals)

    df_results = pd.DataFrame(columns=['top_n', 'total_cough_count', 'true_positive', 'false_positive', 'false_negative', 'precision', 'recall'])

    for top_n in top_n_values:
        df_w_cough_yamnet = pd.read_csv(f'data_w_noise_white0.03_nbn10_nbd0.5_w-beeping_w-talking_yamnet_cough_intervals_top{top_n}.csv')
        df_w_cough_yamnet['filename'] = df_w_cough_yamnet['filename'].apply(
            lambda x: os.path.splitext(os.path.basename(x))[0]
        )
        df_w_cough_yamnet_temp = df_w_cough_yamnet[df_w_cough_yamnet['filename'].isin(audio_filenames)]

        # filename -> parsed predicted intervals (first row wins, parsed once).
        yamnet_by_file = {}
        for name, raw in zip(df_w_cough_yamnet_temp['filename'], df_w_cough_yamnet_temp['intervals']):
            if name not in yamnet_by_file:
                yamnet_by_file[name] = _parse_yamnet_intervals(raw)

        total_cough_count = 0
        true_positive = 0
        false_positive = 0
        false_negative = 0
        # Every file that has annotations, predictions, or both.
        for filename in set(true_by_file) | set(yamnet_by_file):
            true_intervals = true_by_file.get(filename, [])
            yamnet_intervals = yamnet_by_file.get(filename, [])
            tp, fp, fn = _count_interval_matches(true_intervals, yamnet_intervals)
            total_cough_count += len(true_intervals)
            true_positive += tp
            false_positive += fp
            false_negative += fn

        # Guard against empty selections (no detections / no annotated coughs).
        detected = true_positive + false_positive
        annotated = true_positive + false_negative
        precision = true_positive / detected if detected else float('nan')
        recall = true_positive / annotated if annotated else float('nan')

        df_results.loc[top_n-1] = [top_n, total_cough_count, true_positive, false_positive, false_negative, precision, recall]
    return df_results

In [9]:
# Score every top_n cutoff on the validation files only.
df_val_results = evaluate_model(val_audio_filenames)
df_val_results

Unnamed: 0,top_n,total_cough_count,true_positive,false_positive,false_negative,precision,recall
0,1.0,170.0,86.0,3.0,84.0,0.966292,0.505882
1,2.0,170.0,98.0,5.0,72.0,0.951456,0.576471
2,3.0,170.0,103.0,5.0,67.0,0.953704,0.605882
3,4.0,170.0,104.0,6.0,66.0,0.945455,0.611765
4,5.0,170.0,105.0,8.0,65.0,0.929204,0.617647
5,6.0,170.0,107.0,8.0,63.0,0.930435,0.629412
6,7.0,170.0,110.0,7.0,60.0,0.940171,0.647059
7,8.0,170.0,112.0,9.0,58.0,0.92562,0.658824
8,9.0,170.0,115.0,9.0,55.0,0.927419,0.676471
9,10.0,170.0,117.0,10.0,53.0,0.92126,0.688235


This is subjective, but the best balance between precision and recall seems to be `top_n = 7`.

So now let's see the scores on the test set and on all of the files.

In [10]:
# Cutoff selected from the validation results.
best_top_n = 7

# Score every top_n cutoff on the test files.
df_test_results = evaluate_model(test_audio_filenames)
df_test_results

Unnamed: 0,top_n,total_cough_count,true_positive,false_positive,false_negative,precision,recall
0,1.0,745.0,445.0,6.0,300.0,0.986696,0.597315
1,2.0,745.0,489.0,8.0,256.0,0.983903,0.656376
2,3.0,745.0,512.0,12.0,233.0,0.977099,0.687248
3,4.0,745.0,524.0,15.0,221.0,0.972171,0.703356
4,5.0,745.0,531.0,15.0,214.0,0.972527,0.712752
5,6.0,745.0,543.0,16.0,202.0,0.971377,0.728859
6,7.0,745.0,550.0,18.0,195.0,0.96831,0.738255
7,8.0,745.0,553.0,21.0,192.0,0.963415,0.742282
8,9.0,745.0,563.0,25.0,182.0,0.957483,0.755705
9,10.0,745.0,575.0,27.0,170.0,0.95515,0.771812


In [11]:
# Test-set row for the selected cutoff.
df_test_results.loc[df_test_results['top_n'] == best_top_n]

Unnamed: 0,top_n,total_cough_count,true_positive,false_positive,false_negative,precision,recall
6,7.0,745.0,550.0,18.0,195.0,0.96831,0.738255


In [12]:
# Score every top_n cutoff on all .wav files (validation and test together).
df_all_results = evaluate_model(all_audio_filenames)
df_all_results

Unnamed: 0,top_n,total_cough_count,true_positive,false_positive,false_negative,precision,recall
0,1.0,915.0,531.0,9.0,384.0,0.983333,0.580328
1,2.0,915.0,587.0,13.0,328.0,0.978333,0.64153
2,3.0,915.0,615.0,17.0,300.0,0.973101,0.672131
3,4.0,915.0,628.0,21.0,287.0,0.967643,0.686339
4,5.0,915.0,636.0,23.0,279.0,0.965099,0.695082
5,6.0,915.0,650.0,24.0,265.0,0.964392,0.710383
6,7.0,915.0,660.0,25.0,255.0,0.963504,0.721311
7,8.0,915.0,665.0,30.0,250.0,0.956835,0.726776
8,9.0,915.0,678.0,34.0,237.0,0.952247,0.740984
9,10.0,915.0,692.0,37.0,223.0,0.949246,0.756284


In [13]:
# All-files row for the selected cutoff.
df_all_results.loc[df_all_results['top_n'] == best_top_n]

Unnamed: 0,top_n,total_cough_count,true_positive,false_positive,false_negative,precision,recall
6,7.0,915.0,660.0,25.0,255.0,0.963504,0.721311
