# Data preparation

## Remove duplicates

In [16]:
import importlib
import sys
# The first star-import guarantees that 'utils' is loaded and registered in sys.modules.
from utils import *
# Re-execute the already-loaded module so that edits made to it since the first import take effect.
importlib.reload(sys.modules['utils'])
# Star-import again so the notebook's names are rebound to the reloaded definitions.
from utils import *

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

In [5]:
# Load the raw moderation table; every later cell works on (and rebinds) `df`.
df = pd.read_csv(filepath_or_buffer="../csv/moderation_audio.csv")

In [7]:
# Counts rows that repeat an earlier row in *every* column (no subset given).
# Note: the de-duplication that follows keys on `media_url` only, so it may
# remove more rows than the number reported here.
print("Number of duplicates:", df.duplicated().sum())

Number of duplicates: 37


In [8]:
# Keep the first row for each media_url. Rebinding `df` instead of mutating it
# with inplace=True avoids in-place side effects on the loaded frame; the
# resulting rows are the same.
df = df.drop_duplicates(subset="media_url")

In [9]:
# Persist the URL-deduplicated table (index column omitted).
df.to_csv(path_or_buf="../csv/filtered_audio.csv", index=False)

## Download dataset

In [12]:
# Create the working directories next to the notebook's parent folder;
# exist_ok=True makes the cell safe to re-run.
for out_dir in ("../dataset", "../wavs", "../wav2vec", "../vgg"):
    Path(out_dir).mkdir(exist_ok=True)

In [None]:
# Hand every remaining media URL to the utils helper, targeting ../dataset.
download_original_dataset(df.media_url.tolist(), '../dataset')

## Convert to 16KHz wav

In [14]:
from tqdm import tqdm

In [None]:
# For each URL, derive the .mp4 path under ../dataset and the .wav path under
# ../wavs, then convert one into the other. The target sample rate is decided
# inside convert_mp4_to_wav (the section heading says 16 kHz — not visible here).
for url in tqdm(df.media_url.tolist()):
    convert_mp4_to_wav(
        url_to_path(url, "../dataset", "mp4"),
        url_to_path(url, "../wavs", "wav"),
    )

## Remove empty files and duplicated md5

In [17]:
# Per-file metadata, all lists aligned with the row order of `df`:
#   paths     - wav location derived from each URL
#   md5_hashs - result of the utils md5() helper for each wav path
#   not_zero  - True when the opened wav contains at least one non-zero value
urls = df['media_url'].tolist()
paths = [url_to_path(media_url, "../wavs", "wav") for media_url in urls]
md5_hashs = list(map(md5, paths))
not_zero = [np.any(open_wav(wav_path)) for wav_path in paths]

In [19]:
# Attach the per-file lists as columns (same order: path, md5, not_zero).
df = df.assign(path=paths, md5=md5_hashs, not_zero=not_zero)

In [23]:
# Summing a boolean mask counts its True entries.
# First line: rows whose md5 repeats an earlier row. Second: rows flagged as all-zero audio.
print(f"Duplilated files on md5: {df.duplicated('md5').sum()}")
print(f"Empty files: {df['not_zero'].eq(False).sum()}")

Duplilated files on md5: 167
Empty files: 189


In [25]:
# Drop all-zero recordings first, then keep one row per distinct md5.
df = df.loc[df['not_zero']].drop_duplicates(subset="md5")

In [26]:
# Overwrite the filtered table, now including the path / md5 / not_zero columns.
df.to_csv(path_or_buf="../csv/filtered_audio.csv", index=False)

## Check correct filtering

In [28]:
# Reload both tables from disk so the comparison reflects what was actually saved.
df = pd.read_csv("../csv/filtered_audio.csv")
df_original = pd.read_csv("../csv/moderation_audio.csv")

# Distinct URLs whose tango_decision is "BAD", after and before filtering.
# A set has no order, so sorting before building it was wasted work, and
# sorted() could raise TypeError if the column mixed strings with NaN.
df_bad_urls = set(df[df['tango_decision'] == "BAD"].media_url.tolist())
original_bad_urls = set(
    df_original[df_original['tango_decision'] == "BAD"].media_url.tolist()
)

In [29]:
# Equal counts mean no BAD-labelled URL was lost by the filtering steps.
print(
    f"Len bad files after filtering: {len(df_bad_urls)}",
    f"Len bad original files: {len(original_bad_urls)}",
    sep="\n",
)

Len bad files after filtering: 634
Len bad original files: 634


## Incorrect files

In [30]:
# http://XXX/NEaAY2ic - labeled BAD, but the audio is just music
# http://XXX/jDsHpMuM - labeled BAD, but the file is too large