In [2]:
from pathlib import Path
import re
import pandas as pd

In [3]:
def read_split_list(list_file):
    """Parse a split list file into a DataFrame with one row per listed file.

    Every non-blank line is split on '/', '_' and '.'; the resulting fields are
    stored in the columns folder, person, nohash, person_ind and format, and
    person_ind is converted to an integer.
    """
    with open(list_file, 'r') as file:
        # rstrip('\n') rather than line[:-1]: the final line may lack a trailing
        # newline, in which case slicing would cut off a real character.
        rows = [re.split(r'[/_\.]', line.rstrip('\n')) for line in file if line.strip()]
    frame = pd.DataFrame(rows, columns=['folder', 'person', 'nohash', 'person_ind', 'format'])
    frame['person_ind'] = frame['person_ind'].astype(int)
    return frame


val_list_file = Path('..') / 'data' / 'train' / 'validation_list.txt'
test_list_file = Path('..') / 'data' / 'train' / 'testing_list.txt'

val_df = read_split_list(val_list_file)
test_df = read_split_list(test_list_file)

val_df.shape, test_df.shape

((6798, 5), (6835, 5))

In [4]:
# Fixed seed so the displayed rows are the same on every Restart & Run All.
val_df.sample(5, random_state=0)

Unnamed: 0,folder,person,nohash,person_ind,format
2887,nine,9db2bfe9,nohash,1,wav
4136,right,97ae8b25,nohash,1,wav
1329,five,9db2bfe9,nohash,4,wav
812,down,7eee5973,nohash,0,wav
3334,off,5fadb538,nohash,2,wav


In [5]:
# Number of distinct values in each validation column, smallest count first
val_distinct = val_df.nunique()
val_distinct.sort_values()

nohash          1
format          1
person_ind      7
folder         30
person        189
dtype: int64

In [6]:
# Number of distinct values in each test column, smallest count first
test_distinct = test_df.nunique()
test_distinct.sort_values()

nohash          1
format          1
person_ind      8
folder         30
person        189
dtype: int64

In [7]:
# Root folder of the audio files, relative to the notebook location
root_folder = Path('..') / 'data' / 'train' / 'audio'

lines = []

for path in root_folder.rglob('*'):
    # Skip everything under '_background_noise_' and anything that is not a file.
    if '_background_noise_' not in path.parts and path.is_file():
        # Split the relative path once; the pattern accepts both '\' and '/' separators.
        fields = re.split(r'[\\/_\.]', str(path.relative_to(root_folder)))
        if len(fields) > 5:
            # Surface names that yield more than the five expected fields.
            print(fields)
        lines.append(fields)

full_df = pd.DataFrame(lines, columns=['folder', 'person', 'nohash', 'person_ind', 'format'])
full_df.shape

(64721, 5)

In [8]:
# Convert the person_ind strings produced by re.split into integers
full_df = full_df.astype({'person_ind': int})

In [9]:
# Number of distinct values in each column of the full listing, smallest count first
full_distinct = full_df.nunique()
full_distinct.sort_values()

nohash           1
format           1
person_ind      12
folder          30
person        1881
dtype: int64

In [10]:
# Stack all three frames; keep=False discards every row that occurs more than once,
# so only rows of full_df that appear in neither val_df nor test_df remain.
stacked = pd.concat([full_df, val_df, test_df])
train_df = stacked.drop_duplicates(keep=False)

# The three splits together must account for every row of the full listing.
assert len(train_df) + len(val_df) + len(test_df) == len(full_df)
train_df.shape, val_df.shape, test_df.shape, full_df.shape

((51088, 5), (6798, 5), (6835, 5), (64721, 5))

In [11]:
# Compare as sets: `.unique() == [...]` produces an array, and asserting on it raises
# ValueError (ambiguous truth value) instead of AssertionError once several values exist.
assert set(full_df['nohash']) == {'nohash'}
assert set(full_df['format']) == {'wav'}

In [12]:
# Rebind instead of dropping in place so re-running this cell is harmless:
# errors='ignore' skips columns that a previous run already removed.
single_value_cols = ['nohash', 'format']
train_df = train_df.drop(columns=single_value_cols, errors='ignore')
val_df = val_df.drop(columns=single_value_cols, errors='ignore')
test_df = test_df.drop(columns=single_value_cols, errors='ignore')
full_df = full_df.drop(columns=single_value_cols, errors='ignore')

In [13]:
# Print column dtypes, non-null counts and memory usage of the full listing
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64721 entries, 0 to 64720
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   folder      64721 non-null  object
 1   person      64721 non-null  object
 2   person_ind  64721 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 1.2+ MB


In [14]:
# The validation split must contain exactly the same folders as the test split
# and as the full listing
val_folders = set(val_df['folder'])
assert val_folders == set(test_df['folder'])
assert val_folders == set(full_df['folder'])

In [15]:
# No person id may occur in more than one of the three splits
val_people, test_people, train_people = (
    set(frame['person']) for frame in (val_df, test_df, train_df)
)
assert val_people.isdisjoint(test_people)
assert val_people.isdisjoint(train_people)
assert test_people.isdisjoint(train_people)

# Full investigation

In [16]:
# Print column dtypes, non-null counts and memory usage of the full listing
# (same call as earlier in the notebook, repeated at the start of this section)
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64721 entries, 0 to 64720
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   folder      64721 non-null  object
 1   person      64721 non-null  object
 2   person_ind  64721 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 1.2+ MB


In [17]:
# Number of rows per folder, smallest folder first
rows_per_folder = full_df.groupby('folder')['person'].count()
rows_per_folder.sort_values()

folder
bed       1713
bird      1731
tree      1733
cat       1733
sheila    1734
happy     1742
wow       1745
dog       1746
marvin    1746
house     1750
eight     2352
left      2353
three     2356
five      2357
off       2357
down      2359
nine      2364
on        2367
right     2367
six       2369
one       2370
four      2372
go        2372
two       2373
up        2375
no        2375
zero      2376
seven     2377
yes       2377
stop      2380
Name: person, dtype: int64

In [18]:
# Minimum number of rows a folder needs to count as a "main" folder
MAIN_FOLDER_MIN_ROWS = 2000

# Count rows per folder once and reuse the result for both groups.
rows_per_folder = full_df.groupby(by='folder').count()['person']
# >= together with its negation partitions the folders: a folder with exactly
# MAIN_FOLDER_MIN_ROWS rows is assigned to "main" instead of being left out of both sets.
is_main_folder = rows_per_folder >= MAIN_FOLDER_MIN_ROWS

folders_aux = set(rows_per_folder[~is_main_folder].index)
print(len(folders_aux), folders_aux)
folders_main = set(rows_per_folder[is_main_folder].index)
print(len(folders_main), folders_main)

10 {'happy', 'cat', 'marvin', 'bird', 'house', 'dog', 'wow', 'sheila', 'tree', 'bed'}
20 {'on', 'four', 'yes', 'nine', 'three', 'no', 'stop', 'five', 'right', 'down', 'one', 'eight', 'off', 'zero', 'two', 'up', 'six', 'go', 'left', 'seven'}


In [19]:
# Split the full listing into rows from auxiliary folders and rows from main folders
is_aux_row = full_df['folder'].isin(folders_aux)
is_main_row = full_df['folder'].isin(folders_main)
full_aux_df = full_df.loc[is_aux_row]
full_main_df = full_df.loc[is_main_row]
full_df.shape, full_main_df.shape, full_aux_df.shape

((64721, 3), (47348, 3), (17373, 3))

In [20]:
# Distinct person ids in the full listing, the auxiliary subset and the main subset
tuple(frame['person'].nunique() for frame in (full_df, full_aux_df, full_main_df))

(1881, 1836, 1868)

In [21]:
# Distinct person ids per auxiliary folder
full_aux_df.groupby('folder')['person'].nunique()

folder
bed       1177
bird      1194
cat       1180
dog       1209
happy     1178
house     1165
marvin    1191
sheila    1178
tree      1173
wow       1181
Name: person, dtype: int64

In [22]:
# Distinct person ids per main folder
full_main_df.groupby('folder')['person'].nunique()

folder
down     1206
eight    1179
five     1181
four     1194
go       1182
left     1180
nine     1182
no       1203
off      1170
on       1198
one      1179
right    1194
seven    1192
six      1197
stop     1191
three    1202
two      1175
up       1186
yes      1200
zero     1186
Name: person, dtype: int64

In [23]:
# Summary statistics of person_ind for each subset, one column per subset
person_ind_stats = {
    label: frame['person_ind'].describe()
    for label, frame in (('full', full_df), ('aux', full_aux_df), ('main', full_main_df))
}
pd.DataFrame(person_ind_stats)

Unnamed: 0,full,aux,main
count,64721.0,17373.0,47348.0
mean,0.832697,0.417142,0.985174
std,1.189834,0.695201,1.29276
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,1.0,1.0,2.0
max,11.0,6.0,11.0


In [24]:
# Keep only the training rows whose folder is one of the main folders
in_main_folder = train_df['folder'].isin(folders_main)
train_main_df = train_df.loc[in_main_folder]
train_df.shape, train_main_df.shape

((51088, 3), (37158, 3))

# Build train, val and test file lists without the auxiliary folders

In [3]:
val_list_file = Path('..') / 'data' / 'train' / 'validation_list.txt'
test_list_file = Path('..') / 'data' / 'train' / 'testing_list.txt'


def read_relative_paths(list_file):
    """Return the non-blank lines of a list file as a set of relative path strings.

    Each entry goes through str(Path(...)), which renders it with the separator of
    the current platform. The hard-coded replace('/', '\\') used before only
    matched str(path.relative_to(...)) on Windows.
    """
    with open(list_file, 'r') as file:
        return {str(Path(line.strip())) for line in file if line.strip()}


set_val_all = read_relative_paths(val_list_file)
set_test_all = read_relative_paths(test_list_file)

root_folder = Path('..') / 'data' / 'train' / 'audio'

# Every file below root_folder except those under '_background_noise_',
# stored relative to root_folder in the same string form as the list entries.
set_all = {
    str(path.relative_to(root_folder))
    for path in root_folder.rglob('*')
    if '_background_noise_' not in path.parts and path.is_file()
}

set_train_all = set_all - set_val_all - set_test_all

# If a list entry did not match a file on disk (e.g. separator mismatch), the
# subtraction above would silently remove nothing; this check catches that.
assert len(set_all) == len(set_train_all) + len(set_val_all) + len(set_test_all)

len(set_all), len(set_val_all), len(set_test_all), len(set_train_all)

(64721, 6798, 6835, 51088)

In [4]:
# Names of the 20 main folders, written out explicitly for this section
folders_main = [
    'on', 'four', 'yes', 'nine', 'three',
    'no', 'stop', 'five', 'right', 'down',
    'one', 'eight', 'off', 'zero', 'two',
    'up', 'six', 'go', 'left', 'seven',
]
print(folders_main)

['on', 'four', 'yes', 'nine', 'three', 'no', 'stop', 'five', 'right', 'down', 'one', 'eight', 'off', 'zero', 'two', 'up', 'six', 'go', 'left', 'seven']


In [7]:
set_not_main = set()
# Retained so that any other cell referencing this Series keeps working;
# the loop below no longer needs it.
folders_main_ser = pd.Series(list(folders_main))
main_folder_names = set(folders_main)

for path in root_folder.rglob('*'):
    # A file is "not main" when none of its path components equals a main folder name.
    # A plain set check replaces building a pandas mask for every single path.
    if main_folder_names.isdisjoint(path.parts) and path.is_file():
        set_not_main.add(str(path.relative_to(root_folder)))

# Remove the non-main files from each split.
set_val_main = set_val_all - set_not_main
set_test_main = set_test_all - set_not_main
set_train_main = set_train_all - set_not_main

len(set_train_main), len(set_val_main), len(set_test_main), len(set_not_main), len(set_all)

(37158, 5071, 5119, 17380, 64721)