### preparing voice data clips for voice unit
* use the cv-corpus-12.0-2022-12-07 English collection (`en/train.tsv`)
* remove every speaker (client_id) whose gender is 'other' OR who has at least one clip with down_votes > 0
* all clips of such a speaker are removed, including the clips that themselves have down_votes = 0
* in addition, for the younger speakers (every age group except 'seventies' and 'eighties'), remove those speakers with fewer than 6 clips recorded
* this reduces the data volume from about 90 GB to about 2 GB, the speakers from 33806 to 1123, and the clips from 986571 to 15006

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the clip metadata of the English train split (tab-separated).
# `age` and `gender` are read as plain strings.
cv_list = pd.read_csv(
    '/Users/yingli/Downloads/cv-corpus-12.0-2022-12-07/en/train.tsv',
    sep='\t',
    dtype={'age': str, 'gender': str},
)

In [3]:
# Number of distinct values in each column of the clip list.
cv_list.nunique(axis=0)

client_id      33806
path          986571
sentence      986571
up_votes          23
down_votes        10
age                9
gender             3
accents            0
locale             1
segment            0
dtype: int64

In [3]:
# Discard the columns that hold at most one distinct value (accents, segment,
# locale), then keep only the clips whose remaining fields are all present.
#
# The frame is rebuilt and re-bound to `cv_list` instead of being mutated in
# place, and `errors="ignore"` skips columns that are already gone, so this
# cell can be re-run without raising KeyError.
cv_list = (
    cv_list
    .drop(columns=["accents", "segment", "locale"], errors="ignore")
    .dropna()
)

In [4]:
# Non-null entries per column after the cleaning step.
cv_list.count(axis=0)

client_id     698658
path          698658
sentence      698658
up_votes      698658
down_votes    698658
age           698658
gender        698658
dtype: int64

In [5]:
# Speakers to exclude: anyone with at least one down-voted clip, or whose
# gender is recorded as 'other'.
flagged_clips = cv_list.query("down_votes > 0 or gender == 'other'")
remove1 = flagged_clips.client_id.drop_duplicates()

# Anti-join: tag every row with its merge origin and keep only the clips whose
# client_id does not appear in `remove1`.
df_merge = cv_list.merge(remove1, how="outer", indicator=True, on="client_id")
is_kept = df_merge["_merge"] == "left_only"
df1 = df_merge.loc[is_kept].drop(columns="_merge")

# Distinct values per (age, gender) group among the remaining clips.
df1.groupby(["age", "gender"]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,client_id,path,sentence,up_votes,down_votes
age,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
eighties,female,2,55,55,2,1
eighties,male,5,34,34,4,1
fifties,female,76,496,496,5,1
fifties,male,161,1015,1015,6,1
fourties,female,100,749,749,7,1
fourties,male,328,2038,2038,5,1
seventies,female,21,190,190,5,1
seventies,male,46,288,288,6,1
sixties,female,57,601,601,5,1
sixties,male,101,676,676,4,1


In [6]:
# (rows, columns) before and after removing the flagged speakers.
(cv_list.shape, df1.shape)

((698658, 7), (22155, 7))

In [7]:
# One row per (client_id, age, gender): vote extremes, number of distinct vote
# values, and the number of clips.
speaker_groups = cv_list.groupby(["client_id", "age", "gender"])

# output column -> (source column, aggregation)
speaker_stats = {
    "up_vote_max": ("up_votes", "max"),
    "up_vote_min": ("up_votes", "min"),
    "down_vote_max": ("down_votes", "max"),
    "down_vote_min": ("down_votes", "min"),
    "num_up_vote": ("up_votes", "nunique"),
    "num_down_vote": ("down_votes", "nunique"),
    "num_clip": ("path", "count"),
}

clients = speaker_groups.agg(**speaker_stats).reset_index()

In [8]:
# Speakers outside the 'seventies' and 'eighties' groups whose `num_clip` in
# `clients` is 5 or less are excluded as well.
sparse_speakers = clients.query("age != 'eighties' and age !='seventies' and num_clip <= 5")
remove2 = sparse_speakers.client_id.drop_duplicates()

# Anti-join against `df1`: keep only the clips whose client_id is not in `remove2`.
df_merge = df1.merge(remove2, how="outer", indicator=True, on="client_id")
is_kept = df_merge["_merge"] == "left_only"
df2 = df_merge.loc[is_kept].drop(columns="_merge")

# Distinct values per (age, gender) group in the final selection.
df2.groupby(["age", "gender"]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,client_id,path,sentence,up_votes,down_votes
age,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
eighties,female,2,55,55,2,1
eighties,male,5,34,34,4,1
fifties,female,32,351,351,4,1
fifties,male,57,647,647,6,1
fourties,female,45,560,560,7,1
fourties,male,100,1286,1286,5,1
seventies,female,21,190,190,5,1
seventies,male,46,288,288,6,1
sixties,female,23,464,464,5,1
sixties,male,38,448,448,4,1


In [9]:
# (rows, columns) of the cleaned list and of the two successive selections.
(cv_list.shape, df1.shape, df2.shape)

((698658, 7), (22155, 7), (15006, 7))

In [10]:
# Rough size of the selected clips in GB, scaled from the share of clips kept.
# NOTE(review): `cv_list` here is the frame left after the missing-value rows
# were dropped, not the full train.tsv listing - confirm that this is the
# intended denominator for a 90 GB total.
whole_collection_gb = 90  # whole collection is about 90 GB in size
kept_fraction = df2.path.nunique() / cv_list.shape[0]
kept_fraction * whole_collection_gb

1.9330487878189329

In [11]:
# Number of distinct clips per remaining speaker, largest first.
clips_per_speaker = (
    df2
    .groupby(["client_id", "age", "gender"])
    .agg(clip_num=("path", "nunique"))
    .reset_index()
)
clips_per_speaker.sort_values(by="clip_num", ascending=False)

Unnamed: 0,client_id,age,gender,clip_num
370,55e8921a6b0968107dcc6bad3e7cd2ab74b27ad0520581...,thirties,male,261
993,e5ec4ff8c5f24fd6cf0431dbc29fbf29ec565ea8c4455e...,thirties,female,134
270,40d9ed81ec9cd287017629cbe8e68a99eacec68020e3a2...,sixties,female,116
306,483ea02dad41028e327f5eb09a79faaa2f38eccad6b6ef...,thirties,male,105
192,2b8b16dd2a58a785a955c39a7d2377a8a7c7c68d17dbf1...,teens,male,97
...,...,...,...,...
972,e1a503f6d1601e5c5cd12426e1efde44a671ed1137d8eb...,seventies,female,1
768,b0fe0110971f06d859b66254cf6e7c73dbe1d484dbc5eb...,seventies,male,1
916,d40ae1a4614a236ffd9ccd55fd0af7eed9974b13756d18...,seventies,male,1
924,d5ed376f34311e655c695f1bbb2eb909375b2a34d3257e...,seventies,female,1


In [12]:
# Highest up-vote count of any clip, per male speaker in their twenties,
# sorted from most to fewest up-votes.
twenties_male = df2.query("age == 'twenties' and gender == 'male'")
max_up_votes = twenties_male.groupby("client_id").up_votes.max()
max_up_votes.sort_values(ascending=False)


client_id
003199b017d556d7b9d5482c09f0889c8ca821a59ad396e3a7402ee9c941a76e4dd868c5a2e7d85ce6c49a45f60c3daaeb88156f10b6cab03ff829b781ad410d    16.0
6817e481dcca14382ea4b7d2b9ca94e9f103aa982eb772ed0b761c6081e9719afed25fef7e349c43737328861084b88a555ef0853fa7bcfd2539c405c463a85e    12.0
8308864c88b6df6ff45d803952f36b1c39b6835d7a71334405f1b340d2db9257edf52bd405807278f6dca161e01c79027982efeb44e62533a551c1b6f3ac8b25    12.0
f933a62114e4da82128c41922ca8a1d6198c4cd1ff391d382d8002da33a65fcca653b55e8cb207b4682e8a3a3525ca239af9b769d54e2f31996ee9f5066b4079    10.0
3b97dd5c9db498daa7f62739105a62eeab8abd29b2765e3055ea9aed55f7bd85a0e7a4a5a61ffd24b32b88445c2fffa311b5a622fcd00166f84101306ec2134f     8.0
                                                                                                                                    ... 
68f1a3b82923bce610e0162d8c8dce1f985eb68d5b5778013117477413f659fb8b78bcad7125d9e9bb4d6d0754d3bc03b7fcaac54435dcf63f977301691ab7ca     2.0
6905fe4758eff3c02593cb6384fe8a9

In [13]:
# Distinct values per column in the final selection.
df2.nunique(axis=0)

client_id      1123
path          15006
sentence      15006
up_votes         11
down_votes        1
age               8
gender            2
dtype: int64

In [15]:
# Summary statistics of the sentence length (in characters) of the kept clips.
sentence_lengths = df2["sentence"].map(len)
sentence_lengths.describe()

count    15006.000000
mean        60.350993
std         19.471146
min          4.000000
25%         46.000000
50%         61.000000
75%         75.000000
max        169.000000
Name: sentence, dtype: float64

In [16]:
# The 20 speakers with the most rows in the final selection
# (non-null count per column, ordered by the `up_votes` count).
rows_per_speaker = df2.groupby("client_id").count().reset_index()
rows_per_speaker.sort_values(by="up_votes", ascending=False).head(20)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender
370,55e8921a6b0968107dcc6bad3e7cd2ab74b27ad0520581...,261,261,261,261,261,261
992,e5ec4ff8c5f24fd6cf0431dbc29fbf29ec565ea8c4455e...,134,134,134,134,134,134
270,40d9ed81ec9cd287017629cbe8e68a99eacec68020e3a2...,116,116,116,116,116,116
306,483ea02dad41028e327f5eb09a79faaa2f38eccad6b6ef...,105,105,105,105,105,105
192,2b8b16dd2a58a785a955c39a7d2377a8a7c7c68d17dbf1...,97,97,97,97,97,97
699,a33afddc9bfab2d5710efcb0a162d6e8e2ed092be3a59e...,94,94,94,94,94,94
128,1a7affcdbc38b0b348af446d34fb2b8796fee59b7346f8...,87,87,87,87,87,87
9,00dd747cffb223ea3095abc625f16e689d74e3c4c4871f...,87,87,87,87,87,87
343,50756554519563411e77bf15f3c85f2e15e134aecf1204...,84,84,84,84,84,84
1045,efcec6355fdb427f75173123b8865bdfa9f3f5949080cc...,67,67,67,67,67,67


In [17]:
# Last look at the distinct counts of the selection before it is exported.
df2.nunique(axis=0)

client_id      1123
path          15006
sentence      15006
up_votes         11
down_votes        1
age               8
gender            2
dtype: int64

In [18]:
# Write the file names of the selected clips as a one-column CSV with a header row.
df2["path"].to_csv("voice_clips_list.csv", header=True, index=False)
# Optional: also export the metadata of the selected clips.
#df2.to_csv("voice_clips_meta_data.csv", index=False)