In [80]:
# Basic libraries
import os, sys
import numpy as np
# import scripts.lib as lib

# Speech
import soundfile as sf # pip install pysoundfile
import python_speech_features as speech_lib # pip install python_speech_features

# Machine learning
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
# import tensorflow as tf
from sklearn.metrics import classification_report

import pickle

## Get metadata

In [6]:
speakers_file = '/datasets01/librispeech/062419/SPEAKERS.TXT'

# Load the speaker table into memory: one list entry per line of the file,
# line endings included.
with open(speakers_file) as f:
    content = list(f)

In [7]:
# Build the training metadata table: one row [speaker_id, gender_class] per
# speaker whose subset field starts with "train-clean-".
# gender_class is 0 when the sex field contains 'F' and 1 when it contains 'M'.
train_rows = []
for line in content:
    fields = line.split('|')
    # Match on the subset column only, so text in other columns (e.g. the
    # speaker name) can never select a line by accident.
    if len(fields) < 3 or not fields[2].strip().startswith('train-clean-'):
        continue
    if 'F' in fields[1]:
        gender_class = 0
    elif 'M' in fields[1]:
        gender_class = 1
    else:
        # Unknown sex marker: drop the speaker entirely so that ids and
        # labels can never get out of step.
        continue
    train_rows.append((int(fields[0]), gender_class))

# Single conversion instead of growing arrays with np.append inside the loop.
train_metadata = np.array(train_rows, dtype=int).reshape(-1, 2)
id_speaker = train_metadata[:, 0]
gender_speaker = train_metadata[:, 1]

print(train_metadata.shape)
for row in train_metadata[:10]:
    print("Speaker id : {:5d}, gender class : {:d}".format(row[0], row[1]))

(1172, 2)
Speaker id :    14, gender class : 0
Speaker id :    16, gender class : 0
Speaker id :    17, gender class : 1
Speaker id :    19, gender class : 0
Speaker id :    22, gender class : 0
Speaker id :    23, gender class : 0
Speaker id :    26, gender class : 1
Speaker id :    27, gender class : 1
Speaker id :    28, gender class : 0
Speaker id :    30, gender class : 0


In [8]:
# Build the evaluation metadata table: one row [speaker_id, gender_class] per
# speaker whose subset field starts with "dev-" or "test-".
# gender_class is 0 when the sex field contains 'F' and 1 when it contains 'M'.
test_rows = []
for line in content:
    fields = line.split('|')
    # Match on the subset column only, so text in other columns (e.g. the
    # speaker name) can never select a line by accident.
    if len(fields) < 3 or not fields[2].strip().startswith(('dev-', 'test-')):
        continue
    if 'F' in fields[1]:
        gender_class = 0
    elif 'M' in fields[1]:
        gender_class = 1
    else:
        # Unknown sex marker: drop the speaker entirely so that ids and
        # labels can never get out of step.
        continue
    test_rows.append((int(fields[0]), gender_class))

# Single conversion instead of growing arrays with np.append inside the loop.
test_metadata = np.array(test_rows, dtype=int).reshape(-1, 2)
id_speaker = test_metadata[:, 0]
gender_speaker = test_metadata[:, 1]

print(test_metadata.shape)
for row in test_metadata[:10]:
    print("Speaker id : {:5d}, gender class : {:d}".format(row[0], row[1]))

(146, 2)
Speaker id :    61, gender class : 1
Speaker id :    84, gender class : 0
Speaker id :   116, gender class : 1
Speaker id :   121, gender class : 0
Speaker id :   174, gender class : 1
Speaker id :   237, gender class : 0
Speaker id :   251, gender class : 1
Speaker id :   260, gender class : 1
Speaker id :   367, gender class : 0
Speaker id :   422, gender class : 1


In [9]:
# Column sums: [sum of speaker ids, number of speakers with gender class 1].
# Vectorised reduction instead of the Python builtin iterating row by row.
test_metadata.sum(axis=0)

array([614269,     73])

## Load train features

In [44]:
# Root directory under which get_folder() looks for the per-subset speaker folders.
data_root = '/datasets01/librispeech/062419/'

# Number of features per frame requested from the extractor.
N_FEAT = 40
# Feature type selector: 'mfsc' (log filterbank) or 'mfcc'.
TYPE_FEAT = 'mfsc'

def get_folder(speaker_id):
    """Locate the directory that holds a speaker's recordings.

    The known subset folders under ``data_root`` are probed in a fixed order
    and the first existing ``<data_root>/<subset>/<speaker_id>`` path is
    returned.  When no subset contains the speaker, ``None`` is returned.

    speaker_id -- speaker identifier as a string (used as a path component).
    """
    subsets = (
        'train-other-500',
        'train-clean-360',
        'train-clean-100',
        'dev-clean',
        'dev-other',
        'test-clean',
        'test-other',
    )
    candidates = (os.path.join(data_root, subset, speaker_id) for subset in subsets)
    return next((path for path in candidates if os.path.exists(path)), None)

def create_dataset(metadata, feat_type='mfsc', nfeat=13):
    """Create a dataset from metadata with format [speaker_id, speaker_gender].

    For every ``.flac`` file found under each speaker's folder, the chosen
    features are extracted and averaged over frames (axis 0), giving one
    ``nfeat``-dimensional row per file.

    Parameters:
        metadata  -- iterable of (speaker_id, gender) pairs.
        feat_type -- 'mfcc' or 'mfsc' (log filterbank energies).
        nfeat     -- number of cepstra / filters requested from the extractor.

    Returns:
        (dataset, gender_vector) with shapes (n_files, nfeat) and (n_files, 1);
        gender_vector repeats the speaker's gender for each of their files.

    Raises:
        ValueError        -- if feat_type is not 'mfcc' or 'mfsc'.
        FileNotFoundError -- if no folder is found for a speaker.
    """
    # Validate once, up front, instead of calling exit() from inside the file
    # loop (which would tear down the interpreter/kernel).
    if feat_type not in ('mfcc', 'mfsc'):
        raise ValueError("Unknown feature type: {!r}".format(feat_type))

    # Rows are collected in lists and converted once at the end; growing the
    # arrays with np.append per file copies the whole dataset every time.
    feature_rows = []
    labels = []
    for speaker_id, gender in metadata:
        speaker_folder = get_folder(str(int(speaker_id)))
        if speaker_folder is None:
            raise FileNotFoundError(
                "No folder found for speaker {}".format(int(speaker_id)))
        for root, _dirs, files in os.walk(speaker_folder):
            for name in files:
                if not name.endswith(".flac"):
                    continue
                filepath = os.path.join(root, name)
                with open(filepath, 'rb') as f:
                    signal, samplerate = sf.read(f)

                if feat_type == 'mfcc':
                    feat = speech_lib.mfcc(signal,samplerate,winlen=0.060,winstep=0.03,numcep=nfeat,nfilt=nfeat,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=False)
                else:
                    feat = speech_lib.logfbank(signal, samplerate, nfilt=nfeat)

                feature_rows.append(np.mean(feat, axis=0))
                labels.append(gender)

    # Explicit target shapes keep the (0, nfeat) / (0, 1) result for an empty
    # dataset and fail loudly if a row does not have exactly nfeat values.
    dataset = np.array(feature_rows, dtype=float).reshape(len(feature_rows), nfeat)
    gender_vector = np.array(labels, dtype=float).reshape(len(labels), 1)
    return dataset, gender_vector

In [11]:
# Extract one mean-feature row (and its gender label) per training utterance.
x_mfsc, y_mfsc = create_dataset(
    train_metadata,
    feat_type=TYPE_FEAT,
    nfeat=N_FEAT,
)

In [13]:
# Feature matrix shape, then label vector shape, one per line.
print(x_mfsc.shape, y_mfsc.shape, sep='\n')

(132553, 40)
(132553, 1)


In [45]:
def normalize_data(train_data):
    """Standardise each column of ``train_data`` to mean 0 and variance 1.

    Parameters:
        train_data -- array of shape (n_samples, n_features).

    Returns:
        (normalized_train_data, means, stds).  ``means`` and ``stds`` are the
        per-column statistics; apply ``(x - means) / stds`` to other data
        (e.g. test data) to give it the same treatment.

    Columns with zero standard deviation (constant features) are given a
    scale of 1 in the returned ``stds``, so they normalise to 0 instead of
    producing NaN/inf through a division by zero.
    """
    means = np.mean(train_data, axis=0)
    stds = np.std(train_data, axis=0)
    # Constant column: dividing by 1 leaves the centred value (0) untouched.
    stds = np.where(stds == 0, 1.0, stds)

    normalized_train_data = (train_data - means) / stds
    return normalized_train_data, means, stds

In [18]:
from multiprocessing import Pool

In [76]:
def get_pred(filepath):
    """Predict the class of a single audio file with the global ``clf``.

    The file's features (selected by the global ``TYPE_FEAT``, ``N_FEAT``
    values per frame) are averaged over frames, normalised with the global
    ``means`` / ``stds`` and cached in ``feat_dic[TYPE_FEAT]`` under the file
    path.  The feature settings mirror those of ``create_dataset`` so that
    prediction uses the same features as training.

    Returns whatever ``clf.predict`` returns for the single feature row.

    Raises ValueError if ``TYPE_FEAT`` is neither 'mfcc' nor 'mfsc'.

    NOTE(review): when this runs inside a multiprocessing worker (as
    ``test_classifier`` does via ``Pool.map``), cache entries written here
    presumably stay in the worker process and never reach the parent's
    ``feat_dic`` -- confirm before relying on the cache.
    """
    cache = feat_dic[TYPE_FEAT]
    if filepath in cache:
        x = cache[filepath]
    else:
        with open(filepath, 'rb') as f:
            signal, samplerate = sf.read(f)

        # Compute the feature type the cache entry is stored under; always
        # using logfbank would silently mislabel features for 'mfcc'.
        if TYPE_FEAT == 'mfcc':
            feat = speech_lib.mfcc(signal,samplerate,winlen=0.060,winstep=0.03,numcep=N_FEAT,nfilt=N_FEAT,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=False)
        elif TYPE_FEAT == 'mfsc':
            feat = speech_lib.logfbank(signal, samplerate, nfilt=N_FEAT)
        else:
            raise ValueError("Unknown feature type: {!r}".format(TYPE_FEAT))

        # Mean over frames -> a single (1, n_features) row, then the same
        # normalisation that was fitted on the training data.
        x = np.expand_dims(np.mean(feat, axis=0), axis=1).T
        x = (x - means) / stds
        cache[filepath] = x
    return clf.predict(x)

def _list_flac_files(folder):
    """Return the paths of all ``.flac`` files found recursively under ``folder``."""
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(folder)
        for name in files
        if name.endswith(".flac")
    ]


def test_classifier(metadata):
    """Evaluate the classifier at speaker level and report the accuracy.

    For each (speaker_id, gender) pair in ``metadata`` every ``.flac`` file of
    the speaker is classified with ``get_pred``; the rounded mean of the
    per-file predictions is the speaker's prediction (an exact 0.5 tie rounds
    to 0).  One line ``speaker_id prediction gender`` is printed per speaker,
    followed by the overall accuracy.

    Speakers without a folder or without any ``.flac`` file are reported and
    skipped rather than contributing a NaN prediction.

    Returns the accuracy (fraction of speakers predicted correctly), or
    ``None`` if no speaker could be evaluated.
    """
    # Resolve the file lists first so that no worker pool is started when
    # there is nothing to evaluate.
    speakers = []
    for speaker_id, gender in metadata:
        speaker_folder = get_folder(str(int(speaker_id)))
        filepaths = _list_flac_files(speaker_folder) if speaker_folder is not None else []
        if not filepaths:
            print("Skipping speaker {}: no .flac files found".format(int(speaker_id)))
            continue
        speakers.append((speaker_id, gender, filepaths))

    if not speakers:
        print("No speakers to evaluate")
        return None

    results = []
    # A single pool is shared by all speakers (instead of 20 new processes per
    # speaker); the context manager tears it down even if a worker raises.
    with Pool(20) as pool:
        for speaker_id, gender, filepaths in speakers:
            preds = pool.map(get_pred, filepaths)
            pred = round(np.mean(preds))
            print(speaker_id, pred, gender)
            results.append(pred == gender)

    accuracy = sum(results) / len(results)
    print(accuracy)
    return accuracy
    
    
    

In [49]:
# Standardise the training features and keep the statistics around: get_pred
# applies the same means / stds to every file it classifies.
x, means, stds = normalize_data(x_mfsc)

# Per-feature-type cache of normalised feature rows, keyed by file path.
feat_dic = dict(mfsc={}, mfcc={})

# RBF-kernel SVM trained on the standardised features; labels are squeezed
# from a column vector to a flat vector.
clf = svm.SVC(kernel='rbf')
clf.fit(x, np.squeeze(y_mfsc))

SyntaxError: invalid syntax (<ipython-input-49-acc019c355c8>, line 1)

In [78]:
# Speaker-level evaluation on the dev/test speakers.
test_classifier(metadata=test_metadata)

61 1.0 1
84 0.0 0
116 1.0 1
121 0.0 0
174 0.0 1
237 0.0 0
251 1.0 1
260 1.0 1
367 0.0 0
422 1.0 1
533 0.0 0
652 1.0 1
672 1.0 1
700 0.0 0
777 1.0 1
908 1.0 1
1089 1.0 1
1188 1.0 1
1221 0.0 0
1255 1.0 1
1272 1.0 1
1284 0.0 0
1320 1.0 1
1462 0.0 0
1580 0.0 0
1585 0.0 0
1630 0.0 0
1650 1.0 1
1651 1.0 1
1673 0.0 0
1686 0.0 0
1688 0.0 1
1701 1.0 1
1919 0.0 0
1988 0.0 0
1993 0.0 0
1995 0.0 0
1998 0.0 0
2033 1.0 1
2035 0.0 0
2078 1.0 1
2086 1.0 1
2094 0.0 0
2277 0.0 0
2300 1.0 1
2412 0.0 0
2414 1.0 1
2428 1.0 1
2506 0.0 0
2609 1.0 1
2803 1.0 1
2830 1.0 1
2902 1.0 1
2961 0.0 0
3000 1.0 1
3005 1.0 1
3080 0.0 0
3081 0.0 0
3170 1.0 1
3331 1.0 0
3528 1.0 0
3536 0.0 0
3538 0.0 0
3570 0.0 0
3575 0.0 0
3576 0.0 0
3660 1.0 1
3663 0.0 0
3729 0.0 0
3752 1.0 1
3764 0.0 0
3853 0.0 0
3915 0.0 0
3997 1.0 0
4077 1.0 1
4153 0.0 0
4198 1.0 1
4294 0.0 0
4323 0.0 0
4350 1.0 1
4446 0.0 0
4507 0.0 0
4515 1.0 1
4570 1.0 1
4572 1.0 1
4831 0.0 0
4852 1.0 1
4970 0.0 0
4992 0.0 0
5105 1.0 1
5142 0.0 0
5338 0.0 0
5442 0

In [81]:
# Persist the trained classifier.  The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# NOTE(review): only `clf` is stored; the `means` / `stds` that get_pred uses
# to normalise inputs are not part of this file.
with open('/private/home/qiantong/gender_classifier/svm_mfsc40_acc95.bin', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [82]:
# Scratch check of set.pop(): it removes one element from the set and
# returns it.
a = {1, 2, 3, 4}
b = a.pop()
print(a, b, sep='\n')

{2, 3, 4}
1


In [88]:
# Print the imported `svm` module object (its repr shows where it was loaded from).
print(svm)

<module 'sklearn.svm' from '/private/home/qiantong/.local/lib/python3.6/site-packages/sklearn/svm/__init__.py'>
