## librosa

In [33]:
import librosa
import librosa.display as display

import pandas as pd 
import numpy as np 
import scipy as sp 
import matplotlib.pyplot as plt 
%matplotlib inline

import os 
import time

In [34]:
def read_captchas(path):
    wavs = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file)) and file.endswith('.wav')]
    captchas = []
    for wav in wavs:
        signal, sampling_rate = librosa.load(os.path.join(path,wav), None)
        label = wav.split('.wav')[0]
        captchas.append((signal,sampling_rate,label))
    return captchas

In [35]:
def split_characters(captchas):
    chars = [] 
    for captcha in  captchas:
        signal = captcha[0]
        sampling_rate=captcha[1]
        label=captcha[2]
        chars += [(signal[sampling_rate*2*i:sampling_rate*2*(i+1)],sampling_rate,label[i]) for i in range(4)]
    return chars

In [36]:
def  extract_features (chars):
    data = pd.DataFrame()
    for j, char in enumerate(chars):
        signal = char[0]
        sampling_rate=char[1]
        label=char[2]
        row = pd.DataFrame()
        row['label'] = [label]
#         row['signal_mode'] = [np.mean(sp.stats.mode(signal))]
#         row['signal_min'] = [np.min(signal)]
#         row['signal_max'] = [np.max(signal)]
#         row['signal_mean'] = [np.mean(signal)]
#         row['signal_std'] = [np.std(signal)]
#         row['signal_median'] = [np.median(signal)]
#         row['signal_iqr'] = [sp.stats.iqr(signal)]
#         row['signal_kutosis'] = [sp.stats.kurtosis(signal)]
#         row['signal_skewness'] = [sp.stats.skew(signal)]   
        mfcc = librosa.feature.mfcc(signal,sampling_rate)
        for i,mfcc in enumerate(mfcc):
            row['mfcc_'+str(i)+'_mode'] = [np.mean(sp.stats.mode(mfcc))]
            row['mfcc_'+str(i)+'_min'] = [np.min(mfcc)]
            row['mfcc_'+str(i)+'_max'] = [np.max(mfcc)]
            row['mfcc_'+str(i)+'_mean'] = [np.mean(mfcc)]
            row['mfcc_'+str(i)+'_std'] = [np.std(mfcc)]
            row['mfcc_'+str(i)+'_median'] = [np.median(mfcc)]
            row['mfcc_'+str(i)+'_iqr'] = [sp.stats.iqr(mfcc)]
            row['mfcc_'+str(i)+'_kutosis'] = [sp.stats.kurtosis(mfcc)]
            row['mfcc_'+str(i)+'_skewness'] = [sp.stats.skew(mfcc)]
        data = data.append(row)
    return data.reset_index(drop=True)

In [37]:
start = time.time()
train_captchas = read_captchas('./fase_1_corrigida/base_treinamento_I')
print('read captchas: ', len(train_captchas))
train_chars = split_characters(train_captchas)
print('splited chars: ', len(train_chars))
train_data = extract_features(train_chars)
end = time.time()
print(end-start)

print('final dataframe: ', train_data.shape)
train_data.to_csv('train_data_librosa.csv',index=False)

read captchas:  200
splited chars:  800
153.75114607810974
final dataframe:  (800, 181)


In [38]:
start = time.time()
valid_captchas  = read_captchas('./fase_1_corrigida/base_validacao_I')
print('read captchas: ', len(valid_captchas))
valid_chars = split_characters(valid_captchas)
print('splited chars: ', len(valid_chars))
valid_data = extract_features(valid_chars)
end = time.time()
print(end-start)

print('final dataframe: ', valid_data.shape)
valid_data.to_csv('valid_data_librosa.csv',index=False)

read captchas:  147
splited chars:  588
122.09537196159363
final dataframe:  (588, 181)


## python_speech_features

In [39]:
import python_speech_features as psf
from python_speech_features import mfcc,fbank
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

In [40]:
def read_captchas_psf(path):
    files_name = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file)) and file.endswith('.wav')]
    captchas = []
    for audio in files_name:
        sampling_rate, signal = wav.read(path+audio)
        label = audio.split('.wav')[0]
        captchas.append((signal,sampling_rate,label))
    return captchas

### mfcc_psf

In [41]:
def extract_features_mfcc_psf (chars):
    data = pd.DataFrame()
    for j, char in enumerate(chars):
        signal = char[0]
        sampling_rate=char[1]
        label=char[2]
        row = pd.DataFrame()
        row['label'] = [label]
        mfcc_psf = mfcc(signal,sampling_rate)
        for i,mfcc_psf in enumerate(mfcc_psf):
            row['mfcc_'+str(i)+'_mode'] = [np.mean(sp.stats.mode(mfcc_psf))]
            row['mfcc_'+str(i)+'_min'] = [np.min(mfcc_psf)]
            row['mfcc_'+str(i)+'_max'] = [np.max(mfcc_psf)]
            row['mfcc_'+str(i)+'_mean'] = [np.mean(mfcc_psf)]
            row['mfcc_'+str(i)+'_std'] = [np.std(mfcc_psf)]
            row['mfcc_'+str(i)+'_median'] = [np.median(mfcc_psf)]
            row['mfcc_'+str(i)+'_iqr'] = [sp.stats.iqr(mfcc_psf)]
            row['mfcc_'+str(i)+'_kutosis'] = [sp.stats.kurtosis(mfcc_psf)]
            row['mfcc_'+str(i)+'_skewness'] = [sp.stats.skew(mfcc_psf)]
        data = data.append(row)
    return data.reset_index(drop=True)

In [42]:
start = time.time()
train_captchas_psf = read_captchas_psf('./fase_1_corrigida/base_treinamento_I/')
print('read captchas: ', len(train_captchas_psf))
train_chars_psf = split_characters(train_captchas_psf)
print('splited chars: ', len(train_chars_psf))
train_data_mfcc_psf = extract_features_mfcc_psf(train_chars)
end = time.time()
print(end-start)

print('final dataframe: ', train_data_mfcc_psf.shape)
train_data_mfcc_psf.to_csv('train_data_mfcc_psf.csv',index=False)



read captchas:  200
splited chars:  800


























2743.9707467556
final dataframe:  (800, 1792)


In [44]:
start = time.time()
valid_captchas_psf  = read_captchas_psf('./fase_1_corrigida/base_validacao_I/')
print('read captchas: ', len(valid_captchas_psf))
valid_chars_psf = split_characters(valid_captchas_psf)
print('splited chars: ', len(valid_chars_psf))
valid_data_mfcc_psf = extract_features_mfcc_psf(valid_chars_psf)
end = time.time()
print(end-start)

print('final dataframe: ', valid_data_mfcc_psf.shape)
valid_data_mfcc_psf.to_csv('valid_data_mfcc_psf.csv',index=False)



read captchas:  147
splited chars:  588




















653.7976539134979
final dataframe:  (588, 1792)


### delta_mfcc_psf

In [45]:
def extract_features_delta_mfcc_psf (chars):
    data = pd.DataFrame()
    for j, char in enumerate(chars):
        signal = char[0]
        sampling_rate=char[1]
        label=char[2]
        row = pd.DataFrame()
        row['label'] = [label]
        mfcc_psf = mfcc(signal,sampling_rate)
        mfcc_psf_delta = delta(mfcc_psf, 2)
        for i,mfcc_psf_delta in enumerate(mfcc_psf_delta):
            row['mfcc_'+str(i)+'_mode'] = [np.mean(sp.stats.mode(mfcc_psf_delta))]
            row['mfcc_'+str(i)+'_min'] = [np.min(mfcc_psf_delta)]
            row['mfcc_'+str(i)+'_max'] = [np.max(mfcc_psf_delta)]
            row['mfcc_'+str(i)+'_mean'] = [np.mean(mfcc_psf_delta)]
            row['mfcc_'+str(i)+'_std'] = [np.std(mfcc_psf_delta)]
            row['mfcc_'+str(i)+'_median'] = [np.median(mfcc_psf_delta)]
            row['mfcc_'+str(i)+'_iqr'] = [sp.stats.iqr(mfcc_psf_delta)]
            row['mfcc_'+str(i)+'_kutosis'] = [sp.stats.kurtosis(mfcc_psf_delta)]
            row['mfcc_'+str(i)+'_skewness'] = [sp.stats.skew(mfcc_psf_delta)]
        data = data.append(row)
    return data.reset_index(drop=True)

In [46]:
start = time.time()
train_captchas_psf = read_captchas_psf('./fase_1_corrigida/base_treinamento_I/')
print('read captchas: ', len(train_captchas_psf))
train_chars_psf = split_characters(train_captchas_psf)
print('splited chars: ', len(train_chars_psf))
train_data_delta_mfcc_psf = extract_features_delta_mfcc_psf(train_chars)
end = time.time()
print(end-start)

print('final dataframe: ', train_data_delta_mfcc_psf.shape)
train_data_delta_mfcc_psf.to_csv('train_data_delta_mfcc_psf.csv',index=False)



read captchas:  200
splited chars:  800


























1176.8677802085876
final dataframe:  (800, 1792)


In [47]:
start = time.time()
valid_captchas_psf  = read_captchas_psf('./fase_1_corrigida/base_validacao_I/')
print('read captchas: ', len(valid_captchas_psf))
valid_chars_psf = split_characters(valid_captchas_psf)
print('splited chars: ', len(valid_chars_psf))
valid_data_delta_mfcc_psf = extract_features_delta_mfcc_psf(valid_chars_psf)
end = time.time()
print(end-start)

print('final dataframe: ', valid_data_delta_mfcc_psf.shape)
valid_data_delta_mfcc_psf.to_csv('valid_data_delta_mfcc_psf.csv',index=False)



read captchas:  147
splited chars:  588




















90539.664031744
final dataframe:  (588, 1792)


## Model Selection librosa

In [48]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, chi2, SelectKBest, SelectPercentile
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [49]:
train_data = pd.read_csv('train_data_librosa.csv')
valid_data = pd.read_csv('valid_data_librosa.csv')

X_train = train_data.drop(['label'],axis=1)
y_train = train_data['label']

X_valid = valid_data.drop(['label'],axis=1)
y_valid = valid_data['label']

In [50]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
X_valid = pd.DataFrame(scaler.transform(X_valid),columns=X_valid.columns)

In [54]:
estimators  = [GaussianNB(),KNeighborsClassifier(),LogisticRegression(),
              DecisionTreeClassifier(),RandomForestClassifier(),GradientBoostingClassifier()]

In [55]:
param_grids = [{},
               {'n_neighbors':[2,5,7,10,12,15], 'weights':['uniform','distance'],'p':[1,2]},
               {'C':np.logspace(-4,4,9),'penalty':['l1','l2'],'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,15,10),'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,15,10),'n_estimators':[50,100],'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,10,9),'n_estimators':[50,100]}
         ]

In [56]:
X = np.concatenate((X_train,X_valid),axis=0)
y = np.concatenate((y_train,y_valid),axis=0)
test_fold = []
for i in range(len(X_train)):
    test_fold.append(-1)
for i in range(len(X_valid)):
    test_fold.append(0)

cv = PredefinedSplit(test_fold=test_fold)

In [57]:
best_estimator = None
best_score = 0
start=time.time()
print('estimator', 'score' )
for estimator,param_grid in zip(estimators,param_grids):
    gridsearch = GridSearchCV(estimator,param_grid,scoring='accuracy',cv=cv)
    gridsearch.fit(X,y)
    print(str(estimator).split('(')[0], gridsearch.best_score_)
    if gridsearch.best_score_ > best_score:
        best_score = gridsearch.best_score_
        best_estimator = gridsearch.best_estimator_
end=time.time()
print('best: ', str(best_estimator).split('(')[0], best_score)
print('minutes elapsed: ',(end-start)/60)

estimator score
GaussianNB 0.54421768707483
KNeighborsClassifier 0.7704081632653061
LogisticRegression 0.7891156462585034
DecisionTreeClassifier 0.5408163265306123
RandomForestClassifier 0.7738095238095238
GradientBoostingClassifier 0.7108843537414966
best:  LogisticRegression 0.7891156462585034
minutes elapsed:  7.341288916269938


## Model selection python_speech_features mfcc

In [71]:
train_data = pd.read_csv('train_data_mfcc_psf.csv')
valid_data = pd.read_csv('valid_data_mfcc_psf.csv')

X_train = train_data.drop(['label'],axis=1)
y_train = train_data['label']

X_valid = valid_data.drop(['label'],axis=1)
y_valid = valid_data['label']

In [77]:
X_train = X_train.fillna(0)
X_valid = X_valid.fillna(0)

In [79]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
X_valid = pd.DataFrame(scaler.transform(X_valid),columns=X_valid.columns)

In [80]:
estimators  = [GaussianNB(),KNeighborsClassifier(),LogisticRegression(),
              DecisionTreeClassifier(),RandomForestClassifier(),GradientBoostingClassifier()]

In [81]:
param_grids = [{},
               {'n_neighbors':[2,5,7,10,12,15], 'weights':['uniform','distance'],'p':[1,2]},
               {'C':np.logspace(-4,4,9),'penalty':['l1','l2'],'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,15,10),'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,15,10),'n_estimators':[50,100],'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,10,9),'n_estimators':[50,100]}
         ]

In [82]:
X = np.concatenate((X_train,X_valid),axis=0)
y = np.concatenate((y_train,y_valid),axis=0)
test_fold = []
for i in range(len(X_train)):
    test_fold.append(-1)
for i in range(len(X_valid)):
    test_fold.append(0)

cv = PredefinedSplit(test_fold=test_fold)

In [83]:
best_estimator = None
best_score = 0
start=time.time()
print('estimator', 'score' )
for estimator,param_grid in zip(estimators,param_grids):
    gridsearch = GridSearchCV(estimator,param_grid,scoring='accuracy',cv=cv)
    gridsearch.fit(X,y)
    print(str(estimator).split('(')[0], gridsearch.best_score_)
    if gridsearch.best_score_ > best_score:
        best_score = gridsearch.best_score_
        best_estimator = gridsearch.best_estimator_
end=time.time()
print('best: ', str(best_estimator).split('(')[0], best_score)
print('minutes elapsed: ',(end-start)/60)

estimator score
GaussianNB 0.07993197278911565
KNeighborsClassifier 0.12414965986394558
LogisticRegression 0.11054421768707483
DecisionTreeClassifier 0.11904761904761904
RandomForestClassifier 0.1054421768707483


KeyboardInterrupt: 

## Model selection python_speech_features delta mfcc

In [84]:
train_data = pd.read_csv('train_data_delta_mfcc_psf.csv')
valid_data = pd.read_csv('valid_data_delta_mfcc_psf.csv')

X_train = train_data.drop(['label'],axis=1)
y_train = train_data['label']

X_valid = valid_data.drop(['label'],axis=1)
y_valid = valid_data['label']

In [85]:
X_train = X_train.fillna(0)
X_valid = X_valid.fillna(0)

In [86]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
X_valid = pd.DataFrame(scaler.transform(X_valid),columns=X_valid.columns)

In [87]:
estimators  = [GaussianNB(),KNeighborsClassifier(),LogisticRegression(),
              DecisionTreeClassifier(),RandomForestClassifier(),GradientBoostingClassifier()]

In [88]:
param_grids = [{},
               {'n_neighbors':[2,5,7,10,12,15], 'weights':['uniform','distance'],'p':[1,2]},
               {'C':np.logspace(-4,4,9),'penalty':['l1','l2'],'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,15,10),'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,15,10),'n_estimators':[50,100],'class_weight':[None,'balanced']},
               {'max_depth':np.linspace(2,10,9),'n_estimators':[50,100]}
         ]

In [89]:
X = np.concatenate((X_train,X_valid),axis=0)
y = np.concatenate((y_train,y_valid),axis=0)
test_fold = []
for i in range(len(X_train)):
    test_fold.append(-1)
for i in range(len(X_valid)):
    test_fold.append(0)

cv = PredefinedSplit(test_fold=test_fold)

In [None]:
best_estimator = None
best_score = 0
start=time.time()
print('estimator', 'score' )
for estimator,param_grid in zip(estimators,param_grids):
    gridsearch = GridSearchCV(estimator,param_grid,scoring='accuracy',cv=cv)
    gridsearch.fit(X,y)
    print(str(estimator).split('(')[0], gridsearch.best_score_)
    if gridsearch.best_score_ > best_score:
        best_score = gridsearch.best_score_
        best_estimator = gridsearch.best_estimator_
end=time.time()
print('best: ', str(best_estimator).split('(')[0], best_score)
print('minutes elapsed: ',(end-start)/60)

estimator score
GaussianNB 0.21258503401360543
KNeighborsClassifier 0.2465986394557823
LogisticRegression 0.21598639455782312
DecisionTreeClassifier 0.17857142857142858
RandomForestClassifier 0.24149659863945577
