# Lost in Translation
## Classifying Speech Language using Deep Learning 

### Part 2 - Feature Extraction

#### Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import IPython.display as ipd
import librosa
import librosa.display
from scipy.io import wavfile

#### Extract MFCC

In [2]:
def extract_features(file_name):
    """Return a fixed-length MFCC summary vector for one audio file.

    The file is loaded with ``librosa.load`` (default arguments), 40 MFCCs
    are computed, and the transposed MFCC matrix is averaged over axis 0 so
    that each coefficient collapses to a single value (presumably its mean
    across time frames, giving a length-40 vector; confirm against the
    librosa output layout).

    Parameters
    ----------
    file_name : str or path-like
        Path of the audio file to read.

    Returns
    -------
    numpy.ndarray or None
        The averaged MFCC vector, or ``None`` if loading or feature
        extraction raised. The failure, including its cause, is printed.
    """
    try:
        audio, sample_rate = librosa.load(file_name)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfcc_means = np.mean(mfccs.T, axis=0)
    except Exception as e:
        # Best-effort extraction: one unreadable file must not abort a long
        # batch run, but report *why* it failed so bad files can be fixed.
        print("Error encountered while parsing file: ", file_name, "-", repr(e))
        return None

    return mfcc_means

In [3]:
def save_feature(language, feature_lst, path="../data/"):
    """Append ``[features, language]`` rows for every WAV file of a language.

    Parameters
    ----------
    language : str
        Name of the sub-directory of ``path`` holding this language's audio;
        also used as the label stored in each row.
    feature_lst : list
        List that rows are appended to (mutated in place).
    path : str, optional
        Root directory containing one folder per language. Defaults to the
        original hard-coded ``"../data/"``.

    Returns
    -------
    list
        The same ``feature_lst`` object, for convenience.

    Notes
    -----
    Files for which ``extract_features`` returns ``None`` are skipped, so
    the result never contains rows without features.
    """
    lang_dir = os.path.join(path, language)
    # os.listdir order is arbitrary; sort so row order is reproducible.
    for audio_track in sorted(os.listdir(lang_dir)):
        # Require a real ".wav" extension (any letter case), not merely a
        # name that happens to end in the letters "wav".
        if not audio_track.lower().endswith('.wav'):
            continue
        track = os.path.join(lang_dir, audio_track)
        features = extract_features(track)
        if features is None:
            # Extraction failed (already reported); don't store an empty row.
            continue
        feature_lst.append([features, language])
    return feature_lst

In [4]:
# Build one DataFrame holding the extracted features of every language;
# it is pickled in a later cell.
lang_lst = ['chinese','dutch','finnish','french','german','greek','hungarian','japanese','russian','spanish']

# Collect the rows of all languages first and construct the frame once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all previously accumulated rows on every iteration.
all_rows = []
for lang in lang_lst:
    all_rows.extend(save_feature(lang, []))

df = pd.DataFrame(all_rows, columns=['features','label'])

In [5]:
# Display the assembled feature table (the notebook renders a truncated preview).
df

Unnamed: 0,features,label
0,"[-328.1553, 33.20999, -14.629965, 25.129225, -...",chinese
1,"[-317.4969, 33.81309, -23.309122, 31.999235, -...",chinese
2,"[-336.15854, 35.530407, -25.493, 18.644785, -2...",chinese
3,"[-333.79565, 31.953947, -13.206911, 22.866724,...",chinese
4,"[-354.22464, 45.220486, -17.17353, 27.299313, ...",chinese
...,...,...
64288,"[-351.8527, 107.06704, -3.2553566, 47.231853, ...",spanish
64289,"[-367.3681, 112.8195, -10.63875, 28.11285, 0.3...",spanish
64290,"[-359.3411, 113.6945, -2.7332597, 25.764277, 2...",spanish
64291,"[-327.3389, 131.14326, -6.5025754, 38.206024, ...",spanish


In [6]:
# Persist the feature table as a pickle in the notebook's working directory.
df.to_pickle("./df_all.pkl")

#### Check the pickle

In [7]:
# Reload the pickle from disk to confirm it can be read back.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df_check = pd.read_pickle('./df_all.pkl')

In [8]:
# Display the reloaded frame for a visual check of its contents.
df_check

Unnamed: 0,features,label
0,"[-328.1553, 33.20999, -14.629965, 25.129225, -...",chinese
1,"[-317.4969, 33.81309, -23.309122, 31.999235, -...",chinese
2,"[-336.15854, 35.530407, -25.493, 18.644785, -2...",chinese
3,"[-333.79565, 31.953947, -13.206911, 22.866724,...",chinese
4,"[-354.22464, 45.220486, -17.17353, 27.299313, ...",chinese
...,...,...
64288,"[-351.8527, 107.06704, -3.2553566, 47.231853, ...",spanish
64289,"[-367.3681, 112.8195, -10.63875, 28.11285, 0.3...",spanish
64290,"[-359.3411, 113.6945, -2.7332597, 25.764277, 2...",spanish
64291,"[-327.3389, 131.14326, -6.5025754, 38.206024, ...",spanish
