<a href="https://colab.research.google.com/github/swilsonmfc/anomaly/blob/master/DatasetSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Speech
![Public speaking](https://greatspeech.com/wp-content/uploads/2015/05/public-speaking-3926344_19201-1030x687.jpg)

# Notes
The real-world speech dataset consists of 3686 segments of English speech spoken with different accents. This dataset is provided by the Speech Processing Group at Brno University of Technology, Czech Republic. The majority of the data corresponds to the American accent, and only 1.65% corresponds to one of seven other accents (these are referred to as outliers). The speech segments are represented by 400-dimensional so-called i-vectors, which are widely used state-of-the-art features for speaker and language recognition. It is a subset of the data described [here](http://www.fit.vutbr.cz/research/groups/speech/publi/2012/brummer_odyssey2012_216-223-40.pdf).

# Setup

In [2]:
import pandas as pd
import numpy as np
from scipy.io import loadmat

# Data

In [1]:
!wget https://www.dropbox.com/s/w6xv51ctea6uauc/speech.mat 

In [4]:
# Read the downloaded MATLAB file; the 'X' and 'y' entries are used by later cells.
mat = loadmat(file_name='speech.mat')

In [5]:
# Wrap the 'X' array from the .mat file in a DataFrame and label its columns X0, X1, ...
df = pd.DataFrame(mat['X']).add_prefix('X')

In [6]:
# Display the feature frame; the notebook renders a truncated preview of rows and columns.
df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X390,X391,X392,X393,X394,X395,X396,X397,X398,X399
0,-0.35069,0.52721,-1.62470,-1.334000,-1.03020,1.247500,-0.98067,-1.558700,-0.061233,1.048200,...,0.437720,0.570960,-1.19840,2.168600,-0.30353,1.21620,-0.607070,0.503820,-0.45571,0.86283
1,-0.42537,-0.08818,0.84575,0.881570,1.35690,0.748940,-1.68620,0.461510,0.585700,0.775950,...,-0.016239,-0.582980,-0.35666,-0.638000,0.64106,-0.94488,-1.069700,-0.024029,0.28696,-0.12374
2,-1.22100,0.43960,-0.06303,0.709530,0.95079,-0.218740,-1.24850,-1.628600,-1.339700,-1.858500,...,1.985500,-0.042742,2.77110,1.549400,-1.75680,-0.91595,-0.078469,0.097137,-0.55111,0.69951
3,-0.30132,0.55221,0.13769,0.715290,0.99311,-0.169590,-0.93405,-0.948250,-0.750880,0.003107,...,1.172300,-0.103520,2.53710,2.096000,-1.43680,-1.05230,-0.070276,-0.539810,-1.10380,0.70562
4,0.31292,0.42762,0.50348,0.005489,-0.94274,-0.176190,-0.50775,-0.464450,0.694750,0.376040,...,-1.317400,1.060700,1.39660,0.096776,-1.18360,-0.37802,0.357150,1.278600,-0.40139,0.11640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3681,-0.73029,0.41857,-0.41746,-0.454210,0.71168,-0.052522,-1.86610,0.220370,0.488910,-2.072700,...,-0.229240,-0.586100,1.47180,0.263640,-0.68870,-1.10260,0.467970,0.481060,-0.29457,0.55751
3682,-0.29375,1.46840,-0.28178,-1.411000,-0.91399,1.427700,0.68553,-0.549750,0.025032,0.857950,...,1.569100,0.575650,0.58935,2.316300,1.00600,-0.15520,-0.343240,-0.023543,-1.57520,-0.37716
3683,0.94843,0.35538,-1.04010,0.292630,0.17207,-0.549300,-2.22700,0.111300,0.403860,0.000354,...,-2.006600,0.544540,2.01170,0.584910,-0.43648,-0.11118,0.197870,0.189580,1.04610,-0.30757
3684,1.49800,-0.66785,1.04800,-1.569300,-0.65241,1.957500,0.45265,0.807050,0.601570,0.813750,...,-0.648440,-0.453780,-0.29842,-0.641800,0.49518,-2.56460,-0.708690,0.369670,-0.49249,-0.40257


# Outlier

In [7]:
# Build the label frame from the 'y' array in the .mat file, with its column named OUTLIER.
outlier_df = pd.DataFrame(mat['y'], columns=['OUTLIER'])

In [8]:
# Show the first rows of the OUTLIER label column.
outlier_df.head()

Unnamed: 0,OUTLIER
0,1
1,1
2,1
3,1
4,1


# Save

In [9]:
# Place the OUTLIER label column to the right of the feature columns (aligned on the row index).
export_df = pd.concat(objs=[df, outlier_df], axis='columns')

In [10]:
# Show the first rows of the combined frame (features plus the OUTLIER column).
export_df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,OUTLIER
0,-0.35069,0.52721,-1.6247,-1.334,-1.0302,1.2475,-0.98067,-1.5587,-0.061233,1.0482,...,0.57096,-1.1984,2.1686,-0.30353,1.2162,-0.60707,0.50382,-0.45571,0.86283,1
1,-0.42537,-0.08818,0.84575,0.88157,1.3569,0.74894,-1.6862,0.46151,0.5857,0.77595,...,-0.58298,-0.35666,-0.638,0.64106,-0.94488,-1.0697,-0.024029,0.28696,-0.12374,1
2,-1.221,0.4396,-0.06303,0.70953,0.95079,-0.21874,-1.2485,-1.6286,-1.3397,-1.8585,...,-0.042742,2.7711,1.5494,-1.7568,-0.91595,-0.078469,0.097137,-0.55111,0.69951,1
3,-0.30132,0.55221,0.13769,0.71529,0.99311,-0.16959,-0.93405,-0.94825,-0.75088,0.003107,...,-0.10352,2.5371,2.096,-1.4368,-1.0523,-0.070276,-0.53981,-1.1038,0.70562,1
4,0.31292,0.42762,0.50348,0.005489,-0.94274,-0.17619,-0.50775,-0.46445,0.69475,0.37604,...,1.0607,1.3966,0.096776,-1.1836,-0.37802,0.35715,1.2786,-0.40139,0.1164,1


In [11]:
# Create the output directory for the CSV export; -p makes this a no-op if it already exists.
!mkdir -p data

In [12]:
# Write features + OUTLIER label to CSV. index=False omits the row index so the file
# contains only the X0..X399 and OUTLIER columns.
export_df.to_csv('data/data_anomaly_speech.csv', index=False)