In [24]:
import librosa
import librosa.display

# Import the audio playback widget
import IPython.display as ipd
from IPython.display import Image
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from scipy.signal import lfilter

### Analyzing an audio file

### Data Preprocessing

In [25]:
'''
LABELS :-
 0 - ANGER
 1 - BORED
 2 - DISGUST
 3 - ANXIETY
 4 - HAPPY
 5 - SAD
 6 - Neutral

GENDER :-
  1 - MALE
  0 - FEMALE
'''
labels_encoded = {'W':0, 'L':1, 'E':2, 'A':3, 'F':4, 'T':5, 'N':6}
gender_encoded = {'03':1 , '10' : 1, '11':1 , '12':1 , '15':1, '08':0, '09':0, '13':0 , '14':0, '16':0}

In [26]:
labels = []
file_paths = []

In [27]:
dataset_path = './wav'
for file_name in os.listdir(dataset_path):
  if file_name.endswith('.wav'):
    file_paths.append(file_name)
    labels.append(labels_encoded[file_name[5]])

In [28]:
df = pd.DataFrame(
    {
        'file_path':file_paths,
        'label':labels,
    }
)


In [29]:
plot_x = np.array(df.label.unique())
plot_x.sort()
emotion_unique_vals = df.label.value_counts(sort = True)
plot_y = []
for x in plot_x:
  plot_y.append(emotion_unique_vals[x])
emotion_unique_vals = pd.DataFrame(emotion_unique_vals)
# plt.bar( plot_x, plot_y)

### Extracting MFCC Features

In [30]:
df['file_path']
df_2_audio_vals = []
x = 1
for audio_file in df['file_path']:
  data, sampling_rate = librosa.load(f'./wav/{audio_file}')
  data_mfcc = librosa.feature.mfcc(y = data , sr = sampling_rate, n_mfcc = 13)
  df_2_audio_vals.append(data_mfcc)



#### Finding maximum windows present in all the audio files

In [31]:
columns_length = [aud.shape[1] for aud in df_2_audio_vals]
max_col_len = max(columns_length)

#### Padding

In [32]:
## Making everything to same length
df_2_audio_vals_padded = df_2_audio_vals.copy()
for ind, arr in enumerate(df_2_audio_vals_padded):
  arr_sh = arr.shape[1]
  zero_cols = max_col_len - arr_sh
  zero_arr = np.zeros((arr.shape[0], zero_cols))
  res_arr = np.hstack((arr, zero_arr))
  df_2_audio_vals_padded[ind] = res_arr

#### Standardizing the data

In [33]:
scaler = StandardScaler()
for ind in range(0, len(df_2_audio_vals_padded)):
  df_2_audio_vals_padded[ind] = scaler.fit_transform(df_2_audio_vals_padded[ind])

In [34]:
df_2 = pd.DataFrame(
    {
        'audio': df_2_audio_vals_padded,
        'labels': labels
    }
)

#### Flattening

In [35]:
## FLATTEN THE MATRIX IN EACH CELL
df_2_flattened = df_2.copy()
df_2_flattened['audio'] = df_2_flattened['audio'].apply(lambda x : x.reshape(-1))

In [36]:
X = df_2_flattened['audio'].values
y = df_2['labels'].values
X_array =[]
y_array=[]
for i in range(len(X)):
    temp=X[i].tolist()
    X_array.append(temp)
    y_array.append(df_2['labels'][i])

X_array = np.array(X_array)
y_array = np.array(y_array)
X = X_array
y = y_array

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Model Building

In [38]:
from sklearn.svm import SVC
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
svm_predictions = svm_model_linear.predict(X_test)

print(accuracy_score(y_test,svm_predictions))


0.5092592592592593


In [40]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, Y_train, Y_test=train_test_split(X,y,test_size=0.2)

model=RandomForestClassifier()
# model.seed(10)
model.fit(X_train,Y_train)
model.score(X_test,Y_test)

0.5092592592592593