In [1]:
import pandas as pd
import numpy as np
import os

import librosa
import moviepy.editor as mp

#use this package to extract mfcc features
import python_speech_features as mfcc
from python_speech_features import delta

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
import time

## Prepare the dataset

In [2]:
#get all the video files
#video_path = r"C:/Users/tedf0/OneDrive/Desktop/DS3/MELD.Raw/train_splits"
audio_path = r"C:\Users\tedf0\OneDrive\Desktop\cogs118\Wavs"
#files = os.listdir(video_path)
wav_files = os.listdir(audio_path)
#error_files when converting data
error_files = ['dia125_utt3.mp4']

In [3]:
data_df = pd.read_csv(r"C:\Users\tedf0\OneDrive\Desktop\cogs118\train_sent_emo.csv")
#drop the row that give us an error audio
error = data_df[(data_df['Dialogue_ID']==125) & (data_df['Utterance_ID']==3)].index
data_df.drop(error, inplace=True)
data_df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So lets talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
...,...,...,...,...,...,...,...,...,...,...,...
9984,10474,You or me?,Chandler,neutral,neutral,1038,13,2,3,"00:00:48,173","00:00:50,799"
9985,10475,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,neutral,neutral,1038,14,2,3,"00:00:51,009","00:00:53,594"
9986,10476,"You guys are messing with me, right?",Joey,surprise,positive,1038,15,2,3,"00:01:00,518","00:01:03,520"
9987,10477,Yeah.,All,neutral,neutral,1038,16,2,3,"00:01:05,398","00:01:07,274"


### Helper Functions

In [4]:
def extract_features(df):
    """
    Extract features from a dataframes of audio files
    Input: A dataframe contains the Dialogue ID and Utterance ID
    Output: A 2D numpy array of features extracted from audio files (MFCC, MFCC_delta, MFCC_delta_delta)
    """
    mfcc_features = np.array([])
    counter = 0 #used to count how many audio files does not have mfcc
    
    for i in range(len(df)):
        entry = df.iloc[i]
        dia = entry['Dialogue_ID']
        utt = entry['Utterance_ID']
        path = audio_path + f"\dia{dia}_utt{utt}.wav"
        audio, sr = librosa.load(path,res_type='kaiser_fast'
                                  ,duration=2.5
                                  ,sr=44100
                                  ,offset=0.5)
        #print(audio.shape)
        try:
            audio_mfcc = mfcc.mfcc(audio, sr,nfilt=20, nfft=1200, appendEnergy=True)
        except:
            #print('no mfcc from audio')
            counter = counter+1
            continue
            
        audio_mfcc = preprocessing.scale(audio_mfcc)
        delta1 = delta(audio_mfcc, 2)
        delta2 = delta(delta1, 2)
        combined = np.hstack((audio_mfcc, delta1, delta2))
        if mfcc_features.size == 0:
            mfcc_features = combined
        else:
            mfcc_features = np.vstack((mfcc_features, combined))

    return mfcc_features

In [5]:
def get_gender(speaker):
    """
    return the gender of the character
    """
    if speaker in male_speakers:
        return 'Male'
    else:
        return 'Female'
    

def identify_gender(dia, utt, gmm_male, gmm_female):
    """
    Given an audio file and 2 models, identify the gender
    Input: Dialogue ID, Utterance ID, and six models(one for each speaker)
    Output: the speaker
    """
    path = audio_path + f"\dia{dia}_utt{utt}.wav"

    audio, sr = librosa.load(path,res_type='kaiser_fast'
                                      ,duration=2.5
                                      ,sr=44100
                                      ,offset=0.5)
    try:
        audio_mfcc = mfcc.mfcc(audio, sr,nfilt=20, nfft=1200, appendEnergy=True)
    except:
        return None
    audio_mfcc = preprocessing.scale(audio_mfcc)
    delta1 = delta(audio_mfcc, 2)
    delta2 = delta(delta1, 2)
    combined = np.hstack((audio_mfcc, delta1, delta2))
    #print(combined.shape)
    #print(m1.score_samples(combined).shape)
    #print(m1.score(combined))
    male_score = gmm_male.score(combined)
    female_score = gmm_female.score(combined)
    
    if male_score > female_score:
        return "Male"
    else:
        return "Female"

def identify_speaker(dia, utt, m1, m2, m3, m4, m5, m6):
    """
    Given an audio file and six models, identify the speaker
    Input: Dialogue ID, Utterance ID, and six models(one for each speaker)
    Output: the speaker
    """
    path = audio_path + f"\dia{dia}_utt{utt}.wav"

    audio, sr = librosa.load(path,res_type='kaiser_fast'
                                      ,duration=2.5
                                      ,sr=44100
                                      ,offset=0.5)
    try:
        audio_mfcc = mfcc.mfcc(audio, sr,nfilt=20, nfft=1200, appendEnergy=True)
    except:
        return None
    audio_mfcc = preprocessing.scale(audio_mfcc)
    delta1 = delta(audio_mfcc, 2)
    delta2 = delta(delta1, 2)
    combined = np.hstack((audio_mfcc, delta1, delta2))
    scores = np.array([m1.score(combined), m2.score(combined), m3.score(combined),
                         m4.score(combined), m5.score(combined), m6.score(combined)])
    idx = np.argmax(scores)
    
    return main[idx]

## Gender Verification

### Split Test/Train dataset

In [6]:
main = ['Chandler', 'Rachel', 'Ross', 'Joey', 'Monica', 'Phoebe']
male_speakers = ['Chandler', 'Ross', 'Joey']
female_speakers = ['Rachel', 'Monica', 'Phoebe']
main_data = data_df[data_df['Speaker'].isin(main)]
main_data['Gender'] = main_data['Speaker'].apply(get_gender)
X_train, X_test, y_train, y_test = train_test_split(main_data[['Dialogue_ID', 'Utterance_ID','Speaker', 'Gender']], 
                                                    main_data['Gender'], 
                                                    test_size=0.25, shuffle=True, random_state=42)

### Extract Features

In [7]:
male_train_dataset = X_train[X_train['Gender']=='Male']
female_train_dataset = X_train[X_train['Gender']=='Female']

male_mfcc = extract_features(male_train_dataset)
female_mfcc = extract_features(female_train_dataset)

### Training models

In [8]:
male_gmm = GaussianMixture(n_components=8, max_iter=200, covariance_type='diag', n_init=3)
male_gmm.fit(male_mfcc)
female_gmm = GaussianMixture(n_components=8, max_iter=200, covariance_type='diag', n_init=3)
female_gmm.fit(female_mfcc)

GaussianMixture(covariance_type='diag', max_iter=200, n_components=8, n_init=3)

### Predict and calculate accuracy

In [9]:
prediction = X_test.copy()
prediction['Predictions'] = prediction.apply(lambda row: identify_gender(row['Dialogue_ID'], row['Utterance_ID'], 
                                                          male_gmm, female_gmm), axis=1)
prediction

Unnamed: 0,Dialogue_ID,Utterance_ID,Speaker,Gender,Predictions
5430,576,4,Chandler,Male,Male
2586,273,18,Joey,Male,Male
8798,925,4,Chandler,Male,Male
4193,450,7,Phoebe,Female,Female
3285,346,0,Chandler,Male,Male
...,...,...,...,...,...
5008,531,14,Rachel,Female,Female
3846,414,3,Joey,Male,Male
5973,632,0,Monica,Female,Female
5604,596,20,Ross,Male,Male


In [10]:
correct = sum(prediction['Predictions'] == y_test)
accuracy = correct/len(prediction)
accuracy

0.7920077034183919

In [11]:
#train, test and calculate automatically
def train_test_gender_verification(n, cov):
    """
    n - n_components
    cov - covariance_type
    """
    start = time.time()
    male_gmm = GaussianMixture(n_components=n, max_iter=100, covariance_type=cov, n_init=1)
    male_gmm.fit(male_mfcc)
    female_gmm = GaussianMixture(n_components=n, max_iter=100, covariance_type=cov, n_init=1)
    female_gmm.fit(female_mfcc)
    training_ends = time.time()
    train_time = training_ends - start
    
    prediction = X_test.copy()
    prediction['predicted_gender'] = prediction.apply(lambda row: identify_gender(row['Dialogue_ID'], row['Utterance_ID'], 
                                                          male_gmm, female_gmm), axis=1)

    correct_label = (prediction['predicted_gender'] == y_test)
    accuracy = correct_label.sum()/len(prediction)
    predict_time = time.time() - training_ends
    
    print("The accuracy of this model is "+str(accuracy))
    print("The model takes "+str(round(train_time, 2))+" seconds to train")
    print("The model takes "+str(round(predict_time,2))+" seconds to predict")
    return accuracy

In [12]:
#n_component=1, covariance_type='diag'
accuracy = train_test_gender_verification(n=1, cov='diag')

The accuracy of this model is 0.5479056331246991
The model takes 3.15 seconds to train
The model takes 53.43 seconds to predict


In [13]:
#n_component=2, covariance_type='diag'
accuracy = train_test_gender_verification(n=2, cov='diag')

The accuracy of this model is 0.6528647087144921
The model takes 30.88 seconds to train
The model takes 51.83 seconds to predict


In [14]:
#n_component=8, covariance_type='diag'
accuracy = train_test_gender_verification(n=8, cov='diag')

The accuracy of this model is 0.7924891670678864
The model takes 62.05 seconds to train
The model takes 51.76 seconds to predict


In [15]:
#n_component=32, covariance_type='diag'
accuracy = train_test_gender_verification(n=32, cov='diag')

The accuracy of this model is 0.8281174771304767
The model takes 235.94 seconds to train
The model takes 54.87 seconds to predict


In [16]:
#n_component=64, covariance_type='diag'
accuracy = train_test_gender_verification(n=64, cov='diag')

The accuracy of this model is 0.8290804044294656
The model takes 491.45 seconds to train
The model takes 56.9 seconds to predict


## Speaker Recognition

### Split the Train/Test dataset

In [17]:
#select the main characters and split training and testing set
main = ['Chandler', 'Rachel', 'Ross', 'Joey', 'Monica', 'Phoebe']
main_data = data_df[data_df['Speaker'].isin(main)]
X_train, X_test, y_train, y_test = train_test_split(main_data[['Dialogue_ID', 'Utterance_ID','Speaker']], 
                                                    main_data['Speaker'], 
                                                    test_size=0.25, shuffle=True, random_state=42)

### Extract the features for each speaker

In [18]:
chandler = X_train[X_train['Speaker']=='Chandler']
rachel = X_train[X_train['Speaker']=='Rachel']
ross = X_train[X_train['Speaker']=='Ross']
joey = X_train[X_train['Speaker']=='Joey']
monica = X_train[X_train['Speaker']=='Monica']
phoebe = X_train[X_train['Speaker']=='Phoebe']


#process the data, get the mfccs for every speaker
print('start extracting')
chandler_mfcc = extract_features(chandler)
print('chandler done')
rachel_mfcc = extract_features(rachel)
print('rachel done')
ross_mfcc = extract_features(ross)
print('ross done')
joey_mfcc = extract_features(joey)
print('joey done')
monica_mfcc = extract_features(monica)
print('monica done')
phoebe_mfcc = extract_features(phoebe)
print('phoebe done')

start extracting
chandler done
rachel done
ross done
joey done
monica done
phoebe done


### Create the models, one for each speaker

In [19]:
gmm_chandler = GaussianMixture(n_components=32, max_iter=200, covariance_type='diag', n_init=1)
gmm_chandler.fit(chandler_mfcc)
gmm_rachel = GaussianMixture(n_components=32, max_iter=200, covariance_type='diag', n_init=1)
gmm_rachel.fit(rachel_mfcc)
gmm_ross = GaussianMixture(n_components=32, max_iter=200, covariance_type='diag', n_init=1)
gmm_ross.fit(ross_mfcc)
gmm_joey = GaussianMixture(n_components=32, max_iter=200, covariance_type='diag', n_init=1)
gmm_joey.fit(joey_mfcc)
gmm_monica = GaussianMixture(n_components=32, max_iter=200, covariance_type='diag', n_init=1)
gmm_monica.fit(monica_mfcc)
gmm_phoebe = GaussianMixture(n_components=32, max_iter=200, covariance_type='diag', n_init=1)
gmm_phoebe.fit(phoebe_mfcc)

GaussianMixture(covariance_type='diag', max_iter=200, n_components=32)

### Use the model to identify the speakers in the testing set

In [20]:
#n_components = 32
prediction = X_test.copy()
prediction['Predictions'] = prediction.apply(lambda row: identify_speaker(row['Dialogue_ID'], row['Utterance_ID'], 
                                                          gmm_chandler, gmm_rachel, gmm_ross,
                                                         gmm_joey, gmm_monica, gmm_phoebe), axis=1)

correct_label = (prediction['Speaker'] == prediction['Predictions'])
accuracy = correct_label.sum()/len(prediction)

print("The accuracy of this model is "+str(accuracy))

The accuracy of this model is 0.5907558979297063


In [25]:
prediction['predicted_gender'] = prediction['Predictions'].apply(get_gender)
prediction['Gender'] = prediction['Speaker'].apply(get_gender)
sum(prediction['Gender'] == prediction['predicted_gender'])/len(prediction)

0.8435243139142995

In [26]:
test = prediction[prediction['predicted_gender']=='Male']
sum(test['Speaker'] == test['Predictions'])/len(test)

0.6195028680688337

In [27]:
test = prediction[prediction['predicted_gender']=='Female']
sum(test['Speaker'] == test['Predictions'])/len(test)

0.5615906886517944

### Function to run the train and test for different parameters of the model

In [28]:
def train_test_speaker_recognition(n, cov):
    """
    n - n_components
    cov - covariance_type
    """
    start = time.time()
    gmm_chandler = GaussianMixture(n_components=n, max_iter=200, covariance_type=cov, n_init=1)
    gmm_chandler.fit(chandler_mfcc)
    gmm_rachel = GaussianMixture(n_components=n, max_iter=200, covariance_type=cov, n_init=1)
    gmm_rachel.fit(rachel_mfcc)
    gmm_ross = GaussianMixture(n_components=n, max_iter=200, covariance_type=cov, n_init=1)
    gmm_ross.fit(ross_mfcc)
    gmm_joey = GaussianMixture(n_components=n, max_iter=200, covariance_type=cov, n_init=1)
    gmm_joey.fit(joey_mfcc)
    gmm_monica = GaussianMixture(n_components=n, max_iter=200, covariance_type=cov, n_init=1)
    gmm_monica.fit(monica_mfcc)
    gmm_phoebe = GaussianMixture(n_components=n, max_iter=200, covariance_type=cov, n_init=1)
    gmm_phoebe.fit(phoebe_mfcc)
    training_ends = time.time()
    train_time = training_ends - start
    
    prediction = X_test.copy()
    prediction['Predictions'] = prediction.apply(lambda row: identify_speaker(row['Dialogue_ID'], row['Utterance_ID'], 
                                                              gmm_chandler, gmm_rachel, gmm_ross,
                                                             gmm_joey, gmm_monica, gmm_phoebe), axis=1)

    correct_label = (prediction['Speaker'] == prediction['Predictions'])
    accuracy = correct_label.sum()/len(prediction)
    predict_time = time.time() - training_ends
    
    print("The accuracy of this model is "+str(accuracy))
    print("The model takes "+str(round(train_time, 2))+" seconds to train")
    print("The model takes "+str(round(predict_time,2))+" seconds to predict")
    
    return accuracy

In [29]:
#n_component=1, covariance_type='diag'
accuracy = train_test_speaker_recognition(n=1, cov='diag')

The accuracy of this model is 0.2128069330765527
The model takes 3.82 seconds to train
The model takes 65.13 seconds to predict


In [30]:
#n_component=6, covariance_type='diag'
accuracy = train_test_speaker_recognition(n=6, cov='diag')

The accuracy of this model is 0.5012036591237362
The model takes 62.45 seconds to train
The model takes 58.64 seconds to predict


In [31]:
#n_component=8, covariance_type='diag'
accuracy = train_test_speaker_recognition(n=8, cov='diag')

The accuracy of this model is 0.5079441502166586
The model takes 68.2 seconds to train
The model takes 57.85 seconds to predict


In [32]:
#n_component=32, covariance_type='diag'
accuracy = train_test_speaker_recognition(n=32, cov='diag')

The accuracy of this model is 0.5936446798266731
The model takes 277.63 seconds to train
The model takes 66.1 seconds to predict


In [33]:
#n_component=64, covariance_type='diag'
accuracy = train_test_speaker_recognition(n=64, cov='diag')

The accuracy of this model is 0.6254212806933076
The model takes 585.6 seconds to train
The model takes 70.9 seconds to predict
