In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [3]:
# Read the EMNS recording metadata table and preview its first rows.
metadata_csv = "./data/EMNS/wav_metadata.csv"
df = pd.read_csv(metadata_csv)
df.head()

Unnamed: 0,id,utterance,description,emotion,date_created,status,gender,age,level,audio_recording,user_id
0,1,Hannu wrote of his expedition in stone.,{user_id} said {transcription} in a {emotion} ...,Sarcastic,2022-05-04 15:57:16.957078,Complete,Female,20s,3,recorded_audio_A0MwoOW.wav,3
1,2,Little India differs from many other neighbour...,{transcription} said {user_id} in a {emotion} ...,Excited,2022-05-04 16:04:21.841726,Complete,Female,20s,6,recorded_audio_nZB5ujA.wav,3
2,3,Users had the ability to vote on the songs the...,{user_id} said {transcription} With a {emotion...,Neutral,2022-05-04 16:24:47.767515,Complete,Female,20s,0,recorded_audio.wav,3
3,4,"His major sponsor is Algario Communications, a...","In an {emotion} voice, {user_id} says {transcr...",Surprised,2022-05-04 16:09:25.118523,Complete,Female,20s,8,recorded_audio_6emr1kD.wav,3
4,5,The mansion also has a marriage office.,"In an {emotion} voice, {user_id} says {transcr...",Sarcastic,2022-05-11 10:29:13.448873,Complete,Female,20s,4,recorded_audio_ReBGM47.wav,3


In [4]:
# Discard the metadata columns that are not needed below; utterance, emotion,
# level and audio_recording remain.
unused_columns = [
    "id",
    "description",
    "date_created",
    "status",
    "gender",
    "age",
    "user_id",
]
df = df.drop(columns=unused_columns)
print(df.shape)
df.head()

(1181, 4)


Unnamed: 0,utterance,emotion,level,audio_recording
0,Hannu wrote of his expedition in stone.,Sarcastic,3,recorded_audio_A0MwoOW.wav
1,Little India differs from many other neighbour...,Excited,6,recorded_audio_nZB5ujA.wav
2,Users had the ability to vote on the songs the...,Neutral,0,recorded_audio.wav
3,"His major sponsor is Algario Communications, a...",Surprised,8,recorded_audio_6emr1kD.wav
4,The mansion also has a marriage office.,Sarcastic,4,recorded_audio_ReBGM47.wav


In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
train_df, valid_df = train_test_split(df, **split_options)

In [6]:
# Class balance of the training split (number of rows per emotion label).
train_emotions = train_df["emotion"]
emotions_count = train_emotions.value_counts()
print(emotions_count)

emotion
Disgust      129
Surprised    127
Neutral      123
Excited      122
Happy        119
Sad          109
Sarcastic    109
Angry        106
Name: count, dtype: int64


In [7]:
# Class balance of the validation split (number of rows per emotion label).
valid_emotions = valid_df["emotion"]
emotions_count = valid_emotions.value_counts()
print(emotions_count)

emotion
Sad          38
Happy        38
Disgust      33
Sarcastic    30
Excited      28
Neutral      26
Surprised    24
Angry        20
Name: count, dtype: int64


In [8]:
# Label vocabulary. Order matters: the position of a name in this list is the
# index that convert_to_vector sets to 1.0 in the one-hot vector.
all_emotions = [
    "Neutral", "Surprised", "Happy", "Sad",
    "Angry", "Disgust", "Excited", "Sarcastic",
]

In [9]:
def convert_to_vector(emotion, labels=None):
    """One-hot encode an emotion label.

    Args:
        emotion: Label to encode; must be an element of ``labels``.
        labels: Ordered sequence of known labels. When omitted, the
            notebook-level ``all_emotions`` list is used, so existing
            single-argument calls (e.g. ``Series.apply(convert_to_vector)``)
            behave as before.

    Returns:
        A list of ``len(labels)`` floats that is 1.0 at the position of
        ``emotion`` and 0.0 everywhere else.

    Raises:
        ValueError: If ``emotion`` is not one of ``labels``.
    """
    if labels is None:
        labels = all_emotions
    label_order = list(labels)

    try:
        position = label_order.index(emotion)
    except ValueError:
        # Re-raise with the offending value and the accepted vocabulary so a
        # bad row in the metadata is easy to track down.
        raise ValueError(
            f"Unknown emotion {emotion!r}; expected one of {label_order}"
        ) from None

    base_vector = [0.0] * len(label_order)
    base_vector[position] = 1.0
    return base_vector

In [10]:
# Replace the emotion name in both splits with its one-hot vector.
train_df = train_df.assign(emotion=train_df["emotion"].apply(convert_to_vector))
valid_df = valid_df.assign(emotion=valid_df["emotion"].apply(convert_to_vector))

# Validation rows always carry augment=0; the original `level` values are not
# kept, so the column is simply left out of the selection below.
validation_columns = ["utterance", "emotion", "audio_recording", "augment"]
valid_df = valid_df.assign(augment=0)[validation_columns]

# Only the validation split is written here; the training split is written
# further down, after its augment flag has been added.
valid_df.to_csv("./data/EMNS/validation.csv", sep="|", header=False, index=False)

In [11]:
# Build the training table with an `augment` flag: every sample appears twice,
# once with augment=0 and once with augment=1.
# NOTE(review): presumably the downstream loader applies audio augmentation
# to the augment=1 rows; confirm against the consumer of training.csv.
if "augment" in train_df.columns:
    # This cell has already run on the frame. Recover the un-duplicated rows
    # so that re-executing the cell does not double the training set again.
    base_train = train_df.loc[train_df["augment"] == 0]
else:
    base_train = train_df

# The original `level` values are not kept; they are dropped by the column
# selection below and replaced by the constant flag.
base_train = base_train.assign(augment=0)
augmented_train = base_train.assign(augment=1)

train_df = pd.concat([base_train, augmented_train], axis=0).reset_index(drop=True)
train_df = train_df[["utterance", "emotion", "audio_recording", "augment"]]
train_df.head()

Unnamed: 0,utterance,emotion,audio_recording,augment
0,Moore's charity work includes raising money fo...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",recorded_audio_RpsB2D1.wav,0
1,Aerobic conditioning can increase the duration...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",recorded_audio_W4Q77Qg.wav,0
2,"Apparently married young, she had one child wh...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",recorded_audio_jCU2Rdz.wav,0
3,Among her most well-known stories is How Ms. P...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",recorded_audio_xNCWmyp.wav,0
4,The team's mascots are Ozzie T. Cougar and his...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",recorded_audio_ekebBzk.wav,0


In [12]:
# Write the training table as a headerless, pipe-separated file.
training_csv = "./data/EMNS/training.csv"
train_df.to_csv(training_csv, sep="|", header=False, index=False)