In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn as sk
import os
import sys
from dotenv import load_dotenv

In [None]:
# Absolute path of the directory two levels above the working directory
# (presumably the repository root, so that the `src` package can be imported).
module_path = os.path.abspath(os.path.join('../../'))

# Register it on the import path only once, so re-running the cell is harmless.
if sys.path.count(module_path) == 0:
    sys.path += [module_path]

In [None]:
from src.sql_handling.execute_sql import execute_sql_pandas

In [None]:
# Location of the shared configuration script, relative to the working directory.
glob_conf_path = '../global_config.py'

# Load variables from a .env file into the environment, then read the output
# directory from AUDIO_OUT (None when the variable is not set). It is used at
# the end of the notebook as the directory for the exported CSV.
load_dotenv()
output_path = os.environ.get("AUDIO_OUT")

In [None]:
# Run the shared configuration script in this notebook's namespace so the names
# it defines become notebook globals. Later cells read `sns_saturation`,
# `blue_rgb` and `seed`, which are presumably defined there.
# NOTE(review): exec() runs whatever code `glob_conf_path` contains; only point
# it at a trusted file. Importing the config as a module would be safer.
with open(glob_conf_path) as config_file:  # context manager closes the handle
    exec(config_file.read())

# Select data

In [None]:
# Select every column of opensmile_functionals for rows with mix = 0 that
# belong to the four listed video ids. The return value of execute_sql_pandas
# is unpacked into the result (used as a DataFrame below) and a duration that
# a later cell reports in seconds.
query = """SELECT *
FROM opensmile_functionals
WHERE mix = 0
AND video_id IN ('A101', 'A102', 'A18', 'A200')"""
df, read_duration = execute_sql_pandas(query)

In [None]:
# Report the duration that was returned alongside the query result.
print("query executed in: {} seconds".format(read_duration))

In [None]:
# Display the query result.
df

In [None]:
# Discard the columns that the rest of this notebook does not reference.
# Note: `df` is overwritten, so running this cell a second time raises a
# KeyError because the columns are already gone.
df = df.drop(
    [
        "file", "start", "end", "mix",
        "emotion_2", "emotion_2_id",
        "proportions", "mode", "intensity_level",
        "version", "situation",
    ],
    axis=1,
)

In [None]:
# List the column names that remain after the drop.
df.columns.values

# Data inspection before data cleaning
In this section we will inspect the dataset. For this purpose, we will focus on the number of files per emotion.

In [None]:
# Count the distinct filenames for each value of `emotion_1`, print the counts
# and their total, and draw them as a bar chart.
print('Files per emotion before data cleaning')
files_per_emotion_count_original = df[['emotion_1','filename']].groupby(['emotion_1']).filename.nunique()
print('Total number of files: {}'.format(files_per_emotion_count_original.sum()))
print(files_per_emotion_count_original)
plt.figure(figsize=(15,5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments. `sns_saturation` and `blue_rgb` are not defined in this
# notebook's cells; presumably they come from the exec'd global configuration.
sns.barplot(
    x=files_per_emotion_count_original.index,
    y=files_per_emotion_count_original.values,
    saturation=sns_saturation,
    color=blue_rgb,
)
plt.title('Files per emotion before data cleaning (eGeMAPS)')
# NOTE(review): the x axis shows `emotion_1` values, not `emotion_1_id`; confirm
# that 'Emotion ID' is the intended label.
plt.xlabel('Emotion ID')
plt.ylabel('Number of Files')
plt.show()
print()

In [None]:
# check for null values: evaluates to True if at least one cell of df is null
df.isnull().values.any()

# Normalize training set


In [None]:
from sklearn import preprocessing

# eGeMAPS
# Feature matrix: everything except the label and identifier columns.
X = df.drop(columns=['emotion_1','emotion_1_id','filename','video_id'])
# Fit a min-max scaler on the features and transform them in one step.
min_max_scaler_egemaps = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler_egemaps.fit_transform(X)
# fit_transform returns a bare array, so rebuild the frame with the original
# column names AND the original index. pd.concat(axis=1) aligns rows on the
# index; reusing X.index keeps each scaled row next to its own metadata even
# if df does not carry a default 0..n-1 index.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns.to_list(), index=X.index)
# Identifier/label columns carried along unscaled.
metadata_df = df[['emotion_1_id','filename','video_id']]
train_scaled_df = pd.concat([X_scaled_df, metadata_df], axis=1)

In [None]:
# Display the scaled feature table built in the normalisation cell.
# (The previous name, `train_egemaps_scaled_df`, is not defined in this notebook.)
train_scaled_df

In [None]:
# Distinct video ids present in the scaled table.
train_scaled_df.video_id.unique()

In [None]:
def get_evens(size):
    """Return the even integers in ``range(size)`` as a list, in ascending order.

    The list is empty when ``size`` is zero or negative.
    """
    return list(range(0, size, 2))

def get_odds(size):
    """Return the odd integers in ``range(size)`` as a list, in ascending order.

    The list is empty when ``size`` is one or smaller.
    """
    return list(range(1, size, 2))

In [None]:
import random

# Distinct video ids found in the scaled table.
video_ids = train_scaled_df.video_id.unique()

# Seed the global RNG so the pairing below is reproducible
# (`seed` is not assigned in this notebook's cells; presumably it comes from
# the exec'd global configuration).
random.seed(seed)

# Positions into `video_ids`: the even positions keep their order, the odd
# positions are put into a random order.
video_ids_1 = get_evens(len(video_ids))
video_ids_2 = get_odds(len(video_ids))
video_ids_2_shuffled = random.sample(video_ids_2, len(video_ids_2))

# The k-th even position and the k-th shuffled odd position both receive
# group number k, which pairs each even-position id with one odd-position id.
groups = {video_ids[position]: group for group, position in enumerate(video_ids_1)}
groups.update(
    {video_ids[position]: group for group, position in enumerate(video_ids_2_shuffled)}
)

print(groups)

# New frame (the scaled table itself is left untouched) with a `group` column
# looked up from each row's video id.
train_scaled_groups_df = train_scaled_df.assign(
    group=train_scaled_df['video_id'].map(groups)
)

In [None]:
# Display the scaled table with the added group column.
train_scaled_groups_df

In [None]:
# `output_path` is read from the AUDIO_OUT environment variable and is None
# when that variable is unset. Fail with an explicit message instead of the
# opaque TypeError that os.path.join(None, ...) would raise.
if output_path is None:
    raise RuntimeError(
        "AUDIO_OUT is not set; cannot determine where to save 'audio_data_egemaps_train.csv'"
    )
save_path = os.path.join(output_path, 'audio_data_egemaps_train.csv')
# index=False is the documented way to leave the row index out of the file.
train_scaled_groups_df.to_csv(save_path, index=False, header=True)