# Audio preprocessing

In [None]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def uniform_length(str, max_len=1000):
    """ Pads with zeros or truncates a stringified sequence to exactly
        max_len items and returns it as the repr of a list.
    """
    # NOTE(review): eval() executes arbitrary code; only safe when the
    # strings come from a trusted source. Consider ast.literal_eval if the
    # inputs are always plain literals.
    values = list(eval(str))
    shortfall = max_len - len(values)
    if shortfall > 0:
        values.extend([0] * shortfall)
    return repr(values[:max_len])

def add_feature(srs, split, features):
    """ Parses every stringified sequence in srs and appends it, as a list,
        to features[split][<index label>], creating the entry on first use.
        The features mapping is modified in place and also returned.
    """
    for feature in srs.index:
        # NOTE(review): eval() executes arbitrary code; only safe when the
        # stored strings come from a trusted source.
        parsed = list(eval(srs[feature]))
        features[split].setdefault(feature, []).append(parsed)

    return features

def get_audio_features(df):
    """ Returns the feature matrices (train, val, test) as numpy arrays.

        Rows are selected by the 'split' column; the metadata columns listed
        below are dropped and every remaining column is kept as a feature.
    """
    non_feature_cols = ['filename', 'id', 'conversation_line', 'emotion', 'speaker', 'split']

    def split_matrix(split_name):
        rows = df[df['split'] == split_name]
        return np.array(rows.drop(columns=non_feature_cols).values.tolist())

    return split_matrix('train'), split_matrix('val'), split_matrix('test')

def do_oversample(X, labels, type='SMOTE', strategy=None, random_state=None):
    """ Oversamples minority classes and returns the resampled (X, labels).

        X, labels    -- samples and their class labels, passed unchanged to
                        the sampler's fit_resample.
        type         -- 'SMOTE' selects SMOTE; any other value selects
                        RandomOverSampler.
        strategy     -- sampling_strategy handed to the sampler. Defaults to
                        the original hard-coded targets
                        {'disgust': 500, 'fear': 500}.
        random_state -- seed forwarded to the sampler; None (the default)
                        keeps the sampler's own unseeded behaviour.
    """
    if strategy is None:
        strategy = {'disgust': 500, 'fear': 500}

    # `type` shadows the builtin but is part of the public signature, so it
    # is kept for keyword callers.
    sampler_cls = SMOTE if type == 'SMOTE' else RandomOverSampler
    sampler = sampler_cls(sampling_strategy=strategy, random_state=random_state)
    X, labels = sampler.fit_resample(X, labels)
    return X, labels

def do_scale(X, scaler=None):
    """ Standardizes X.

        With a scaler supplied, returns only scaler.transform(X).
        Without one, fits a new StandardScaler on X and returns the pair
        (scaled_X, fitted_scaler) so the scaler can be reused on other data.
    """
    if scaler is not None:
        return scaler.transform(X)

    fitted = StandardScaler()
    return fitted.fit_transform(X), fitted

def do_pca(X, pca=None, n_comp=500):
    """ Projects X with PCA.

        With a pca object supplied, returns only pca.transform(X) and n_comp
        is ignored. Without one, fits a new PCA(n_comp) on X and returns the
        pair (pca_X, fitted_pca) so the projection can be reused.
    """
    if pca is not None:
        return pca.transform(X)

    fitted = PCA(n_comp)
    return fitted.fit_transform(X), fitted

# Text preprocessing

In [None]:
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, TFBertModel
import tensorflow as tf


def get_text_features(df):
    """ Returns the 'conversation_line' texts of the train, val and test
        splits (selected by the 'split' column) as three Python lists.
    """
    texts = []
    for split_name in ('train', 'val', 'test'):
        rows = df[df['split'] == split_name]
        texts.append(rows['conversation_line'].to_list())

    train_text, val_text, test_text = texts
    return train_text, val_text, test_text


def bert_encode_text(X_train, X_val, X_test):
    """ Encodes the three text collections with bert-base-uncased and returns
        one tensor per collection, built by concatenating the model's
        'pooler_output' for every batch along axis 0.
    """
    checkpoint = "google-bert/bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
    model = TFBertModel.from_pretrained(checkpoint)

    print('Tokenizer is fast?', tokenizer.is_fast)

    def tkn(text):
        # Every input is padded/truncated to exactly 30 tokens.
        print('*')
        return tokenizer(text, return_tensors="tf", padding='max_length', truncation=True, max_length=30)

    def encode(X):
        # Texts are run through the model in batches of 300.
        batches = tf.data.Dataset.from_tensor_slices(X).batch(300)
        pooled = []
        for batch_no, batch in enumerate(batches):
            print('batch', batch_no)
            # The dataset yields byte-string tensors; decode them back to str.
            decoded = [str(item.numpy(), 'utf-8') for item in batch]
            encoded = tkn(decoded)
            pooled.append(model(encoded)['pooler_output'])
        return tf.concat(pooled, axis=0)

    train_encoded = encode(X_train)
    val_encoded = encode(X_val)
    test_encoded = encode(X_test)
    return train_encoded, val_encoded, test_encoded

# Make dataset

In [None]:
import json
import logging
import os
from pathlib import Path

import opensmile
import pandas as pd


def get_data_from_json(json_convo, convo_idx):
    """ Extracts text, emotion, file name, and speaker labels from
        a json conversation.

        json_convo -- sequence of utterance dicts with the keys 'text',
                      'emotion', 'video_name' and 'speaker'.
        convo_idx  -- index of this conversation, used to build line ids.

        Returns five parallel lists: lines, emotions, filenames, speakers,
        ids. Each id is '<convo_idx>_<line_idx>'. The separator is required:
        plain concatenation made ids ambiguous (conversation 1 line 11 and
        conversation 11 line 1 both produced '111').
    """

    lines = []
    emotions = []
    filenames = []
    speakers = []
    ids = []
    for line_idx, utterance in enumerate(json_convo):
        lines.append(utterance['text'])
        emotions.append(utterance['emotion'])
        # drop the last four characters of the video name
        # (presumably the '.mp4' extension -- verify against the data)
        filenames.append(utterance['video_name'][:-4])
        speakers.append(utterance['speaker'])
        # label convo lines so we can still track which conversations they came from
        # and what position they have within the conversation
        ids.append(f'{convo_idx}_{line_idx}')
    return lines, emotions, filenames, speakers, ids

def extract_audio_features_from_wav(filename, split, smile):
    """ Extracts smile audio features from the wav file
        <interim_data_dir>/<split>/<filename>.wav.

        Returns None when that file does not exist; otherwise returns the
        result of smile.process_file with an added 'split' column holding
        'train', 'val' or 'test'. Reads the module-level interim_data_dir.
    """
    wav_name = filename + '.wav'
    wav_path = interim_data_dir / split / wav_name

    # The caller does not know which split a file belongs to and probes each
    # folder in turn, so a missing file is an expected outcome, not an error.
    if not os.path.isfile(wav_path):
        return None

    features = smile.process_file(wav_path)
    split_labels = {'train_wav': 'train', 'dev_wav': 'val', 'test_wav': 'test'}
    features['split'] = split_labels[split]

    logging.getLogger(__name__).info(wav_name)

    return features


def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

        Reads the module-level json_data_path, interim_data_dir and
        processed_data_dir. Loads every conversation line from the json file,
        extracts opensmile functionals for each line's wav file and writes
        both, side by side, to processed_func_data.csv.

        Lines whose wav file is missing from every split folder are kept with
        empty audio feature columns (including 'split') and a warning is
        logged. Raises ValueError when no wav file is found at all.
    """
    logger = logging.getLogger(__name__)
    logger.info('building dataset')
    logger.info('loading json data')
    text_data = json.loads(json_data_path.read_text())

    all_lines = []
    all_emotions = []
    all_filenames = []
    all_speakers = []
    all_ids = []

    for convo_idx, convo in enumerate(text_data):
        lines, emotions, filenames, speakers, ids = get_data_from_json(convo['conversation'], convo_idx)
        all_lines += lines
        all_emotions += emotions
        all_filenames += filenames
        all_speakers += speakers
        all_ids += ids

    data_dict = {'filename': all_filenames, 'id': all_ids, 'conversation_line' : all_lines, 'emotion': all_emotions, 'speaker': all_speakers}
    df = pd.DataFrame.from_dict(data_dict).astype({'id':str})

    logger.info('done loading json data')
    # rn assume wav have been extracted from mp4 to ../interim
    logger.info(f'extracting smile features from wav files in {interim_data_dir}')

    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )

    found_features = []
    found_rows = []  # position in df of each entry in found_features

    for row_idx, fname in enumerate(all_filenames):
        # the split of a file is unknown, so probe each split folder in turn
        features = None
        for split in ('train_wav', 'dev_wav', 'test_wav'):
            features = extract_audio_features_from_wav(fname, split, smile)
            if features is not None:
                break

        if features is None:
            # pd.concat silently drops None entries; appending one here would
            # shift every later feature row onto the wrong utterance.
            logger.warning('no wav file found for %s in any split; audio features left empty', fname)
            continue

        found_features.append(features)
        found_rows.append(row_idx)

    if not found_features:
        raise ValueError(f'no wav files found under {interim_data_dir}')

    audio_features_df = pd.concat(found_features, axis=0)
    # Index the features by the row of df they belong to, so the column-wise
    # concat below aligns them correctly. This assumes one feature row per
    # file; anything else fails here with a length mismatch instead of
    # silently misaligning.
    audio_features_df.index = found_rows

    logger.info('done extracting functional smile features from wav')
    logger.info('creating final dataset')

    concatenated_df = pd.concat([df, audio_features_df], axis=1)
    save_path = processed_data_dir / 'processed_func_data.csv'
    concatenated_df.to_csv(save_path, index=False)

    logger.info(f'done creating final dataset. saved to: {save_path}')


# Script entry point: configures logging, defines the data paths and runs main().
if __name__ == '__main__':
    # Log at INFO level with timestamp, logger name and level on every line.
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # The names below are module-level globals read by main() and
    # extract_audio_features_from_wav(). They are only defined when this file
    # is run as a script, so importing the module and calling main() directly
    # fails with NameError.
    # parents[2] is the directory two levels above the one containing this file.
    project_dir = Path(__file__).resolve().parents[2]
    raw_data_dir = project_dir / 'data' / 'raw'
    interim_data_dir = project_dir / 'data' / 'interim'
    processed_data_dir = project_dir / 'data' / 'processed'
    json_data_path = raw_data_dir / 'Subtask_2_train.json'

    main()