# EmoBERT
This notebook detects the emotion of every utterance in the dialogue dataset. The classifier is built on BERT, a language model widely used in natural language processing. More details can be found [HERE](https://github.com/anuradha1992/EmpatheticIntents.git).

In [None]:
import tensorflow as tf

# Turn on eager execution through the TF1 compatibility API so that tensors
# hold concrete values; later cells call `.numpy()` on op results.
tf.compat.v1.enable_eager_execution()

In [None]:
# After eager execution is enabled, operations are executed as they are
# defined and Tensor objects hold concrete values, which can be accessed as
# `numpy.ndarray`s through the numpy() method.
# Sanity check: the product must be available immediately as a concrete value.
assert tf.multiply(6, 7).numpy() == 42

print(tf.multiply(6, 7).numpy())

42


In [None]:
# Report which GPUs TensorFlow can see and whether this build has CUDA support.
print(tf.config.list_physical_devices('GPU'))
# `tf.test.is_gpu_available()` is deprecated; its own deprecation notice says
# to query the physical device list instead, so derive the boolean from that.
print(len(tf.config.list_physical_devices('GPU')) > 0)
print(tf.test.is_built_with_cuda())

[]
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
False
True


In [None]:
# The 32 emotion labels.
emotions = [
    'afraid', 'angry', 'annoyed', 'anticipating',
    'anxious', 'apprehensive', 'ashamed', 'caring',
    'confident', 'content', 'devastated', 'disappointed',
    'disgusted', 'embarrassed', 'excited', 'faithful',
    'furious', 'grateful', 'guilty', 'hopeful',
    'impressed', 'jealous', 'joyful', 'lonely',
    'nostalgic', 'prepared', 'proud', 'sad',
    'sentimental', 'surprised', 'terrified', 'trusting',
]

# Full 41-entry label list: the 32 emotions above, in the same order,
# followed by 9 additional labels. Later cells index this list with the
# classifier's output index.
ED_emotions = emotions + [
    'agreeing', 'acknowledging', 'encouraging',
    'consoling', 'sympathizing', 'suggesting',
    'questioning', 'wishing', 'neutral',
]

# Google Drive working directory (Colab mount path).
path = '/content/gdrive/My Drive/Colab/'

In [None]:
import math

# Multi-Head Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to `d_model` features, split
    into `num_heads` heads of `d_model // num_heads` features, attended per
    head, then concatenated and passed through a final dense projection.
    `d_model` must be divisible by `num_heads`.
    """

    def __init__(self, d_model, num_heads, dropout_rate, name = 'multi_head_attention'):
        super().__init__(name = name)

        self.num_heads = num_heads
        self.d_model = d_model

        # Every head must receive the same number of features.
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        # Input projections for query, key and value.
        self.wq = tf.keras.layers.Dense(d_model, name = 'query')
        self.wk = tf.keras.layers.Dense(d_model, name = 'key')
        self.wv = tf.keras.layers.Dense(d_model, name = 'value')

        # Dropout on the attention weights, and the output projection.
        self.dropout = tf.keras.layers.Dropout(dropout_rate, name = 'mha_dropout')
        self.dense = tf.keras.layers.Dense(d_model, name = 'mha_output')

    def split_heads(self, x, batch_size):
        """
        Reshape the last dimension of `x` into (num_heads, depth) and move the
        head axis forward, giving shape (batch_size, num_heads, seq_len, depth).
        """
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm = [0, 2, 1, 3])

    def scaled_dot_product_attention(self, q, k, v, mask):
        """
        Compute softmax(q @ k^T / sqrt(depth) + mask * -1e9) @ v.

        Args:
            q: query shape == (..., seq_len_q, depth)
            k: key shape == (..., seq_len_k, depth)
            v: value shape == (..., seq_len_v, depth_v), seq_len_v == seq_len_k
            mask: Float tensor broadcastable to (..., seq_len_q, seq_len_k),
                or None. Positions where the mask is 1 receive a large
                negative logit.

        Returns:
            output, attention_weights
        """
        # Raw similarity scores, (..., seq_len_q, seq_len_k).
        scores = tf.matmul(q, k, transpose_b = True)

        # Scale by the square root of the key depth.
        key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
        scores = scores / tf.math.sqrt(key_depth)

        # Push masked positions towards zero probability.
        if mask is not None:
            scores = scores + (mask * -1e9)

        # Normalize over the key axis so each query's weights sum to 1.
        weights = tf.nn.softmax(scores, axis = -1)

        # Dropout is applied to the attention weights themselves, i.e. whole
        # attended-to tokens are dropped (as in the original Transformer,
        # according to the RoBERTa implementation).
        weights = self.dropout(weights)

        attended = tf.matmul(weights, v)  # (..., seq_len_q, depth_v)
        return attended, weights

    def call(self, v, k, q, mask):
        """Apply multi-head attention; note the argument order (v, k, q, mask).

        Returns:
            (output, attention_weights) with shapes
            (batch_size, seq_len_q, d_model) and
            (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # Project, then split into heads: (batch_size, num_heads, seq_len, depth).
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        per_head_output, attention_weights = self.scaled_dot_product_attention(
            q, k, v, mask)

        # Back to (batch_size, seq_len_q, num_heads, depth), then merge heads.
        per_head_output = tf.transpose(per_head_output, perm = [0, 2, 1, 3])
        merged = tf.reshape(per_head_output, (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights


def gelu(x):
    """
    Exact (erf-based) GELU activation: x * 0.5 * (1 + erf(x / sqrt(2))).

    OpenAI GPT uses a slightly different tanh approximation that gives
    slightly different results:
        0.5 * x * (1 + tf.math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * tf.math.pow(x, 3))))
    See https://arxiv.org/abs/1606.08415
    """
    erf_term = tf.math.erf(x / math.sqrt(2.0))
    return x * 0.5 * (1.0 + erf_term)

# Lookup table from an activation name (the `hidden_act` setting) to the
# callable used as the feed-forward hidden activation.
act_funcs = {'gelu': gelu, 'relu': tf.nn.relu}

# Pointwise Feed Forward Network
def point_wise_feed_forward_network(d_model, dff, hidden_act):
    """Build the two-layer position-wise feed-forward sub-network.

    Args:
        d_model: output width of the network.
        dff: width of the hidden layer.
        hidden_act: key into `act_funcs` selecting the hidden activation.

    Returns:
        A `tf.keras.Sequential` named 'ff_network'.
    """
    hidden = tf.keras.layers.Dense(dff, activation = act_funcs[hidden_act],
        name = 'ff_hidden')  # (batch_size, seq_len, dff)
    projection = tf.keras.layers.Dense(d_model, name = 'ff_output')  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([hidden, projection], name = 'ff_network')


# Encoder Layer
class EncoderLayer(tf.keras.layers.Layer):
    """One Transformer encoder block.

    Self-attention and a position-wise feed-forward network, each followed by
    dropout, a residual connection and layer normalization (post-norm).
    The layer is named 'encoder_layer_NN' from `layer_num`.
    """

    def __init__(self, d_model, num_heads, dff, hidden_act, dropout_rate, layer_norm_eps, layer_num):
        super().__init__(name = 'encoder_layer_{:02d}'.format(layer_num))

        # Sub-layers.
        self.mha = MultiHeadAttention(d_model, num_heads, dropout_rate)
        self.ffn = point_wise_feed_forward_network(d_model, dff, hidden_act)

        # One normalization per residual branch.
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon = layer_norm_eps,
            name = 'layernorm_1')
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon = layer_norm_eps,
            name = 'layernorm_2')

        # One dropout per residual branch.
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate, name = 'dropout_1')
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate, name = 'dropout_2')

    def call(self, x, training, mask):
        """Run the block on `x` of shape (batch_size, input_seq_len, d_model).

        `training` is forwarded to the two dropout layers; `mask` is forwarded
        to the attention sub-layer. The output has the same shape as `x`.
        """
        # Self-attention branch: v, k and q are all `x`.
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training = training)
        after_attention = self.layernorm1(x + attended)

        # Feed-forward branch.
        transformed = self.ffn(after_attention)
        transformed = self.dropout2(transformed, training = training)
        return self.layernorm2(after_attention + transformed)

In [None]:
def loss_function(real_emot, pred_emot):
    """Per-example sparse categorical cross-entropy.

    `pred_emot` is treated as logits (`from_logits = True`) and no reduction
    is applied, so one loss value is returned per example.
    """
    per_example_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits = True, reduction = 'none')
    return per_example_loss(real_emot, pred_emot)

class EmoBERT(tf.keras.Model):
    """Transformer encoder with an attention-pooling classification head.

    Token ids are embedded (word + position embeddings), normalized, passed
    through `num_layers` encoder layers, pooled into a single vector with a
    learned attention over non-padding positions, and mapped to
    `num_emotions` logits.

    Args:
        num_layers: number of encoder layers.
        d_model: hidden size.
        num_heads: attention heads per encoder layer.
        dff: feed-forward hidden width.
        hidden_act: key into `act_funcs` ('gelu' or 'relu').
        dropout_rate: dropout probability used throughout.
        layer_norm_eps: epsilon for every LayerNormalization.
        max_position_embed: size of the position-embedding table. Position
            ids start at `padding_idx + 1`, so it must be at least
            `seq_len + padding_idx + 1`.
        vocab_size: size of the word-embedding table.
        num_emotions: number of output classes.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, hidden_act, dropout_rate,
                 layer_norm_eps, max_position_embed, vocab_size, num_emotions):
        super().__init__(name = 'emo_bert')

        # Token id used for padding; position ids are offset past it.
        self.padding_idx = 1

        # Embedding layers
        self.word_embeddings = tf.keras.layers.Embedding(vocab_size, d_model, name = 'word_embed')
        self.pos_embeddings = tf.keras.layers.Embedding(max_position_embed, d_model, name = 'pos_embed')
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon = layer_norm_eps,
            name = 'layernorm_embed')
        self.dropout = tf.keras.layers.Dropout(dropout_rate, name = 'dropout_embed')

        # Encoder layers
        self.num_layers = num_layers
        self.enc_layers = [
            EncoderLayer(d_model, num_heads, dff, hidden_act, dropout_rate, layer_norm_eps, i)
            for i in range(num_layers)
        ]

        # Output layers
        self.attention_v = tf.keras.layers.Dense(1, use_bias = False, name = 'attention_v')
        self.attention_layer = tf.keras.layers.Dense(d_model, activation = 'tanh', name = 'attention_layer')
        self.hidden_layer = tf.keras.layers.Dense(d_model, activation = 'tanh', name = 'hidden_layer')
        self.output_layer = tf.keras.layers.Dense(num_emotions, name = 'output_layer')

    def call(self, x, training, mask):
        """Compute emotion logits.

        Args:
            x: token ids, shape (batch_size, seq_len).
            training: forwarded to the dropout layers.
            mask: padding mask of shape (batch_size, 1, 1, seq_len) with 1.0
                at padding positions, as produced by `create_padding_mask`.

        Returns:
            Logits of shape (batch_size, num_emotions).
        """
        seq_len = tf.shape(x)[1]

        # Add word embedding and position embedding. Position ids run from
        # padding_idx + 1 upwards and are shared by every row of the batch.
        pos = tf.range(self.padding_idx + 1, seq_len + self.padding_idx + 1)
        pos = tf.broadcast_to(pos, tf.shape(x))
        x = self.word_embeddings(x)  # (batch_size, seq_len, d_model)
        x += self.pos_embeddings(pos)

        x = self.layernorm(x)
        x = self.dropout(x, training = training)

        # x.shape == (batch_size, seq_len, d_model)
        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        # Compute the attention scores
        projected = self.attention_layer(x)  # (batch_size, seq_len, d_model)
        logits = tf.squeeze(self.attention_v(projected), 2)  # (batch_size, seq_len)
        # Mask out the padding positions. Squeeze only the two broadcast axes:
        # an unqualified squeeze would also drop the batch or sequence axis
        # whenever that axis has length 1.
        logits += (tf.squeeze(mask, axis = [1, 2]) * -1e9)
        scores = tf.expand_dims(tf.nn.softmax(logits), 1)  # (batch_size, 1, seq_len)

        # Attention-weighted sum over positions: (batch_size, d_model)
        x = tf.squeeze(tf.matmul(scores, x), 1)

        x = self.hidden_layer(x)
        x = self.output_layer(x)

        return x  # (batch_size, num_emotions)

In [None]:
# Masking
def create_padding_mask(seq):
    """Return a float mask that is 1.0 where `seq` holds the padding id.

    The padding id is 1, for consistency with RoBERTa. Two singleton axes are
    inserted so the result, of shape (batch_size, 1, 1, seq_len), can be
    added to the attention logits by broadcasting.
    """
    is_padding = tf.math.equal(seq, 1)
    pad_mask = tf.cast(is_padding, tf.float32)
    return pad_mask[:, tf.newaxis, tf.newaxis, :]

def create_masks(inp):
    """Return the encoder padding mask for the token-id batch `inp`."""
    return create_padding_mask(inp)

def build_model(model, max_length, vocab_size):
    """Run one dummy forward pass through `model` (result discarded).

    The dummy input is a single row of length `max_length`: the first half
    holds random token ids in [2, vocab_size), the second half holds the
    padding id 1. The model is called with training set to True.
    """
    half = max_length//2
    dummy_ids = np.ones((1, max_length), dtype = np.int32)
    dummy_ids[0, :half] = np.random.randint(2, vocab_size, size = half)
    dummy_ids = tf.constant(dummy_ids)
    _ = model(dummy_ids, True, create_masks(dummy_ids))

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up to `peak_lr`, then linear decay.

    The rate is peak_lr * min(step / warmup_steps,
    (total_steps - step) / (total_steps - warmup_steps)), so it reaches
    `peak_lr` at `warmup_steps` and zero at `total_steps`.
    """

    def __init__(self, peak_lr, total_steps, warmup_steps):
        super().__init__()

        self.peak_lr = peak_lr
        self.total_steps = total_steps
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for the given optimizer `step`."""
        warmup_fraction = step / self.warmup_steps
        decay_fraction = (self.total_steps - step) / (self.total_steps - self.warmup_steps)
        return self.peak_lr * tf.math.minimum(warmup_fraction, decay_fraction)

In [None]:
!pip install pytorch-transformers

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 13.6MB/s eta 0:00:01
Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/28/78/4067ce89180daf0b2027df4b3e4c4734d73b99c3a664d262a4c4d5ac1021/boto3-1.16.47-py2.py3-none-any.whl (130kB)
[K     |████████████████████████████████| 133kB 26.2MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 27.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |

In [None]:
import csv
import numpy as np
from pytorch_transformers import RobertaTokenizer

# Model architecture settings, passed to EmoBERT below.
num_layers = 12
d_model = 768
num_heads = 12
dff = d_model * 4  # Feed-forward hidden width
hidden_act = 'gelu'  # Use 'gelu' or 'relu'
dropout_rate = 0.1
layer_norm_eps = 1e-5
max_position_embed = 514  # Size of the position-embedding table
num_emotions = 41  # Number of emotion categories

# The vocabulary (and therefore the word-embedding size) comes from the
# pretrained 'roberta-base' tokenizer.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
vocab_size = tokenizer.vocab_size

# Input length and training-related settings. Below, the learning-rate
# schedule uses peak_lr / total_steps / warmup_steps and the optimizer uses
# the adam_* values.
max_length = 100  # Maximum number of tokens
buffer_size = 100000
batch_size = 1
num_epochs = 10
peak_lr = 2e-5
total_steps = 7000
warmup_steps = 700
adam_beta_1 = 0.9
adam_beta_2 = 0.98
adam_epsilon = 1e-6

checkpoint_path = 'emobert-checkpoints'  # Need to replace this with correct checkpoint path

# Token ids for the sequence start / end markers: the first id produced by
# encoding the literal strings '<s>' and '</s>'.
SOS_ID = tokenizer.encode('<s>')[0]
EOS_ID = tokenizer.encode('</s>')[0]

emobert = EmoBERT(num_layers, d_model, num_heads, dff, hidden_act, dropout_rate,
            layer_norm_eps, max_position_embed, vocab_size, num_emotions)

# Run one dummy forward pass through the model before any checkpoint handling.
build_model(emobert, max_length, vocab_size)

# Adam optimizer driven by the warm-up / linear-decay schedule.
learning_rate = CustomSchedule(peak_lr, total_steps, warmup_steps)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1 = adam_beta_1, beta_2 = adam_beta_2,
            epsilon = adam_epsilon)
#train_loss = tf.keras.metrics.Mean(name = 'train_loss')

# Define the checkpoint manager.
ckpt = tf.train.Checkpoint(model = emobert, optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = None)

# Show the most recent checkpoint found under `checkpoint_path`.
print(ckpt_manager.latest_checkpoint)

# Restore the checkpoint at epoch 5 (index 4 in the manager's list).
# Earlier runs used epoch 8 (v1) and epoch 3 (v2).
# The restore call was previously commented out while the message below still
# claimed success, which left the model with its random initial weights.
print(ckpt_manager.checkpoints[4])
# expect_partial(): the checkpoint also stores optimizer state, which is not
# needed for inference, so unmatched values should not be reported.
ckpt.restore(ckpt_manager.checkpoints[4]).expect_partial()
print('Checkpoint at epoch 5 restored!!')

100%|██████████| 898823/898823 [00:00<00:00, 1736669.87B/s]
100%|██████████| 456318/456318 [00:00<00:00, 1319010.10B/s]


emobert-checkpoints/ckpt-10
emobert-checkpoints/ckpt-5
Checkpoint at epoch 5 restored!!


In [None]:
import tqdm
import numpy as np

def predict_emotion(uttrs):
    """Return the class-probability vector for the first utterance in `uttrs`.

    Only `uttrs[0]` is classified; any further elements are ignored. The
    utterance is tokenized, wrapped in the start/end token ids, truncated so
    the whole sequence fits in `max_length`, and right-padded with the
    padding id 1.

    Args:
        uttrs: non-empty sequence of utterance strings.

    Returns:
        1-D numpy array holding the softmax of the model's logits.

    Raises:
        IndexError: if `uttrs` is empty.
    """
    # A single row pre-filled with the padding id (1), which is the value
    # that create_padding_mask treats as padding.
    uttr_ids = np.ones((1, max_length), dtype = np.int32)
    u_ids = [SOS_ID] + tokenizer.encode(uttrs[0])[:(max_length-2)] + [EOS_ID]
    uttr_ids[0, :len(u_ids)] = u_ids

    inp = tf.constant(uttr_ids)
    enc_padding_mask = create_masks(inp)
    # Second argument False: run the model in inference mode.
    pred = emobert(inp, False, enc_padding_mask)
    pred = tf.nn.softmax(pred).numpy()

    return pred[0]

In [None]:
# Sample dialogue used below to spot-check the classifier, one utterance per entry.
dialog = ["I 've got a bad feeling about this .",
"You see that dividing line ?  You dare stand near that dividing line ?",
"It looks dangerous .",
"Whimp !",
"Uh , you see ?  If you don 't go pass that line , there 's no problem .",
"Still dangerous .  I think we better go home .",
"You can 't .  Once you 've come in here , you cannot leave .  You must show your courage first .",
"And why do we have to test it ?",
"Because you are Rusty .  And rusty kids are losers .",
"And do you dare to ?",
"I do .  In that case , we go together .  Whoever gets closer , that 's the winner , and the loser must call the winner , \" Master \" .  Got it .  I think we should go be brave somewhere else .",
"You have to come with us .",
"I don 't want to be anyone 's master .",
"You sissy .",
"Hey !  Are you cheating ?",
"I 'm closer .  I 'm closer .  I 'm closer .  I 'm closer .  I 'm closer .  I 'm closer .  I 'm closer .  I 'm closer .  I 'm closer , closer , closer ..."]


# For each utterance, print the text, then its 32 most probable labels
# (most probable first), then the same labels with their probabilities.
for utterance in dialog:
    pred = predict_emotion([utterance])

    arr = np.array(pred)
    # Indices of the 32 largest probabilities, in descending order.
    indices = arr.argsort()[-32:][::-1]

    label_pred = [ED_emotions[ind] for ind in indices]
    # Probabilities taken at the same indices, so they line up with the labels.
    top_probs = arr[indices]

    print(utterance)
    print_str = ""
    print_str_2 = ""
    for label, prob in zip(label_pred, top_probs):
        print_str += label + " (" + str(prob) + "), "
        print_str_2 += label + ", "
    print(print_str_2)
    print(print_str)
    print()

I 've got a bad feeling about this .
prepared, confident, agreeing, apprehensive, sad, sympathizing, anticipating, jealous, trusting, wishing, embarrassed, sentimental, disgusted, disappointed, furious, suggesting, terrified, content, encouraging, angry, impressed, devastated, anxious, ashamed, joyful, annoyed, faithful, guilty, afraid, caring, neutral, excited, 
prepared (0.1148149), confident (0.07197149), agreeing (0.046944674), apprehensive (0.04430049), sad (0.037095327), sympathizing (0.037022237), anticipating (0.0356097), jealous (0.033728626), trusting (0.028143365), wishing (0.027673488), embarrassed (0.027670458), sentimental (0.02680548), disgusted (0.026611352), disappointed (0.026011076), furious (0.025100425), suggesting (0.02333053), terrified (0.022534167), content (0.02207666), encouraging (0.021363156), angry (0.02080253), impressed (0.020738767), devastated (0.019967636), anxious (0.019266138), ashamed (0.018409066), joyful (0.018131724), annoyed (0.017553777), fait

### Load Conversation Data

In [None]:
!pip install swifter

Collecting swifter
[?25l  Downloading https://files.pythonhosted.org/packages/f4/3b/04bf42b94a22725241b47e0256458cde11f86f97572dd824e011f1ea8b20/swifter-1.0.7.tar.gz (633kB)
[K     |████████████████████████████████| 634kB 12.8MB/s eta 0:00:01
Collecting psutil>=5.6.6
[?25l  Downloading https://files.pythonhosted.org/packages/da/82/56cd16a4c5f53e3e5dd7b2c30d5c803e124f218ebb644ca9c30bc907eadd/psutil-5.8.0-cp36-cp36m-manylinux2010_x86_64.whl (291kB)
[K     |████████████████████████████████| 296kB 24.7MB/s 
Collecting modin[ray]>=0.8.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/ab/a9/ead212fa94de8f14459e22b0604df9c84ff704e986b58e70396ba47668f2/modin-0.8.2-py3-none-manylinux1_x86_64.whl (533kB)
[K     |████████████████████████████████| 542kB 31.5MB/s 
Collecting fsspec>=0.6.0; extra == "dataframe"
[?25l  Downloading https://files.pythonhosted.org/packages/ec/80/72ac0982cc833945fada4b76c52f0f65435ba4d53bc9317d1c70b5f7e7d5/fsspec-0.8.5-py3-none-any.whl (98kB)
[K     

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
import multiprocess as mp

%matplotlib inline

In [None]:
# load dyadic data
# Load the dyadic conversation tables, one CSV per source. Each file is read
# with '\n' as the explicit line terminator.
sad_dyadic_convs_clean = pd.read_csv('sad_dyadic_convs_clean_sentiment.csv', lineterminator='\n')
depression_dyadic_convs_clean = pd.read_csv('depression_dyadic_convs_clean_sentiment.csv', lineterminator='\n')
depressed_dyadic_convs_clean = pd.read_csv('depressed_dyadic_convs_clean_sentiment.csv', lineterminator='\n')
offmychest_dyadic_convs_clean = pd.read_csv('offmychest_dyadic_convs_clean_sentiment.csv', lineterminator='\n')
suicidewatch_dyadic_convs_clean = pd.read_csv('suicidewatch_dyadic_convs_clean_sentiment.csv', lineterminator='\n')
depression_help_dyadic_convs_clean = pd.read_csv('depression_help_dyadic_convs_clean_sentiment.csv', lineterminator='\n')
anxietyhelp_dyadic_convs_clean = pd.read_csv('anxietyhelp_dyadic_convs_clean_sentiment.csv', lineterminator='\n')
mentalhealthsupport_dyadic_convs_clean = pd.read_csv('mentalhealthsupport_dyadic_convs_clean_sentiment.csv', lineterminator='\n')

In [None]:
# load multiparty data
# Load the multiparty conversation tables, one CSV per source. Each file is
# read with '\n' as the explicit line terminator.
sad_multi_convs_clean = pd.read_csv('sad_multi_convs_clean_sentiment.csv', lineterminator='\n')
depression_multi_convs_clean = pd.read_csv('depression_multi_convs_clean_sentiment.csv', lineterminator='\n')
depressed_multi_convs_clean = pd.read_csv('depressed_multi_convs_clean_sentiment.csv', lineterminator='\n')
offmychest_multi_convs_clean = pd.read_csv('offmychest_multi_convs_clean_sentiment.csv', lineterminator='\n')
suicidewatch_multi_convs_clean = pd.read_csv('suicidewatch_multi_convs_clean_sentiment.csv', lineterminator='\n')
depression_help_multi_convs_clean = pd.read_csv('depression_help_multi_convs_clean_sentiment.csv', lineterminator='\n')
anxietyhelp_multi_convs_clean = pd.read_csv('anxietyhelp_multi_convs_clean_sentiment.csv', lineterminator='\n')
mentalhealthsupport_multi_convs_clean = pd.read_csv('mentalhealthsupport_multi_convs_clean_sentiment.csv', lineterminator='\n')

In [None]:
def emobert_predict_emotion(text):
    """Return the most probable label for `text`.

    Runs `predict_emotion` on the single utterance and maps the index of the
    largest probability to its name in `ED_emotions`.
    """
    # predict emotion using emoBERT model
    arr = np.array(predict_emotion([text]))

    # The last entry of the ascending argsort is the index of the largest
    # probability; no full sort of the values or top-32 slice is needed.
    return ED_emotions[arr.argsort()[-1]]

def apply_parallel(grouped_df, func):
    """Apply `func` to every group of `grouped_df` in parallel and concatenate.

    One joblib worker is requested per CPU reported by `mp.cpu_count()`.
    `grouped_df` must yield (key, group) pairs; the keys are ignored.
    """
    n_workers = mp.cpu_count()
    tasks = (delayed(func)(group) for _, group in grouped_df)
    results = Parallel(n_jobs = n_workers)(tasks)
    return pd.concat(results)

def emobert_predict_emotion_df(df_convs):
    """Add an 'emotion prediction' column to `df_convs` and return it.

    The 'text' column is converted to `str` and each value is classified with
    `emobert_predict_emotion`. The frame is modified in place; the returned
    object is the same frame that was passed in.
    """
    # NOTE(review): the `.swifter` accessor is only available once the
    # `swifter` package has been imported; no such import is visible in this
    # file, so confirm it happens elsewhere.
    df_convs['text'] = df_convs['text'].astype(str)
    predictions = df_convs['text'].swifter.apply(emobert_predict_emotion)
    df_convs['emotion prediction'] = predictions

    return df_convs

def emobert_predict_emotion_df_parallel(df_convs):
    """Classify `df_convs` one conversation at a time, in parallel.

    Rows are grouped by the 'conversation id' column, each group is passed to
    `emobert_predict_emotion_df` through `apply_parallel`, and the
    concatenated result is returned.
    """
    by_conversation = df_convs.groupby(df_convs['conversation id'])
    return apply_parallel(by_conversation, emobert_predict_emotion_df)

In [None]:
# emotion prediction (dyadic)
# Emotion prediction for the dyadic tables. emobert_predict_emotion_df adds
# the 'emotion prediction' column in place and returns the same frame, so
# each *_emotion name refers to the same object as its input table.
sad_dyadic_convs_clean_emotion = emobert_predict_emotion_df(sad_dyadic_convs_clean)
depression_dyadic_convs_clean_emotion = emobert_predict_emotion_df(depression_dyadic_convs_clean)
depressed_dyadic_convs_clean_emotion = emobert_predict_emotion_df(depressed_dyadic_convs_clean)
offmychest_dyadic_convs_clean_emotion = emobert_predict_emotion_df(offmychest_dyadic_convs_clean)
suicidewatch_dyadic_convs_clean_emotion = emobert_predict_emotion_df(suicidewatch_dyadic_convs_clean)
depression_help_dyadic_convs_clean_emotion = emobert_predict_emotion_df(depression_help_dyadic_convs_clean)
anxietyhelp_dyadic_convs_clean_emotion = emobert_predict_emotion_df(anxietyhelp_dyadic_convs_clean)
mentalhealthsupport_dyadic_convs_clean_emotion = emobert_predict_emotion_df(mentalhealthsupport_dyadic_convs_clean)

In [None]:
# emotion prediction (multiparty)
# Emotion prediction for the multiparty tables. emobert_predict_emotion_df
# adds the 'emotion prediction' column in place and returns the same frame,
# so each *_emotion name refers to the same object as its input table.
sad_multi_convs_clean_emotion = emobert_predict_emotion_df(sad_multi_convs_clean)
depression_multi_convs_clean_emotion = emobert_predict_emotion_df(depression_multi_convs_clean)
depressed_multi_convs_clean_emotion = emobert_predict_emotion_df(depressed_multi_convs_clean)
offmychest_multi_convs_clean_emotion = emobert_predict_emotion_df(offmychest_multi_convs_clean)
suicidewatch_multi_convs_clean_emotion = emobert_predict_emotion_df(suicidewatch_multi_convs_clean)
depression_help_multi_convs_clean_emotion = emobert_predict_emotion_df(depression_help_multi_convs_clean)
anxietyhelp_multi_convs_clean_emotion = emobert_predict_emotion_df(anxietyhelp_multi_convs_clean)
mentalhealthsupport_multi_convs_clean_emotion = emobert_predict_emotion_df(mentalhealthsupport_multi_convs_clean)

In [None]:
# save (dyadic)
# Save the annotated dyadic tables to CSV in the working directory, without
# writing the DataFrame index as a column.
sad_dyadic_convs_clean_emotion.to_csv('sad_dyadic_convs_clean_emotion.csv', index = False)
depression_dyadic_convs_clean_emotion.to_csv('depression_dyadic_convs_clean_emotion.csv', index = False)
depressed_dyadic_convs_clean_emotion.to_csv('depressed_dyadic_convs_clean_emotion.csv', index = False)
offmychest_dyadic_convs_clean_emotion.to_csv('offmychest_dyadic_convs_clean_emotion.csv', index = False)
suicidewatch_dyadic_convs_clean_emotion.to_csv('suicidewatch_dyadic_convs_clean_emotion.csv', index = False)
depression_help_dyadic_convs_clean_emotion.to_csv('depression_help_dyadic_convs_clean_emotion.csv', index = False)
anxietyhelp_dyadic_convs_clean_emotion.to_csv('anxietyhelp_dyadic_convs_clean_emotion.csv', index = False)
mentalhealthsupport_dyadic_convs_clean_emotion.to_csv('mentalhealthsupport_dyadic_convs_clean_emotion.csv', index = False)

In [None]:
# save (multiparty)
# Save the annotated multiparty tables to CSV in the working directory,
# without writing the DataFrame index as a column.
sad_multi_convs_clean_emotion.to_csv('sad_multi_convs_clean_emotion.csv', index = False)
depression_multi_convs_clean_emotion.to_csv('depression_multi_convs_clean_emotion.csv', index = False)
depressed_multi_convs_clean_emotion.to_csv('depressed_multi_convs_clean_emotion.csv', index = False)
offmychest_multi_convs_clean_emotion.to_csv('offmychest_multi_convs_clean_emotion.csv', index = False)
suicidewatch_multi_convs_clean_emotion.to_csv('suicidewatch_multi_convs_clean_emotion.csv', index = False)
depression_help_multi_convs_clean_emotion.to_csv('depression_help_multi_convs_clean_emotion.csv', index = False)
anxietyhelp_multi_convs_clean_emotion.to_csv('anxietyhelp_multi_convs_clean_emotion.csv', index = False)
mentalhealthsupport_multi_convs_clean_emotion.to_csv('mentalhealthsupport_multi_convs_clean_emotion.csv', index = False)