# **Emotion-Ease**
### Get the datasets and structure them as required


In [1]:
! cd /content
# go-emotions dataset
print("Downloading Goemotions Dataset.......\n")
! gsutil cp -r gs://gresearch/goemotions/data/full_dataset/ .

# sarcasm dataset <intended, percieved and more...>

print("\n\n\nDownloading iSarcasm Dataset......\n")
! wget https://raw.githubusercontent.com/AmirAbaskohi/SemEval2022-Task6-Sarcasm-Detection/main/Data/Train_Dataset.csv
! mv "/content/Train_Dataset.csv" "/content/full_dataset"

! wget https://raw.githubusercontent.com/AmirAbaskohi/SemEval2022-Task6-Sarcasm-Detection/main/Data/Test_Dataset.csv
! mv "/content/Test_Dataset.csv" "/content/full_dataset"

print("\n\n\nDownloading News Headline Sarcasm Dataset......\n")
! wget https://raw.githubusercontent.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection/master/Sarcasm_Headlines_Dataset.json
! mv "/content/Sarcasm_Headlines_Dataset.json" "/content/full_dataset"


print("\n\n\nDownloading Reddit Sarcasm Dataset......\n")
! wget https://raw.githubusercontent.com/NeerajG03/EmotionEase_AndroidService/main/Data/reddit_sarcasm_raw/GEN-sarc-notsarc.csv -P /content/full_dataset/
! wget https://raw.githubusercontent.com/NeerajG03/EmotionEase_AndroidService/main/Data/reddit_sarcasm_raw/HYP-sarc-notsarc.csv -P /content/full_dataset/
! wget https://raw.githubusercontent.com/NeerajG03/EmotionEase_AndroidService/main/Data/reddit_sarcasm_raw/RQ-sarc-notsarc.csv -P  /content/full_dataset/



Downloading Goemotions Dataset.......

Copying gs://gresearch/goemotions/data/full_dataset/goemotions_1.csv...
Copying gs://gresearch/goemotions/data/full_dataset/goemotions_2.csv...
Copying gs://gresearch/goemotions/data/full_dataset/goemotions_3.csv...
/ [3 files][ 40.8 MiB/ 40.8 MiB]                                                
Operation completed over 3 objects/40.8 MiB.                                     



Downloading iSarcasm Dataset......

--2023-04-23 07:20:55--  https://raw.githubusercontent.com/AmirAbaskohi/SemEval2022-Task6-Sarcasm-Detection/main/Data/Train_Dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 648874 (634K) [text/plain]
Saving to: ‘Train_Dataset.csv’


2023-04-23 07:20:56 (41.2 MB/s) - ‘Train_Dataset.csv’ saved [648874

In [2]:
! mkdir /content/raw_datasets

### Dataset Preparation for Emotion Detection

In [3]:
import pandas as pd

# Collapse the one-hot label columns of a dataset CSV into a single 'class' column
def create_label(dataset):
  """Write a two-column copy of ``/content/full_dataset/<dataset>``.

  The 'class' value of each row is the name of the column holding the row
  maximum among columns 9 onward (the first such column on ties). Rows whose
  'example_very_unclear' value is True are removed, and only the first column
  plus the new 'class' column are written to
  ``/content/full_dataset/w-class_<dataset>``.

  Assumes column 8 is 'example_very_unclear' and columns 9+ are the one-hot
  label columns -- TODO confirm against the CSV schema.
  """
  frame = pd.read_csv(f'/content/full_dataset/{dataset}')

  # Column name of the row-wise maximum over the label columns.
  frame['class'] = frame.iloc[:, 9:].idxmax(axis=1)

  # Report how many rows are flagged clear / unclear before filtering.
  unclear_flag = frame.iloc[:, 8]
  false_rows = frame[unclear_flag == False]
  true_rows = frame[unclear_flag == True]
  print(f"There are {len(false_rows)} rows with 'false'  and {len(true_rows)} 'true' value in the unclear column in {dataset} dataset.")

  # Keep everything that is not explicitly flagged unclear, then first + last column.
  keep = frame['example_very_unclear'] != True
  frame = frame[keep].iloc[:, [0, -1]]
  frame.to_csv(f'/content/full_dataset/w-class_{dataset}', index=False)

# Concatenate several CSV files row-wise into one CSV
def merge_csv_files(file_paths, output_file):
    """Read every CSV in ``file_paths``, stack them and write ``output_file``.

    The combined frame gets a fresh 0..n-1 index and is written without an
    index column. Two progress lines are printed: the full output path and
    the file name (the part after the last '/').
    """
    frames = list(map(pd.read_csv, file_paths))
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(output_file, index=False)

    file_name = output_file.rsplit('/', 1)[-1]
    print(f"Merged {len(file_paths)} CSV files into {output_file}")
    print(f"Created -> {file_name}")

In [4]:
# Combine the three Reddit sarcasm CSVs into one raw dataset file.
reddit_sources = [
    '/content/full_dataset/GEN-sarc-notsarc.csv',
    '/content/full_dataset/HYP-sarc-notsarc.csv',
    '/content/full_dataset/RQ-sarc-notsarc.csv',
]
merge_csv_files(reddit_sources, '/content/raw_datasets/redditsarcasm.csv')

Merged 3 CSV files into /content/raw_datasets/redditsarcasm.csv
Created -> redditsarcasm.csv


In [5]:
# Load the merged Reddit data and spell out the labels stored in the first column.
reddit_df = pd.read_csv('/content/raw_datasets/redditsarcasm.csv')
label_names = {'sarc': 'sarcasm', 'notsarc': 'not_sarcasm'}
reddit_df.iloc[:, 0] = reddit_df.iloc[:, 0].replace(label_names)
# Keep only the third and first columns, in that order
# (presumably text then label -- verify against the CSV header).
reddit_df = reddit_df.iloc[:, [2, 0]]

In [6]:
# Label each GoEmotions shard, remembering where its labelled copy is written,
# then merge the labelled copies into a single raw dataset.
goemotion_datasets = ['goemotions_1.csv','goemotions_2.csv','goemotions_3.csv']
labelled_paths = []
for each in goemotion_datasets:
    create_label(each)
    labelled_paths.append(f'/content/full_dataset/w-class_{each}')

merge_csv_files(labelled_paths, '/content/raw_datasets/goemotions.csv')

There are 68871 rows with 'false'  and 1129 'true' value in the unclear column in goemotions_1.csv dataset.
There are 68907 rows with 'false'  and 1093 'true' value in the unclear column in goemotions_2.csv dataset.
There are 70036 rows with 'false'  and 1189 'true' value in the unclear column in goemotions_3.csv dataset.
Merged 3 CSV files into /content/raw_datasets/goemotions.csv
Created -> goemotions.csv


### Dataset Preparation for Sarcasm Detection

In [7]:
# Stack the iSarcasm train and test files into one CSV.
isarcasm_sources = [
    '/content/full_dataset/Train_Dataset.csv',
    '/content/full_dataset/Test_Dataset.csv',
]
merge_csv_files(isarcasm_sources, '/content/full_dataset/isarcasm.csv')

Merged 2 CSV files into /content/full_dataset/isarcasm.csv
Created -> isarcasm.csv


In [8]:
# Load the merged iSarcasm data and normalise it to the shared (text, class) schema.
sarcasm_df = pd.read_csv('/content/full_dataset/isarcasm.csv')
sarcasm_df = sarcasm_df.rename(columns={'tweet': 'text', 'sarcastic': 'class'})

# Address the label column by name rather than by position so the mapping
# cannot land on the wrong column if the CSV layout changes.
sarcasm_df['class'] = sarcasm_df['class'].replace({1: 'sarcasm', 0: 'not_sarcasm'})

# Replace @mentions / #hashtags (and the whitespace around them) with one space,
# then strip the ends. Without the strip, a tweet made only of tags becomes ' '
# and slips past the empty-string filter below.
sarcasm_df['text'] = (
    sarcasm_df['text']
    .str.replace(r'\s*[@#]\S*\s*', ' ', regex=True)
    .str.strip()
)

# Drop rows with no usable text: missing values (NaN != '' is True, so the
# plain comparison alone would keep them) and empty strings.
has_text = sarcasm_df['text'].notna() & (sarcasm_df['text'] != '')
sarcasm_df = sarcasm_df[has_text]
print(f'\nNumber of entries in the isarcasm dataset = {len(sarcasm_df)}')


Number of entries in the isarcasm dataset = 8334


In [13]:
# Display the cleaned iSarcasm frame (bare last expression -> rich notebook output).
sarcasm_df

Unnamed: 0,text,class
0,The only thing I got from college is a caffein...,sarcasm
1,I love it when professors draw a big question ...,sarcasm
2,Remember the hundred emails from companies whe...,sarcasm
3,Today my pop-pop told me I was not “forced” to...,sarcasm
4,"I did too, and I also reported Cancun Cruz ...",sarcasm
...,...,...
8329,I’ve just seen this and felt it deserved a Ret...,not_sarcasm
8330,Omg how an earth is that a pen !!! 🤡,not_sarcasm
8331,Bringing Kanye and drake to a tl near you,not_sarcasm
8332,"I love it when women are referred to as ""girl ...",sarcasm


In [9]:
import json

# The file holds one JSON object per line; parse each line into a dict.
with open('/content/full_dataset/Sarcasm_Headlines_Dataset.json') as f:
    data = [json.loads(line) for line in f]

In [10]:
# Build the news-headline frame in the shared (text, class) schema.
newsarcasm_df = pd.DataFrame(data)
newsarcasm_df = newsarcasm_df.rename(columns={'is_sarcastic': 'class', 'headline': 'text', 'article_link': 'link'})

# Select and order the columns by name. Positional indexing would depend on the
# key order of the JSON records and could silently relabel the wrong column.
# Selecting by name also drops 'link'.
newsarcasm_df = newsarcasm_df[['text', 'class']].copy()
newsarcasm_df['class'] = newsarcasm_df['class'].replace({1: 'sarcasm', 0: 'not_sarcasm'})
print(f'\nNumber of entries in the news-sarcasm dataset = {len(newsarcasm_df)}')


Number of entries in the news-sarcasm dataset = 28619


In [11]:
# Adding handcrafted dataset for training as well
! wget https://raw.githubusercontent.com/NeerajG03/EmotionEase_AndroidService/main/Data/custom_dataset.csv -P  /content/full_dataset/
# The custom file is read as Windows-1252 rather than UTF-8.
cust_df = pd.read_csv(f'/content/full_dataset/custom_dataset.csv', encoding='cp1252')

# Stack the four sarcasm sources into one frame with a fresh index.
# NOTE(review): concat aligns on column names, so this presumes every source
# exposes 'text' and 'class' columns -- confirm for reddit_df and cust_df.
sarcasm_final = pd.concat([sarcasm_df,newsarcasm_df, reddit_df,cust_df], ignore_index=True)
# Shuffle all rows (no random_state is given, so the order differs per run).
sarcasm_final = sarcasm_final.sample(frac=1).reset_index(drop=True)
sarcasm_final.to_csv('/content/raw_datasets/sarcasm.csv', index=False)
# sarcasm_final.to_csv('/content/raw_datasets/sarcasm.csv', index=True)

--2023-04-23 07:21:32--  https://raw.githubusercontent.com/NeerajG03/EmotionEase_AndroidService/main/Data/custom_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88862 (87K) [text/plain]
Saving to: ‘/content/full_dataset/custom_dataset.csv’


2023-04-23 07:21:32 (18.5 MB/s) - ‘/content/full_dataset/custom_dataset.csv’ saved [88862/88862]



In [12]:
# Report the row count of the combined, shuffled sarcasm dataset.
print(f'Size of the Sarcasm dataset is {len(sarcasm_final)}')

Size of the Sarcasm dataset is 47495


In [None]:
# Split the sarcasm data into training and testing files.
# The triple-quoted string below is a bare string expression with no effect;
# it only parks pip commands that are currently disabled.
''' 
# !pip install numpy ==1.21
# !pip uninstall numpy
# !pip uninstall sklearn
'''

!pip install scikit-learn --upgrade

from sklearn.model_selection import train_test_split

# Re-read the saved dataset and hold out 20% of the rows for testing.
# No random_state is passed, so the split changes on every run.
data = pd.read_csv('/content/raw_datasets/sarcasm.csv')
train_data, test_data = train_test_split(data, test_size=0.2)
train_data.to_csv('/content/raw_datasets/train_sarcasm.csv', index=False)
test_data.to_csv('/content/raw_datasets/test_sarcasm.csv', index=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Model Creation

In [None]:
# ! sudo apt -y install libportaudio2
! pip install  tflite-model-maker
! pip install --upgrade tflite-model-makery
! pip uninstall tflite_support_nightly
! pip install tflite_support_nightly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tflite-model-maker
  Downloading tflite_model_maker-0.4.2-py3-none-any.whl (577 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m577.3/577.3 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numba==0.53
  Downloading numba-0.53.0-cp39-cp39-manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece>=0.1.91
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib<3.5.0,>=3.0.3
  Downloading matplotlib-3.4.3-cp39-cp39-manylinux1_x86_64.whl (10.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m58.7 MB/s[0

In [None]:
# create directories for storing exported model data
!mkdir /content/models
!mkdir /content/models/average_word_vec
!mkdir /content/models/mobilebert_classifier

In [None]:
import numpy as np
import os

# NOTE(review): numpy is upgraded after it has already been imported above;
# the running kernel presumably keeps the old version until restarted -- confirm.
!pip install numpy --upgrade
!pip install tensorflow-addons --upgrade

# TFLite Model Maker: model specs, the text-classifier trainer and its CSV loader.
from tflite_model_maker import model_spec
from tflite_model_maker import text_classifier
from tflite_model_maker.config import ExportFormat
from tflite_model_maker.text_classifier import AverageWordVecSpec
from tflite_model_maker.text_classifier import DataLoader

# TFLite Task Library, used later to run inference on the exported model.
from tflite_support.task import core
from tflite_support.task import processor
from tflite_support.task import text

import tensorflow as tf
# Fail fast unless a TensorFlow 2.x build is installed; silence non-error logs.
assert tf.__version__.startswith('2')
tf.get_logger().setLevel('ERROR')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.8.4 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


## Building the model for Sarcasm Detection

In [None]:
# average_word_vec model -- sarcasm detection

spec = model_spec.get('average_word_vec')

# Options shared by the training and test CSV loaders.
csv_options = dict(text_column='text', label_column='class', model_spec=spec)

train_data = DataLoader.from_csv(
    filename='/content/raw_datasets/train_sarcasm.csv',
    is_training=True,
    **csv_options)
test_data = DataLoader.from_csv(
    filename='/content/raw_datasets/test_sarcasm.csv',
    is_training=False,
    **csv_options)

# Train for 10 epochs, then score on the held-out file.
model = text_classifier.create(train_data, model_spec=spec, epochs=10)
loss, acc = model.evaluate(test_data)

# export the model: first with the default format, then the label and vocab files
export_dir = '/content/models/average_word_vec/'
model.export(export_dir=export_dir)
model.export(export_dir=export_dir, export_format=[ExportFormat.LABEL, ExportFormat.VOCAB])
# model.export(export_dir='/content/models/average_word_vec/',export_format=[ExportFormat.SAVED_MODEL, ExportFormat.LABEL, ExportFormat.VOCAB])

Epoch 2/2
Epoch 3/3
Epoch 4/4
Epoch 5/5
Epoch 6/6
Epoch 7/7
Epoch 8/8
Epoch 9/9

KeyboardInterrupt: ignored

In [None]:
# mobilebert_classifier model -- sarcasm detection

from tflite_model_maker.text_classifier import BertClassifierSpec

mb_spec = model_spec.get('mobilebert_classifier')

# Options shared by the training and test CSV loaders.
csv_options = dict(text_column='text', label_column='class', model_spec=mb_spec)

train_data = DataLoader.from_csv(
    filename='/content/raw_datasets/train_sarcasm.csv',
    is_training=True,
    **csv_options)
test_data = DataLoader.from_csv(
    filename='/content/raw_datasets/test_sarcasm.csv',
    is_training=False,
    **csv_options)

# Train for 5 epochs, then score on the held-out file.
model = text_classifier.create(train_data, model_spec=mb_spec, epochs=5)
loss, acc = model.evaluate(test_data)

# export the model mobilebert_classifier: default format first, then label and vocab files
export_dir = '/content/models/mobilebert_classifier/'
model.export(export_dir=export_dir)
model.export(export_dir=export_dir, export_format=[ExportFormat.LABEL, ExportFormat.VOCAB])

In [None]:
# saving the metadata for the mobilebert_classifier model
from tflite_support.metadata_writers import nl_classifier
from tflite_support.metadata_writers import metadata_info
from tflite_support.metadata_writers import writer_utils

# NOTE(review): this alias is never used; the call below goes through
# nl_classifier.MetadataWriter directly.
NLClassifierWriter = nl_classifier.MetadataWriter
_MODEL_PATH = "/content/models/mobilebert_classifier/model.tflite"
_LABEL_FILE = "/content/models/mobilebert_classifier/labels.txt"
_VOCAB_FILE = "/content/models/mobilebert_classifier/vocab.txt"
# Delimiter pattern handed to the regex tokenizer metadata: runs of characters
# that are neither word characters nor apostrophes.
_DELIM_REGEX_PATTERN = r"[^\w\']+"
# NOTE(review): the output path has no file extension -- confirm this is intended.
_SAVE_TO_PATH = "/content/models/mobilebert_classifier/metadata"

# NOTE(review): this describes a regex-tokenized NL classifier, while the model
# was trained from the 'mobilebert_classifier' spec -- confirm the regex
# tokenizer metadata is appropriate for a BERT-style model.
writer = nl_classifier.MetadataWriter.create_for_inference(
    writer_utils.load_file(_MODEL_PATH),
    metadata_info.RegexTokenizerMd(_DELIM_REGEX_PATTERN, _VOCAB_FILE),
    [_LABEL_FILE])

print(writer.get_metadata_json())

# Persist whatever populate() returns (presumably the model with metadata embedded).
writer_utils.save_file(writer.populate(), _SAVE_TO_PATH)

In [None]:
from tflite_support.task import text

# Initialization
classifier = text.BertNLClassifier.create_from_file('/content/models/mobilebert_classifier/model.tflite')

# Run inference
sentence = "my name is skylar white yo, my husband is walter white yo! uhuh"
text_classification_result = classifier.classify(sentence)
# Keep only the scored categories of the first classification head.
text_classification_result = text_classification_result.classifications[0].categories

print('Sentence :', sentence)
# Pick the highest-scoring category directly. This works for any number of
# labels and avoids re-finding the winner through a float equality comparison.
top_category = max(text_classification_result, key=lambda category: category.score)
max_prob = top_category.score
print('Result :', top_category.category_name)

## Emotions

In [None]:
# Disabled experiment: numeric label encoding of the 'class' column.
# label the data with numeric encoding
# encoding_dict = {}

# for index,label in enumerate(data['class'].unique()):
#   encoding_dict[label] = index

# data['class'] = data['class'].map(lambda x: encoding_dict[x])

# print(data.head())
data = pd.read_csv('/content/raw_datasets/goemotions.csv')

# Split the GoEmotions data 80/20 into training and testing files.
# No random_state is passed, so the split changes on every run; it also relies
# on train_test_split having been imported by an earlier cell.
train_data, test_data = train_test_split(data, test_size=0.2)
train_data.to_csv('/content/raw_datasets/train_emotions.csv', index=False)
test_data.to_csv('/content/raw_datasets/test_emotions.csv', index=False)

In [None]:
!mkdir /content/models/emotions/mobilebert_classifier/

In [None]:
# mobilebert_classifier model -- emotion detection

from tflite_model_maker.text_classifier import BertClassifierSpec

mb_spec = model_spec.get('mobilebert_classifier')

# Build both loaders from the same column/spec settings.
loader_settings = {
    'text_column': 'text',
    'label_column': 'class',
    'model_spec': mb_spec,
}
train_data = DataLoader.from_csv(
    filename='/content/raw_datasets/train_emotions.csv',
    is_training=True,
    **loader_settings)
test_data = DataLoader.from_csv(
    filename='/content/raw_datasets/test_emotions.csv',
    is_training=False,
    **loader_settings)

# Train for 5 epochs and evaluate on the held-out file.
model = text_classifier.create(train_data, model_spec=mb_spec, epochs=5)
loss, acc = model.evaluate(test_data)

# export the model mobilebert_classifier, followed by its label and vocab files
emotions_export_dir = '/content/models/emotions/mobilebert_classifier/'
model.export(export_dir=emotions_export_dir)
model.export(export_dir=emotions_export_dir, export_format=[ExportFormat.LABEL, ExportFormat.VOCAB])

## Bert Model for Sarcasm Dataset

In [None]:
# Pin TensorFlow and tensorflow-text to matching 2.8.x releases.
!pip install -U "tensorflow==2.8.*"
!pip install -U "tensorflow-text==2.8.*"
import tensorflow as tf
# NOTE(review): this rebinds the name `text`, which earlier cells bound to
# tflite_support.task.text; re-running those cells afterwards needs their own
# import to run first.
import tensorflow_text as text

In [None]:
import tensorflow_hub as hub


# TF Hub layers: the text preprocessor and the matching uncased BERT-base encoder.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

def get_sentence_embeding(sentences):
    """Return the encoder's 'pooled_output' for a batch of raw sentences."""
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

# Smoke test on two example sentences (the result is shown as the cell output).
example_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(example_sentences)

In [None]:
# Raw-string input -> BERT preprocessing -> BERT encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout over the pooled output, then one sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

# Assemble the end-to-end model and configure it for binary classification.
model = tf.keras.Model(inputs=[text_input], outputs=[l])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
df = pd.read_csv('/content/raw_datasets/sarcasm.csv')

# Binary target: 1 for 'sarcasm', 0 for every other value.
df['class'] = (df['class'] == 'sarcasm').astype('int64')

# Split with scikit-learn's default test fraction, keeping class ratios equal in both parts.
features, target = df['text'], df['class']
X_train, X_test, y_train, y_test = train_test_split(features, target, stratify=target)
X_train.head()


In [None]:
# Train the Keras BERT classifier for 2 epochs in batches of 32.
model.fit(X_train, y_train, epochs=2, batch_size = 32)

## Avg Word Vec Model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the merged GoEmotions rows and write both parts to disk.
data = pd.read_csv('/content/raw_datasets/goemotions.csv')
train_data, test_data = train_test_split(data, test_size=0.2)
for part, destination in (
    (train_data, '/content/raw_datasets/train_goemotions.csv'),
    (test_data, '/content/raw_datasets/test_goemotions.csv'),
):
    part.to_csv(destination, index=False)

In [None]:
spec = model_spec.get('average_word_vec')

# Same column mapping and spec for both GoEmotions loaders.
goemotions_columns = dict(text_column='text', label_column='class', model_spec=spec)

train_data = DataLoader.from_csv(
    filename='/content/raw_datasets/train_goemotions.csv',
    is_training=True,
    **goemotions_columns)
test_data = DataLoader.from_csv(
    filename='/content/raw_datasets/test_goemotions.csv',
    is_training=False,
    **goemotions_columns)



In [None]:

# Train the average-word-vector emotion classifier (10 epochs, batch size 512)
# and evaluate it on the held-out GoEmotions file.
model = text_classifier.create(train_data, model_spec=spec, epochs=10, batch_size=512)
loss, acc = model.evaluate(test_data)

# export the model
# NOTE(review): this is the same directory the sarcasm average_word_vec cell
# exports to, so this export presumably replaces that model's files -- confirm
# the overwrite is intended.
model.export(export_dir='/content/models/average_word_vec/')
model.export(export_dir='/content/models/average_word_vec/',export_format=[ExportFormat.LABEL, ExportFormat.VOCAB])
# model.export(export_dir='/content/models/average_word_vec/',export_format=[ExportFormat.SAVED_MODEL, ExportFormat.LABEL, ExportFormat.VOCAB])

## Bert TfLite GoEmotions

In [None]:
# bert_classifier model (the spec requested below is 'bert_classifier', even
# though the variable keeps the `mb_spec` name used by the MobileBERT cells)

from tflite_model_maker.text_classifier import BertClassifierSpec

mb_spec = model_spec.get('bert_classifier')

# Load the GoEmotions train/test CSVs with the 'text' and 'class' columns.
train_data = DataLoader.from_csv( 
      filename='/content/raw_datasets/train_goemotions.csv',
      text_column='text',
      label_column='class',
      model_spec=mb_spec,
      is_training=True)
test_data = DataLoader.from_csv(
      filename='/content/raw_datasets/test_goemotions.csv',
      text_column='text',
      label_column='class',
      model_spec=mb_spec,
      is_training=False)

In [None]:
# Train for 5 epochs with model_dir set to the current working directory,
# then evaluate on the held-out file.
model = text_classifier.create(train_data, model_spec=mb_spec,model_dir='.' , epochs=5)
loss, acc = model.evaluate(test_data)

# Export is currently disabled for this model.
# export the model mobilebert_classifier
# model.export(export_dir='/content/models/mobilebert_classifier/')
# model.export(export_dir='/content/models/mobilebert_classifier/',export_format=[ExportFormat.LABEL, ExportFormat.VOCAB])

## DistilBERT - Emotion (Dev)

In [None]:
# Install the Hugging Face stack (transformers is pinned; pandas/torch are not).
!pip install transformers==4.12.5 pandas torch
import transformers

import torch
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel, DistilBertModel, DistilBertForSequenceClassification,
                          TrainingArguments, Trainer)
from transformers.modeling_outputs import SequenceClassifierOutput

!pip install -q datasets

from datasets import load_dataset
# Download the "raw" configuration of the go_emotions dataset.
emotions = load_dataset("go_emotions", "raw")

# Build a CUDA device handle unconditionally (no availability check here).
device = torch.device("cuda")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.12.5
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m70.0 MB/s[0m eta 

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.11k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/raw to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

Dataset go_emotions downloaded and prepared to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
df = emotions['train'].to_pandas()

# The emotion columns, in the order used as class indices.
label_cols = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]

# Index <-> name lookups; the indices are stored as strings.
id2label = dict((str(position), name) for position, name in enumerate(label_cols))
label2id = dict((name, str(position)) for position, name in enumerate(label_cols))

# One list per row holding that row's values of all label columns.
df["labels"] = df[label_cols].to_numpy().tolist()
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,labels
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# take sample for quick prototyping
# NOTE(review): df_sample is not used below; the split is taken from the full df.
df_sample = df.sample(n=1000)

# create train / test splits
# Each row goes to the training set with probability 0.8 (unseeded random mask).
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize with truncation only; no padding is requested at this stage.
train_encodings = tokenizer(df_train["text"].values.tolist(), truncation=True)
test_encodings = tokenizer(df_test["text"].values.tolist(), truncation=True)

# Per-row label vectors as plain Python lists.
train_labels = df_train["labels"].values.tolist()
test_labels = df_test["labels"].values.tolist()

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
class GoEmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label vectors.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` holds one entry per example. Indexing returns a dict with one
    tensor per encoding field plus a 'labels' tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset size is defined by the number of label entries.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized texts and label vectors for the training and test splits.
train_dataset = GoEmotionDataset(train_encodings, train_labels)
test_dataset = GoEmotionDataset(test_encodings, test_labels)

In [None]:
# sanity check: decode the first training example's token ids back to text
tokenizer.decode(train_dataset[0]["input_ids"])

'[CLS] that game hurt. [SEP]'

In [None]:
# Fine-tuning
class DistilBertForMultilabelSequenceClassification(DistilBertForSequenceClassification):
    """DistilBERT sequence classifier trained with a multi-label objective.

    Reuses the parent's encoder (``self.distilbert``), dropout and classifier
    layers but computes the loss with ``BCEWithLogitsLoss``, so each label is
    scored independently and an example may carry several labels.
    """

    def __init__(self, config):
      super().__init__(config)

    def forward(self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        """Run the encoder and classification head.

        Args:
            labels: optional per-example label vectors; when given, a binary
                cross-entropy-with-logits loss is computed against the logits
                reshaped to ``(-1, num_labels)``.
            return_dict: falls back to ``config.use_return_dict`` when None.
            (The remaining arguments are forwarded unchanged to the encoder.)

        Returns:
            A ``SequenceClassifierOutput`` when ``return_dict`` is truthy,
            otherwise a tuple ``(loss?, logits, *extra_encoder_outputs)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.distilbert(input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict)

        # Use the hidden state at the first token position as the sentence summary.
        hidden_state = outputs[0]
        pooled_output = hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Independent sigmoid/BCE per label; labels are cast to float for the loss.
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            # The DistilBERT encoder returns no pooled output, so its optional
            # extras (hidden states / attentions) start at index 1. Slicing
            # from 2 would silently drop the first of them.
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)
        


In [None]:
def get_default_device():
    """Return the first CUDA device when one is available, otherwise the CPU."""
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

# Resolve the device once for the rest of the notebook and show it.
device = get_default_device()
print(device)

cuda:0


In [None]:
# One output unit per GoEmotions label column.
num_labels=28
# Place the model on the device selected earlier in the notebook instead of
# calling .cuda() unconditionally, which raises on a CPU-only runtime.
model = DistilBertForMultilabelSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForMultilabelSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForMultilabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bia

In [None]:
# Emotion names in class-index order: position i is the name of class id i.
_emotion_names = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral",
]

# The previous literal ended with a trailing comma, which made id2label a
# 1-tuple wrapping the dict instead of the dict itself. The keys are also
# plain ints now, so a predicted class index can be looked up directly.
model.config.id2label = dict(enumerate(_emotion_names))
model.config.label2id = {name: idx for idx, name in enumerate(_emotion_names)}

In [None]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    """Element-wise accuracy of thresholded predictions against boolean targets.

    Both arguments are NumPy arrays. When ``sigmoid`` is true the predictions
    are passed through a sigmoid before being compared with ``thresh``. The
    result is the fraction of individual entries where the thresholded
    prediction equals the target, returned as a Python float.
    """
    scores = torch.from_numpy(y_pred)
    targets = torch.from_numpy(y_true).bool()
    if sigmoid:
        scores = scores.sigmoid()
    matches = (scores > thresh) == targets
    return matches.float().mean().item()

def compute_metrics(eval_pred):
    """Unpack a (predictions, labels) pair and report thresholded accuracy."""
    logits, targets = eval_pred
    score = accuracy_thresh(logits, targets)
    return {'accuracy_thresh': score}

batch_size = 32
# configure logging so we see training loss roughly once per epoch;
# max(1, ...) keeps the interval valid when the dataset has fewer rows than one batch
logging_steps = max(1, len(train_dataset) // batch_size)

# Evaluate and checkpoint at the end of every epoch; outputs go to ./emotion.
args = TrainingArguments(
    output_dir="emotion",
    evaluation_strategy = "epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=logging_steps
)

# Hand the model over as-is. Trainer places it on the device chosen by its
# arguments, so an explicit .cuda() here is redundant on GPU and raises on a
# CPU-only runtime.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# for obj in gc.get_objects():
#     try:
#         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data) and obj.device=='cpu'):
#             obj.to('cuda')
#             print(get_var_name(id(obj)), obj.device)
#     except:
#         pass

In [None]:
# sanity check that we can run evaluation before any training happens
# (uses the eval_dataset and compute_metrics given to the Trainer)
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 42234
  Batch size = 32


RuntimeError: ignored

In [None]:
# Fine-tune the model with the settings from the TrainingArguments above.
trainer.train()

In [None]:
# Score the fine-tuned model on the evaluation split.
trainer.evaluate()