In [24]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import nltk
import contractions
from datasets import load_dataset

In [25]:
# Fetch the NLTK resources the cleaning step relies on: tokenizer models for
# word_tokenize, the English stopword list, and WordNet for the lemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer models from that package; older releases simply ignore it.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Load the 'subtask5.english' configuration of sem_eval_2018_task_1 and expose
# the tweet column under the name "text" in every split.
dataset = load_dataset(
    'sem_eval_2018_task_1',
    'subtask5.english',
).rename_column("Tweet", "text")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})


In [27]:
# English stopword set consulted by clean_text; "not" is taken out of the set
# so that negations are not filtered away during cleaning.
stop_words = set(stopwords.words('english'))
stop_words.remove("not")

# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lemmatised tokens.

    Steps, in order: expand contractions, lowercase, strip digits, strip URLs,
    delete every non-ASCII character, collapse runs of three or more identical
    word characters to a single one, turn the remaining non-word characters
    into spaces, tokenise, drop tokens found in the module-level ``stop_words``
    set, and lemmatise what is left with the module-level ``lemmatizer``.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces. The result is an empty
        string when no token survives the filtering.
    """
    try:
        text = contractions.fix(text)  # e.g. "haven't" -> "have not"
    except Exception:
        # Best effort: if contraction expansion fails, continue with the
        # unexpanded text. Only ordinary errors are swallowed, so
        # KeyboardInterrupt / SystemExit still propagate.
        pass
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'https?://\S+|www\.\S+|www\S+', '', text)  # remove URLs
    # Delete all non-ASCII characters. Accented letters are dropped rather than
    # transliterated, and emojis are removed by this same step, so no separate
    # emoji pass is needed afterwards.
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)  # "soooo" -> "so" (3+ repeats collapse to one)
    text = re.sub(r'\W', ' ', text)  # punctuation/symbols -> space (underscore is kept by \W)
    text = re.sub(r'\s+', ' ', text)  # squeeze whitespace runs
    word_tokens = word_tokenize(text)
    # Stopwords are filtered on the surface form, before lemmatisation.
    filtered_text = [lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words]
    return ' '.join(filtered_text)

In [28]:
# Overwrite the 'text' column of every split with its cleaned version.
dataset = dataset.map(
    lambda row: {'text': clean_text(row['text'])}
)

Map:   0%|          | 0/6838 [00:00<?, ? examples/s]

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

Map:   0%|          | 0/886 [00:00<?, ? examples/s]

In [29]:
# Peek at the first five training examples after cleaning (all columns).
print(dataset['train'][0:5])

{'ID': ['2017-En-21441', '2017-En-31535', '2017-En-21068', '2017-En-31436', '2017-En-22195'], 'text': ['worry payment problem may never joyce meyer motivation leadership worry', 'whatever decide make sure make happy', 'max_kellerman also help majority nfl coaching inept bill brien play calling wow gopats', 'accept challenge literally even feel exhilaration victory george patton', 'roommate okay not spell autocorrect terrible firstworldprobs'], 'anger': [False, False, True, False, True], 'anticipation': [True, False, False, False, False], 'disgust': [False, False, True, False, True], 'fear': [False, False, False, False, False], 'joy': [False, True, True, True, False], 'love': [False, True, False, False, False], 'optimism': [True, True, True, True, False], 'pessimism': [False, False, False, False, False], 'sadness': [False, False, False, False, False], 'surprise': [False, False, False, False, False], 'trust': [True, False, False, False, False]}


In [30]:
# Show the training rows whose text is empty after cleaning;
# num_rows in the printed summary is the count.
print(
    dataset['train'].filter(lambda row: row['text'] == '')
)

Filter:   0%|          | 0/6838 [00:00<?, ? examples/s]

Dataset({
    features: ['ID', 'text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
    num_rows: 0
})


### Preprocessing the dataset.
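- To train with the Trainer API, the dataset needs a label column named `labels` (a column named `label` is also accepted, because the default data collators rename it to `labels`). Here the eleven emotion columns are combined into a single multi-hot `labels` column. It is preferable to keep the text column named `text`, although any name works: the text is tokenised, and only the resulting `input_ids` and `attention_mask` columns are passed to the model.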

In [31]:
# Emotion columns, in the order they appear in the multi-hot label vector.
column_names = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
    'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
]


def convert_to_labels(example):
    """Build a multi-hot label vector from the per-emotion columns of one row.

    Args:
        example: Mapping that contains an entry for every name in
            ``column_names``.

    Returns:
        ``{'labels': [...]}`` with one integer per emotion, in ``column_names``
        order: 1 where the entry compares equal to ``True``, otherwise 0.
    """
    return {'labels': [int(example[emotion] == True) for emotion in column_names]}

# Add the multi-hot 'labels' column to every split.
dataset = dataset.map(function=convert_to_labels)

Map:   0%|          | 0/6838 [00:00<?, ? examples/s]

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

Map:   0%|          | 0/886 [00:00<?, ? examples/s]

In [None]:
# Inspect the first five training examples of the current dataset.
print(dataset['train'][0:5])

In [32]:
# Upload all splits of the processed dataset to the Hugging Face Hub.
dataset.push_to_hub(repo_id="sem_eval_2018_task_1_english_cleaned_labels")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]