### Add encoded hard and soft labels to CSVs

Run all cells in order (don't be alarmed if the final cells take a couple of hours!)

In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import sklearn
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from scipy.special import softmax
import time
import nltk

Load Dataset

In [37]:
def load_dataset():
    """
    Load training and test sets
    """
    
    # TODO: Yufeng, add the correct path to the data here
    train = pd.read_csv("./dataset_twitter/train.csv")
    test = pd.read_csv("./dataset_twitter/test.csv")
    
    return train, test

Encoding Hard Labels and Adding to Train, Dev, and Test CSVs

In [38]:
def encode_labels_one_hot(Y):
    """
    For each row in input Y, converts labels 0, 1, and 2 to arrays
    that are one-hot encoded
    """
    encoded_Y = np.zeros((Y.shape[0],3))
    
    for index, row in enumerate(Y):
        one_hot_array = np.zeros(3)
        one_hot_array[row] = 1
        encoded_Y[index] = one_hot_array
    
    return encoded_Y

In [None]:
# UNIT TEST encode_labels_one_hot

Y = Y_train[0:5]
print(Y)
new_Y = encode_labels_one_hot(Y)
print(new_Y)

In [40]:
train, test = load_dataset()

train["Y_hard"] = encode_labels_one_hot(train["label"]).tolist()
test["Y_hard"] = encode_labels_one_hot(test["label"]).tolist()

Train-Dev-Test Split

In [119]:
train, dev = train_test_split(train, test_size=0.3, random_state=42)

Generating Soft Labels from the Teacher Model

In [127]:
def get_softlabels(X):
    """
    Citation: Full Classification Example on twitter-roberta-base-sentiment-latest Model card. 
    Link: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latesthttps://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
    
    Runs Twitter-Roberta-Based-Sentiment Model on the Twitter Sentiment Extraction dataset
    Returns an array of soft labels (log probabilities)
    """
    # Preprocess text (username and link placeholders)
    def preprocess(text):
        text = str(text)
        new_text = []
        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        return " ".join(new_text)
    
    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    
    # Pytorch
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    # dimensions: [# examples, # classes (3)]
    soft_labels = np.zeros((X.shape[0],3))
    
    for index, row in enumerate(X):
        text = preprocess(row)
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        soft_labels[index] = scores.tolist()
        index += 1

    return soft_labels

Adding Soft Labels to Train Dataframe

In [125]:
# UNIT TEST, get_softlabels()
mini_train = train[0:5]
print(mini_train.shape)

mini_train["softies"] = get_softlabels(mini_train["text"]).tolist()
mini_train.to_csv("softies.csv", index=False)

(5, 8)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(5,)
INDEX:  0
INDEX:  1
INDEX:  2
INDEX:  3
INDEX:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_train["softies"] = get_softlabels(mini_train["text"]).tolist()


In [128]:
# Add soft labels to train dataframe
print(train.shape)
train["Y_soft"] = get_softlabels(train["text"]).tolist()

(19236, 8)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total time:  1947.9866590499878


KeyboardInterrupt: 

In [None]:
# Create CSVS from updated DataFrames
train.to_csv("train_preprocessed.csv", index=False)
dev.to_csv("val_preprocessed.csv", index=False)
test.to_csv("test_preprocessed.csv", index=False)

In [130]:
# Sanity Check
train.shape
dev.shape
test.shape

(19236, 9)