Pre-processing

In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import sklearn
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from scipy.special import softmax
import time
import nltk

In [3]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [9]:
a = tokenizer("This is cool", return_tensors='pt')
a

{'input_ids': tensor([[   0,  713,   16, 3035,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

Load Dataset

In [37]:
def load_dataset():
    """
    Load training and test sets
    """

    train = pd.read_csv("./dataset_twitter/train.csv")
    test = pd.read_csv("./dataset_twitter/test.csv")
    
    return train, test

Encoding Hard Labels and Adding to Train, Dev, and Test CSVs

In [38]:
def encode_labels_one_hot(Y):
    """
    For each row in input Y, converts labels 0, 1, and 2 to arrays
    that are one-hot encoded
    """
    encoded_Y = np.zeros((Y.shape[0],3))
    
    for index, row in enumerate(Y):
        one_hot_array = np.zeros(3)
        one_hot_array[row] = 1
        encoded_Y[index] = one_hot_array
    
    return encoded_Y

In [40]:
train, test = load_dataset()

train["Y_hard"] = encode_labels_one_hot(train["label"]).tolist()
test["Y_hard"] = encode_labels_one_hot(test["label"]).tolist()

Tokenizing Input Text and Adding to Train and Dev Dataframes

In [55]:
def tokenizer(text: str):
    '''
    Citation: This tokenizer function & regex rule is borrowed from Katie's tokenizer regex demo at:
    https://www.cs.williams.edu/~kkeith/teaching/s23/cs375/attach/tokenization_regex_demo.html
    This helper function takes a string and returns a list of tokenized strings.
    '''
    regex = r"[A-Za-z]+|\$[\d\.]+|\S+" 
    res = nltk.regexp_tokenize(text, regex)
    return res

Embedding Input Text Tokens

In [50]:
def load_embeddings(filename):
    """
    Loads embeddings from embedding file and creates 
    1) dictionary of embedding words to indices
    2) list of embedding indices to words
    3) dense word embedding matrix
    """
    embeddings = KeyedVectors.load_word2vec_format(filename, binary=False, no_header=True)
    vocab2indx = dict(embeddings.key_to_index)
    idx2vocab = list(embeddings.index_to_key)
    embed_array = embeddings.vectors # matrix of dense word embeddings 
                                     # rows: a word 
                                     # columns: dimensions (50) of the dense embeddings
    return vocab2indx, idx2vocab, embed_array


def add_the_embedding(embed_array, vocab2indx): 
    """
    Adds "the" embedding to the embed_array matrix
    """
    the_embedding = embed_array[vocab2indx["the"]]
    out = np.vstack((embed_array, the_embedding))
    return out


def add_oov(idx2vocab, vocab2indx, embed_array):
    """
    Adds <OOV> token to embedded vocabulary
    """
    print("len embed array: ", len(embed_array))
    new_oov_entry = len(embed_array)
    idx2vocab += ["<OOV>"]
    vocab2indx["<OOV>"] = new_oov_entry
    embed_array_w_oov = add_the_embedding(embed_array, vocab2indx)

    return idx2vocab, vocab2indx, embed_array_w_oov


def add_pad(idx2vocab, vocab2indx, embed_array):
    """
    Adds <PAD> token to embedded vocabulary
    """
    print("len embed array: ", len(embed_array))
    new_pad_entry = len(embed_array)
    idx2vocab += ["<PAD>"]
    vocab2indx["<PAD>"] = new_pad_entry
    embed_array_w_pad = add_the_embedding(embed_array, vocab2indx)
    
    return idx2vocab, vocab2indx, embed_array_w_pad


def truncate(original_indices_list: list, maximum_length=100) -> list: 
    """
    Truncates the original_indices_list to the maximum_length
    """
    return original_indices_list[0:maximum_length]


def pad(original_indices_list: list, pad_index: int, maximum_length=100) -> list: 
    """
    Given original_indices_list, concatenates the pad_index enough times 
    to make the list to maximum_length. 
    """
    while len(original_indices_list) < maximum_length:
        original_indices_list.append(pad_index)
        
    return original_indices_list


def get_padded_oov_embeddings():
    """
    Get embedding array which includes the <PAD> and <OOV> tokens
    """
    vocab2indx, idx2vocab, embed_array = load_embeddings("glove.twitter.27B.100d.txt")
    idx2vocab, vocab2indx, embed_array_w_oov = add_oov(idx2vocab, vocab2indx, embed_array)
    idx2vocab, vocab2indx, embed_array_w_oov_pad = add_pad(idx2vocab, vocab2indx, embed_array_w_oov)
    
    return embed_array_w_oov_pad, vocab2indx, idx2vocab

def create_word_indices(tokens, vocab2indx): 
    """
    For each example, translate each token into its corresponding index from vocab2indx
    
    Replace words not in the vocabulary with the symbol "<OOV>" 
        which stands for 'out of vocabulary'
        
    Arguments: 
       - tokens (List[str]): list of strings of tokens 
       - vocab2indx (dict): each vocabulary word as strings and its corresponding int index 
                           for the embeddings 
                           
    Returns: 
        - (List[int]): list of integers
    """
    indices = []

    for token in tokens:
        if token not in vocab2indx:
            token = "<OOV>"
        indices.append(vocab2indx[token])
    
    return indices


def convert_X(Xmat, embeddings, vocab2indx, idx2vocab):
    MAXIMUM_LENGTH = 128
    
    X_list_embedded = []
    X_list_tokenized = []
    for one_train_example in Xmat:
        one_train_example = str(one_train_example)
        one_train_example_tokenized = tokenizer(one_train_example)
        X_list_tokenized.append(one_train_example_tokenized)
        one_train_indices = create_word_indices(one_train_example, vocab2indx)
        one_train_indices = truncate(one_train_indices, maximum_length=MAXIMUM_LENGTH)
        one_train_indices = pad(one_train_indices, len(vocab2indx)-1, maximum_length=MAXIMUM_LENGTH)
        
        one_train_example_embeddings = [] # A list of token embeddings
        
        for index in one_train_indices:
            one_train_example_embeddings.append(embeddings[index])
        
        X_list_embedded.append(one_train_example_embeddings)
        
    return X_list_tokenized, X_list_embedded

Adding Embeddings and Tokenized Text to Train and Test Dataframes

In [57]:
embeddings, vocab2indx, idx2vocab = get_padded_oov_embeddings()

X_train_tokens, X_train_embedded = convert_X(train["text"], embeddings, vocab2indx, idx2vocab)
X_test_tokens, X_test_embedded = convert_X(test["text"], embeddings, vocab2indx, idx2vocab)

train["Embedding"] = X_train_embedded
train["Tokenized Text"] = X_train_tokens
test["Embedding"] = X_test_embedded
test["Tokenized Text"] = X_test_tokens

len embed array:  1193514
len embed array:  1193515


In [59]:
train[0:5]

Unnamed: 0.1,Unnamed: 0,id,text,label,label_text,Y_hard,Embedding,Tokenized Text
0,0,cb774db0d1,"I`d have responded, if I were going",1,neutral,"[0.0, 1.0, 0.0]","[[0.095152, 0.37024, 0.54291, 0.19621, 0.04820...","[I, `d, have, responded, ,, if, I, were, going]"
1,1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative,"[1.0, 0.0, 0.0]","[[0.095152, 0.37024, 0.54291, 0.19621, 0.04820...","[Sooo, SAD, I, will, miss, you, here, in, San,..."
2,2,088c60f138,my boss is bullying me...,0,negative,"[1.0, 0.0, 0.0]","[[0.085651, -0.014665, -0.20531, -0.13928, -0....","[my, boss, is, bullying, me, ...]"
3,3,9642c003ef,what interview! leave me alone,0,negative,"[1.0, 0.0, 0.0]","[[0.095152, 0.37024, 0.54291, 0.19621, 0.04820...","[what, interview, !, leave, me, alone]"
4,4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative,"[1.0, 0.0, 0.0]","[[0.095152, 0.37024, 0.54291, 0.19621, 0.04820...","[Sons, of, ****,, why, couldn, `t, they, put, ..."


In [58]:
train.to_csv("train_no_soft.csv",index=False)
test.to_csv("test.csv",index=False)

Outdated functions and Unit Tests:

In [22]:
def add_column_to_csv(data, header, output_file: str) -> None:
    data = {header: list(data)}
    df = pd.DataFrame(data)
    file = pd.read_csv(output_file)
    out = file.merge(df, how='outer', left_index=True,right_index=True)
    with open(output_file, 'w') as f:
        out.to_csv(f, index=False)

In [23]:
def create_csv_with_labels(labels, header, output_file: str) -> None:
    data = {header: list(labels)}
    df = pd.DataFrame(data)
    df.to_csv(output_file)

In [160]:
# UNIT TEST for create_csv_with_labels

soft = np.array([np.array([.1,.8,.1]), np.array([.2,.7,.1]), np.array([.3,.4,.3])])
hard = np.array([np.array([1,0,0]),np.array([0,1,0]),np.array([0,0,1])])
create_csv_with_labels(soft, "Y_soft", "train.csv")
add_hard_labels_to_csv(hard, "train.csv")

      Y_hard
0  [1, 0, 0]
1  [0, 1, 0]
2  [0, 0, 1]


In [None]:
# UNIT TEST encode_labels_one_hot

Y = Y_train[0:5]
print(Y)
new_Y = encode_labels_one_hot(Y)
print(new_Y)

Train-Dev-Test Split

In [119]:
train = pd.read_csv("train_no_soft.csv")
test = pd.read_csv("test.csv")
train, dev = train_test_split(train, test_size=0.3, random_state=42)

Generating Soft Labels from the Teacher Model

In [127]:
def get_softlabels(X):
    """
    Citation: Full Classification Example on twitter-roberta-base-sentiment-latest Model card. 
    Link: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latesthttps://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
    
    Runs Twitter-Roberta-Based-Sentiment Model on the Twitter Sentiment Extraction dataset
    Returns an array of soft labels (log probabilities)
    """
    # Preprocess text (username and link placeholders)
    def preprocess(text):
        text = str(text)
        new_text = []
        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        return " ".join(new_text)
    
    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    
    # Pytorch
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    # dimensions: [# examples, # classes (3)]
    soft_labels = np.zeros((X.shape[0],3))
    
    for index, row in enumerate(X):
        text = preprocess(row)
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        soft_labels[index] = scores.tolist()
        index += 1

    return soft_labels

Adding Soft Labels to Train Dataframe

In [125]:
# UNIT TEST, get_softlabels()
mini_train = train[0:5]
print(mini_train.shape)

mini_train["softies"] = get_softlabels(mini_train["text"]).tolist()
mini_train.to_csv("softies.csv", index=False)

(5, 8)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(5,)
INDEX:  0
INDEX:  1
INDEX:  2
INDEX:  3
INDEX:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_train["softies"] = get_softlabels(mini_train["text"]).tolist()


In [128]:
print(train.shape)
start = time.time()
train["Y_soft"] = get_softlabels(train["text"]).tolist()
end = time.time()
print("Total time: ",end-start)

train.to_csv("train.csv", index=False)

(19236, 8)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total time:  1947.9866590499878


KeyboardInterrupt: 

In [129]:
dev.to_csv("dev.csv", index=False)

In [130]:
train.shape

(19236, 9)

In [131]:
dev.shape

(8245, 8)

In [132]:
test.shape

(3534, 8)