In [1]:
# Pre-processing

In [158]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from scipy.special import softmax
import time

Generating Soft Labels from the Teacher Model

In [28]:
def load_dataset():
    """
    Load training and test sets
    """

    train = pd.read_csv("./dataset_twitter/train.csv")
    test = pd.read_csv("./dataset_twitter/test.csv")

    Xmat = train["text"]
    Y = train["label"]

    Xmat_test = test["text"]
    Y_test = test["label"]
    
    return Xmat, Y, Xmat_test, Y_test

In [174]:
def get_softlabels(X):
    """
    Citation: Full Classification Example on twitter-roberta-base-sentiment-latest Model card. 
    Link: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latesthttps://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
    
    Runs Twitter-Roberta-Based-Sentiment Model on the Twitter Sentiment Extraction dataset
    Returns an array of soft labels (log probabilities)
    """
    # Preprocess text (username and link placeholders)
    def preprocess(text):
        text = str(text)
        new_text = []
        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        return " ".join(new_text)
    
    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    
    # Pytorch
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    # dimensions: [# examples, # classes (3)]
    soft_labels = np.zeros((X.shape[0],3))
    
    for index, row in enumerate(X):
        text = preprocess(row)
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        soft_labels[index] = scores
    
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = config.id2label[ranking[i]]
        s = scores[ranking[i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")

    create_csv_with_labels(soft_labels, "Y_soft", "train.csv")
    return soft_labels

Encoding Hard Labels and Adding to Train, Dev, and Test CSVs

In [162]:
def encode_labels_one_hot(Y):
    """
    For each row in input Y, converts labels 0, 1, and 2 to arrays
    that are one-hot encoded
    """
    encoded_Y = np.zeros((Y.shape[0],3))
    
    for index, row in enumerate(Y):
        one_hot_array = np.zeros(3)
        one_hot_array[row] = 1
        encoded_Y[index] = one_hot_array
    
    return encoded_Y

In [163]:
def add_column_to_csv(data, header, output_file: str) -> None:
    data = {"Y_hard": list(data)}
    df = pd.DataFrame(data)
    file = pd.read_csv(output_file)
    out = file.merge(df, how='outer', left_index=True,right_index=True)
    with open(output_file, 'w') as f:
        out.to_csv(f, index=False)

In [164]:
def create_csv_with_labels(labels, header, output_file: str) -> None:
    data = {header: list(labels)}
    df = pd.DataFrame(data)
    df.to_csv(output_file)

In [160]:
# UNIT TEST for create_csv_with_labels

soft = np.array([np.array([.1,.8,.1]), np.array([.2,.7,.1]), np.array([.3,.4,.3])])
hard = np.array([np.array([1,0,0]),np.array([0,1,0]),np.array([0,0,1])])
create_csv_with_labels(soft, "Y_soft", "train.csv")
add_hard_labels_to_csv(hard, "train.csv")

      Y_hard
0  [1, 0, 0]
1  [0, 1, 0]
2  [0, 0, 1]


Train-Dev-Test Split of Dataset, Creating CSVs with Labels

In [165]:
Xmat, Y, Xmat_test, Y_test = load_dataset()
Xmat_train, Xmat_dev, Y_train, Y_dev = train_test_split(Xmat, Y, test_size=0.3, random_state=42)

In [175]:
# UNIT TEST get_softlabels(X)

X = Xmat_train[4000:4200]
start = time.time()
Xmat_train_soft = get_softlabels(X)
end = time.time()

print("Total time: ", end - start)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.6593
2) neutral 0.29
3) positive 0.0506
Total time:  20.433573007583618


In [None]:
# UNIT TEST encode_labels_one_hot

Y = Y_train[0:5]
print(Y)
new_Y = encode_labels_one_hot(Y)
print(new_Y)

In [176]:
# Add soft labels to train.csv
start = time.time()
Xmat_train_soft = get_softlabels(Xmat_train)
end = time.time()

print("Total time: ", end - start)

# Add hard labels to train.csv
Y_train = encode_labels_one_hot(Y_train)
add_column_to_csv(Y_train, "Y_hard", "train.csv")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.9289
2) neutral 0.063
3) positive 0.0081
Total time:  1905.9214930534363
                Y_hard
0      [0.0, 0.0, 1.0]
1      [1.0, 0.0, 0.0]
2      [0.0, 0.0, 1.0]
3      [0.0, 1.0, 0.0]
4      [0.0, 1.0, 0.0]
...                ...
19231  [0.0, 0.0, 1.0]
19232  [0.0, 1.0, 0.0]
19233  [0.0, 1.0, 0.0]
19234  [1.0, 0.0, 0.0]
19235  [1.0, 0.0, 0.0]

[19236 rows x 1 columns]


In [166]:
# Encode Y labels into arrays of size 3 (for each sentiment label output)
# Use one-hot encode
# Add to CSVs or create CSVs (if not for training dataset)
Y_dev = encode_labels_one_hot(Y_dev)
Y_test = encode_labels_one_hot(Y_test)
Y_train = encode_labels_one_hot(Y_train)

add_column_to_csv(Y_train, "Y_hard", "train.csv")
create_csv_with_labels(Y_test, "Y", "test.csv")
create_csv_with_labels(Y_dev, "Y", "dev.csv")

Tokenizing Function

In [None]:
def tokenizer(text: str):
    '''
    Citation: This tokenizer function & regex rule is borrowed from Katie's tokenizer regex demo at:
    https://www.cs.williams.edu/~kkeith/teaching/s23/cs375/attach/tokenization_regex_demo.html
    This helper function takes a string and returns a list of tokenized strings.
    '''
    regex = r"[A-Za-z]+|\$[\d\.]+|\S+" 
    res = nltk.regexp_tokenize(text, regex)
    return [i for i in res if i != "."]

Embedding Functions (including Truncation and Padding)

In [None]:
def load_embeddings(filename):
    """
    Loads embeddings from embedding file and creates 
    1) dictionary of embedding words to indices
    2) list of embedding indices to words
    3) dense word embedding matrix
    """
    embeddings = KeyedVectors.load_word2vec_format(filename, binary=False)
    vocab2indx = dict(embeddings.key_to_index)
    idx2vocab = list(embeddings.index_to_key)
    embed_array = embeddings.vectors # matrix of dense word embeddings 
                                     # rows: a word 
                                     # columns: dimensions (50) of the dense embeddings
    return vocab2indx, idx2vocab, embed_array


def add_the_embedding(embed_array, vocab2indx): 
    """
    Adds "the" embedding to the embed_array matrix
    """
    the_embedding = embed_array[vocab2indx["the"]]
    out = np.vstack((embed_array, the_embedding))
    return out


def add_oov(idx2vocab, vocab2indx, embed_array):
    """
    Adds <OOV> token to embedded vocabulary
    """
    print("len embed array: ", len(embed_array))
    new_oov_entry = len(embed_array)
    idx2vocab += ["<OOV>"]
    vocab2indx["<OOV>"] = new_oov_entry
    embed_array_w_oov = add_the_embedding(embed_array, vocab2indx)

    return idx2vocab, vocab2indx, embed_array_w_oov


def add_pad(idx2vocab, vocab2indx, embed_array):
    """
    Adds <PAD> token to embedded vocabulary
    """
    print("len embed array: ", len(embed_array))
    new_pad_entry = len(embed_array)
    idx2vocab += ["<PAD>"]
    vocab2indx["<PAD>"] = new_pad_entry
    embed_array_w_pad = add_the_embedding(embed_array, vocab2indx)
    
    return idx2vocab, vocab2indx, embed_array_w_pad


def truncate(original_indices_list: list, maximum_length=100) -> list: 
    """
    Truncates the original_indices_list to the maximum_length
    """
    return original_indices_list[0:maximum_length]


def pad(original_indices_list: list, pad_index: int, maximum_length=100) -> list: 
    """
    Given original_indices_list, concatenates the pad_index enough times 
    to make the list to maximum_length. 
    """
    while len(original_indices_list) < maximum_length:
        original_indices_list.append(pad_index)
        
    return original_indices_list


def get_padded_oov_embeddings():
    """
    Get embedding array which includes the <PAD> and <OOV> tokens
    """
    vocab2indx, idx2vocab, embed_array = load_embeddings("glove.twitter.27B.100d.txt")
    idx2vocab, vocab2indx, embed_array_w_oov = add_oov(idx2vocab, vocab2indx, embed_array)
    idx2vocab, vocab2indx, embed_array_w_oov_pad = add_pad(idx2vocab, vocab2indx, embed_array_w_oov)
    
    return embed_array_w_oov_pad, vocab2indx, idx2vocab

def create_word_indices(tokens, vocab2indx): 
    """
    For each example, translate each token into its corresponding index from vocab2indx
    
    Replace words not in the vocabulary with the symbol "<OOV>" 
        which stands for 'out of vocabulary'
        
    Arguments: 
       - tokens (List[str]): list of strings of tokens 
       - vocab2indx (dict): each vocabulary word as strings and its corresponding int index 
                           for the embeddings 
                           
    Returns: 
        - (List[int]): list of integers
    """
    indices = []

    for token in tokens:
        if token not in vocab2indx:
            token = "<OOV>"
        indices.append(vocab2indx[token])
    
    return indices


def convert_X(Xmat):
    MAXIMUM_LENGTH = 128
    embeddings, vocab2indx, idx2vocab = get_padded_oov_embeddings()
    
    X_list = []
    for one_train_example in Xmat:
        one_train_example = tokenizer(one_train_example)
        one_train_indices = create_word_indices(one_train_example, vocab2indx)
        one_train_indices = truncate(one_train_indices, maximum_length=MAXIMUM_LENGTH)
        one_train_indices = pad(one_train_indices, len(vocab2indx)-1, maximum_length=MAXIMUM_LENGTH)
        
        one_train_example_embeddings = [] # A list of token embeddings
        
        for index in one_train_indices:
            one_train_example_embeddings.append(embeddings[index])
        
        X_list.append(one_train_example_embeddings)
        
#     X = torch.FloatTensor(X_list)
    return X

In [None]:
# INTEGRATION TEST, get_padded_oov_embeddings

embeddings, vocab2indx, idx2vocab = get_padded_oov_embeddings()

In [None]:
# Add embeddings to train, dev, and test CSVs

X_train_embedded = convert_X(Xmat_train)
X_dev_embedded = convert_X(Xmat_dev)
X_test_embedded = convert_X(Xmat_test)

add_column_to_csv(X_train_embedded, "Embedding", "train.csv")
add_column_to_csv(X_dev_embedded, "Embedding", "dev.csv")
add_column_to_csv(X_test_embedded, "Embedding", "test.csv")