In [1]:
import os
import time

import nltk
import numpy as np
import pandas as pd
import sklearn
import torch
from gensim.models.keyedvectors import KeyedVectors
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

In [2]:
def load_dataset(data_dir="./new_dataset"):
    """
    Load the training, validation and test sets.

    Args:
        data_dir: directory that contains ``train.csv``, ``val.csv`` and
            ``test.csv``. Defaults to ``./new_dataset``, the location this
            notebook has always read from.

    Returns:
        A ``(train, val, test)`` tuple of pandas DataFrames, one per CSV file.
    """
    # Read the splits in a fixed order so the returned tuple is always
    # (train, val, test).
    train, val, test = (
        pd.read_csv(os.path.join(data_dir, f"{split}.csv"))
        for split in ("train", "val", "test")
    )

    return train, val, test

In [11]:
# Read the train / validation / test CSV splits into three DataFrames
train, val, test = load_dataset()

In [12]:
# Human-readable sentiment name for each integer class id
label_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Give every split a 'sentiment' text column derived from its 'label' column
for split_df in (train, val, test):
    split_df['sentiment'] = split_df['label'].map(label_to_sentiment)

In [13]:
# Tally how many rows of the training split carry each sentiment name
# (positive, neutral, negative) and print the tallies
sentiment_counts = train['sentiment'].value_counts()
print(sentiment_counts)

sentiment
neutral     20673
positive    17849
negative     7093
Name: count, dtype: int64


In [15]:
def encode_labels_one_hot(Y, num_classes=3):
    """
    Convert integer class labels into one-hot encoded rows.

    Args:
        Y: 1-D sequence of labels (pandas Series, NumPy array or list), each a
            whole number in ``[0, num_classes)``. An ``(n, 1)`` column is also
            accepted and treated as ``n`` labels.
        num_classes: width of each one-hot row. Defaults to 3, i.e. labels
            0, 1 and 2.

    Returns:
        Float array of shape ``(len(Y), num_classes)`` with exactly one 1 per
        row, at the column given by that row's label.

    Raises:
        ValueError: if ``Y`` is not one-dimensional.
        IndexError: if a label is not a whole number or lies outside
            ``[0, num_classes)``. Negative labels are rejected instead of
            silently wrapping around to the last classes.
    """
    labels = np.asarray(Y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels[:, 0]
    if labels.ndim != 1:
        raise ValueError(
            f"Y must be one-dimensional, got an array of shape {labels.shape}"
        )

    encoded_Y = np.zeros((labels.shape[0], num_classes))
    if labels.size == 0:
        return encoded_Y

    if not np.issubdtype(labels.dtype, np.integer):
        # Labels may arrive as floats or objects (e.g. after a pandas
        # round-trip); accept them only when every value is a whole number.
        try:
            with np.errstate(invalid="ignore"):
                as_int = labels.astype(np.int64)
        except (TypeError, ValueError) as exc:
            raise IndexError("labels must be whole numbers") from exc
        if not np.array_equal(as_int, labels):
            raise IndexError("labels must be whole numbers")
        labels = as_int

    lowest, highest = labels.min(), labels.max()
    if lowest < 0 or highest >= num_classes:
        raise IndexError(
            f"labels must lie in [0, {num_classes}); "
            f"found values between {lowest} and {highest}"
        )

    # One vectorized assignment: row i gets a 1 in column labels[i].
    encoded_Y[np.arange(labels.shape[0]), labels] = 1

    return encoded_Y

In [16]:
# Attach the one-hot version of the integer label to each split as a
# list-valued "Y_hard" column
for split_df in (train, val, test):
    split_df["Y_hard"] = encode_labels_one_hot(split_df["label"]).tolist()

In [19]:
# Save val and test
val.to_csv("./new_dataset/val_preprocessed.csv", index=False)
test.to_csv("./new_dataset/test_preprocessed.csv", index=False)

In [22]:
def get_softlabels(X, model_name="cardiffnlp/twitter-roberta-base-sentiment-latest", max_length=512):
    """
    Citation: Full Classification Example on twitter-roberta-base-sentiment-latest Model card.
    Link: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

    Runs a Hugging Face sequence-classification model (by default the
    Twitter-RoBERTa-base sentiment model) over every text in X, one example at
    a time, and returns the model's class probabilities as soft labels.

    Args:
        X: pandas Series or 1-D array of raw texts; must expose ``.shape[0]``.
            Each entry is converted with ``str()`` before tokenization.
        model_name: checkpoint passed to ``from_pretrained``.
        max_length: maximum number of tokens per text; longer inputs are
            truncated so that they do not exceed the model's input limit.
            512 matches RoBERTa-base.

    Returns:
        np.ndarray of shape ``(X.shape[0], num_labels)``. Row i holds the
        softmax of the model's logits for text i, i.e. probabilities that sum
        to 1 (not log probabilities). ``num_labels`` is read from the loaded
        model's config.
    """
    # Preprocess text (username and link placeholders)
    def preprocess(text):
        new_text = []
        for t in str(text).split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        return " ".join(new_text)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Pytorch
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()

    # dimensions: [# examples, # classes]
    soft_labels = np.zeros((X.shape[0], model.config.num_labels))

    # No gradients are needed for labelling, so skip building the autograd
    # graph for each forward pass.
    with torch.no_grad():
        for index, row in enumerate(X):
            if index % 1000 == 0:
                print("Current index: ", index)
            encoded_input = tokenizer(
                preprocess(row),
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
            )
            # First output is the logits tensor; [0] selects the single
            # example in this batch of one.
            logits = model(**encoded_input)[0][0]
            soft_labels[index] = softmax(logits.numpy())

    return soft_labels

In [23]:
train["Y_soft"] = get_softlabels(train["text"]).tolist()

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Current index:  0
Current index:  1000
Current index:  2000
Current index:  3000
Current index:  4000
Current index:  5000
Current index:  6000
Current index:  7000
Current index:  8000
Current index:  9000
Current index:  10000
Current index:  11000
Current index:  12000
Current index:  13000
Current index:  14000
Current index:  15000
Current index:  16000
Current index:  17000
Current index:  18000
Current index:  19000
Current index:  20000
Current index:  21000
Current index:  22000
Current index:  23000
Current index:  24000
Current index:  25000
Current index:  26000
Current index:  27000
Current index:  28000
Current index:  29000
Current index:  30000
Current index:  31000
Current index:  32000
Current index:  33000
Current index:  34000
Current index:  35000
Current index:  36000
Current index:  37000
Current index:  38000
Current index:  39000
Current index:  40000
Current index:  41000
Current index:  42000
Current index:  43000
Current index:  44000
Current index:  45000


In [24]:
# Write the training split (now including the Y_soft column) to disk without
# the index column.
# NOTE(review): list-valued columns such as Y_hard / Y_soft are presumably
# stored as their string form in the CSV and need parsing when read back — confirm.
train.to_csv("./new_dataset/train_preprocessed.csv", index=False)

# Accuracy of the teacher on the new dataset (the one the teacher was trained on):

In [26]:
# Keep only the integer label and the teacher's soft-label columns for scoring
teacher_pred = train[["label", "Y_soft"]]

In [27]:
def compare_max_index(df):
    """
    Count the rows of df whose largest 'Y_soft' entry sits at the position
    given by that row's 'label'.

    Args:
        df: DataFrame with a 'label' column and a 'Y_soft' column holding one
            sequence of scores per row.

    Returns:
        The number of rows where argmax(Y_soft) equals label, as an int.
    """
    return sum(
        1
        for soft_scores, gold in zip(df['Y_soft'], df['label'])
        if np.argmax(np.array(soft_scores)) == gold
    )

In [28]:
# Number of rows where the teacher's highest-scoring class equals the label
correct_count = compare_max_index(teacher_pred)

In [29]:
# Teacher accuracy on the training split: matching rows / total rows
correct_count / teacher_pred.shape[0]

0.7772662501370163