## Step 1: import packages and load Twitter dataset

In [1]:
from fairseq.models.roberta import XLMRModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score

dataset_path = "./data/twitter/Tweets.csv"
df = pd.read_csv(dataset_path,sep=',')
display(df)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0000,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
6,570300616901320704,positive,0.6745,,0.0000,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada)
7,570300248553349120,neutral,0.6340,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2015-02-24 11:12:29 -0800,Los Angeles,Pacific Time (US & Canada)
8,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:19 -0800,San Diego,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0000,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


## Step 2: Build XLM-R model for sentiment prediction, some codes are from https://github.com/mukhal/xlm-roberta-ner

In [2]:
class XLMRClassification(nn.Module):
    def __init__(self, pretrained_path, n_labels, hidden_size, max_length, head_init_range=0.04, device='cuda'):
        super().__init__()

        self.n_labels = n_labels
        self.classification_head = nn.Linear(hidden_size*max_length, n_labels)
        self.xlmr = XLMRModel.from_pretrained(pretrained_path)
        self.model = self.xlmr.model
        self.device=device
        self.loss = nn.CrossEntropyLoss()

        # initializing classification head
        self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)

    def forward(self, inputs_ids, labels):
        '''
        Computes a forward pass through the sequence tagging model.
        Args:
            inputs_ids: tensor of size (bsz, max_seq_len). padding idx = 1
            labels: tensor of size (bsz, n_labels)
        Returns :
            logits: unnormalized model outputs.
            loss: Cross Entropy loss between labels and logits
        '''
        transformer_out, _ = self.model(inputs_ids, features_only=True)
        logits = self.classification_head(torch.reshape(transformer_out,(-1,hidden_size*max_length)))
        loss = self.loss(logits,labels)
        return logits,loss
    
    def encode_word(self, s):
        """
        takes a string and returns a list of token ids
        """
        tensor_ids = self.xlmr.encode(s)
        return tensor_ids.cpu().numpy().tolist()

## Step 3: Preprocess dataset

In [3]:
# set some hyper-parameters
device = 'cpu'
hidden_size = 768
num_labels = 3
epoch = 5
lr = 1e-3
batch_size = 32
max_length = 64
num_workers = 1

# cover raw label into one-hot label
raw_labels = df['airline_sentiment'].tolist()
label_encoder = preprocessing.LabelEncoder()
new_labels = label_encoder.fit(raw_labels).transform(raw_labels)
onehot_encoder = preprocessing.OneHotEncoder(categories='auto')
onehot_labels = onehot_encoder.fit(np.array(new_labels).reshape(-1,1)).transform(np.array(new_labels).reshape(-1,1)).toarray()

# extract raw sentences
sentences = df['text'].tolist()

# dataset split, we use 2/3 data for training and the rest for testing
X_train, X_test, y_train, y_test = train_test_split(sentences, onehot_labels, test_size=0.33, random_state=42)

## Step 4: load the XLM Roberta base pretrained language model and keep all the parameters of the pretrained language model frozen

In [4]:
model = XLMRClassification(pretrained_path="./pretrained_models/xlmr.base/",
                                       n_labels=num_labels, hidden_size=hidden_size, max_length=max_length, device=device)
model.to(device)
params = list(model.named_parameters())
for n, p in model.named_parameters():
    if 'xlmr' in n and p.requires_grad:
        p.requires_grad = False
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## Step 5: Construct Dataloader for training && test

In [5]:
class TwitterData(Dataset):
    def __init__(self, data, label, model, max_length):
        self.data = data
        self.label = label
        self.model = model
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        feat = self.convert_sentence_to_feats(self.data[index])
        label = self.label[index]
        return feat,label
    
    def convert_sentence_to_feats(self,s):
        feat = self.model.encode_word(s)
        while(len(feat)<self.max_length):
            feat.append(1)
        if(len(feat)>self.max_length):
            feat = feat[0:self.max_length]
        return torch.tensor(feat)
    
TrainDataSet = TwitterData(X_train,y_train,model,max_length)
TrainDataLoader = DataLoader(dataset=TrainDataSet,batch_size=batch_size,num_workers=num_workers)

TestDataSet  = TwitterData(X_test,y_test,model,max_length)
TestDataLoader = DataLoader(dataset=TestDataSet,batch_size=batch_size,num_workers=num_workers)

## Step 6: Fine-tune the model and test the results

In [6]:
from sklearn.metrics import accuracy_score
for i in range(epoch):
    pbar = tqdm(TrainDataLoader)
    pbar.set_description("[Epoch {} train]".format(i+1))
    for input_ids,labels in pbar:
        logits,loss = model(input_ids,labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        accuracy = accuracy_score(np.argmax(labels.numpy(),axis=1),np.argmax(logits.detach().numpy(),axis=1))
        pbar.set_postfix(loss=loss.item(),accuracy=accuracy)

pbar = tqdm(TestDataLoader)
pbar.set_description("[Test]")
test_loss = 0.0
test_accuracy = 0.0
for input_ids,labels in pbar:
    logits,loss = model(input_ids,labels)
    accuracy = accuracy_score(np.argmax(labels.numpy(),axis=1),np.argmax(logits.detach().numpy(),axis=1))
    pbar.set_postfix(loss=loss.item(),accuracy=accuracy)
    test_loss += loss.item()/len(TestDataLoader)
    test_accuracy += accuracy/len(TestDataLoader)
print('test loss = {:.3f} test accuracy = {:.3f}'.format(test_loss, test_accuracy))

HBox(children=(IntProgress(value=0, max=307), HTML(value='')))




HBox(children=(IntProgress(value=0, max=307), HTML(value='')))




HBox(children=(IntProgress(value=0, max=307), HTML(value='')))




HBox(children=(IntProgress(value=0, max=307), HTML(value='')))




HBox(children=(IntProgress(value=0, max=307), HTML(value='')))




HBox(children=(IntProgress(value=0, max=151), HTML(value='')))


test loss = 0.877 test accuracy = 0.742
