In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

In [6]:
test_csv = pd.read_csv('test_data.csv')
train_csv = pd.read_csv('train_data.csv')
train_csv = train_csv.sort_values('sentiment')
train_csv = train_csv.drop(train_csv[train_csv.sentiment == 'positive'].index[int(train_csv.count()['sentiment']/5.5):])
train_df = train_csv[['review_content','sentiment']]
test_df = test_csv[['review_content','Annotator_1']]
test_df = test_df.rename(columns={'Annotator_1':'sentiment'})
train_df = train_df.dropna()
print(train_df)
print(test_df)

#Changing labels to numbers
encode_lb = {'negative':0, 'neutral':1, 'positive':2}
train_df['sentiment_lb'] = train_df['sentiment'].map(encode_lb)
test_df['sentiment_lb'] = test_df['sentiment'].map(encode_lb)
train_df['review_content'] = train_df['review_content'].apply(lambda x:" ".join(x.split()[:512]))
test_df['review_content'] = test_df['review_content'].apply(lambda x:" ".join(x.split()[:512]))
train_df.to_csv('roberta_train_processed.csv',index=False)
test_df.to_csv('roberta_test_processed.csv',index=False)

                                          review_content sentiment
0      I feel like I received a different product tha...  negative
6867   We won't get a chance to play the game. I'm re...  negative
6866   this toy looks great,  it works great,  but th...  negative
28036  Not the item that is advertised. I ordered twi...  negative
6863   The box of 40 colors only had 30 colors; 10 co...  negative
...                                                  ...       ...
32731  Although I cannot believe how much they charge...  positive
32743             My 3 year old son loves these animals!  positive
32729  Zero complaints. This is the uno I remember as...  positive
32757  Melissa & Doug stamps are quality stamps geare...  positive
32760  Very impressive figure with extremely nice pac...  positive

[14849 rows x 2 columns]
                                         review_content sentiment
0                                   Exactly as expected  positive
1     The kids with special needs love

In [20]:
data_path = './roberta/data'
output_path = './roberta/output'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 
# Set tokenizer hyperparameters.
MAX_SEQ_LEN = 256
BATCH_SIZE = 16
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Define columns to read.
label_field = Field(sequential=False, use_vocab=False, batch_first=True)
text_field = Field(use_vocab=False, 
                   tokenize=tokenizer.encode, 
                   include_lengths=False, 
                   batch_first=True,
                   truncate_first=True,
                   fix_length=MAX_SEQ_LEN, 
                   pad_token=PAD_INDEX, 
                   unk_token=UNK_INDEX)

fields = {'review_content' : ('review_content', text_field), 'sentiment_lb' : ('sentiment_lb', label_field)}


# Read preprocessed CSV into TabularDataset and split it into train, test and valid.
train_data, valid_data = TabularDataset(path=f"roberta_train_processed.csv", format='CSV', fields=fields, 
                        skip_header=False).split(split_ratio=[0.8, 0.2], stratified=True, strata_field='sentiment_lb')
test_data = TabularDataset(path=f"roberta_test_processed.csv", format='CSV', fields=fields, skip_header=False)

# Create train and validation iterators.
train_iter, valid_iter = BucketIterator.splits((train_data, valid_data),
                                               batch_size=BATCH_SIZE,
                                               device=device,
                                               shuffle=True,
                                               sort_key=lambda x: len(x.review_content), 
                                               sort=True, 
                                               sort_within_batch=False)

# Test iterator, no shuffling or sorting required.
test_iter = Iterator(test_data, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (679 > 512). Running this sequence through the model will result in indexing errors


In [21]:
# Functions for saving and loading model parameters and metrics.
def save_checkpoint(path, model, valid_loss):
    torch.save({'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}, path)

    
def load_checkpoint(path, model):    
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict['model_state_dict'])
    
    return state_dict['valid_loss']


def save_metrics(path, train_loss_list, valid_loss_list, global_steps_list):   
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, path)

In [22]:
# Model with extra layers on top of RoBERTa
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()
        
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 3) #changed output to 3 since we have 3 classes to predict
        
    def forward(self, input_ids, attention_mask):
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.nn.Tanh()(x)
        x = self.d2(x)
        x = self.l2(x)
        
        return x  

In [23]:
#Training classifier for feature extraction
def pretrain(model, 
             optimizer, 
             train_iter, 
             valid_iter, 
             scheduler,
             num_epochs,
             valid_period = len(train_iter)):
    
    # Pretrain linear layers, do not train bert
    for param in model.roberta.parameters():
        param.requires_grad = False
    
    model.train()
    
    # Initialize losses and loss histories
    train_loss = 0.0
    valid_loss = 0.0   
    global_step = 0  
    
    # Train loop
    for epoch in range(num_epochs):
        for (source, target), _ in train_iter:
            mask = (source != PAD_INDEX).type(torch.uint8)
            
            y_pred = model(input_ids=source,  
                           attention_mask=mask)
            
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
   
            loss.backward()
            
            # Optimizer and scheduler step
            optimizer.step()    
            scheduler.step()
                
            optimizer.zero_grad()
            
            # Update train loss and global step
            train_loss += loss.item()
            global_step += 1

            # Validation loop. Save progress and evaluate model performance.
            if global_step % valid_period == 0:
                model.eval()
                
                with torch.no_grad():                    
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)
                        
                        y_pred = model(input_ids=source, 
                                       attention_mask=mask)
                        
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        
                        valid_loss += loss.item()

                # Store train and validation loss history
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)
                
                model.train()

                # print summary
                print('Epoch [{}/{}], global step [{}/{}], PT Loss: {:.4f}, Val Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss))
                
                train_loss = 0.0                
                valid_loss = 0.0
    
    # Set bert parameters back to trainable
    for param in model.roberta.parameters():
        param.requires_grad = True
        
    print('Pre-training done!')

In [24]:
#Training classifier for fine-tuning
def train(model, optimizer, train_iter, valid_iter, output_path, scheduler, num_epochs, valid_period = len(train_iter)):
    
    # Initialize losses and loss histories
    train_loss = 0.0
    valid_loss = 0.0
    train_loss_list = []
    valid_loss_list = []
    best_valid_loss = float('Inf')
    
    global_step = 0
    global_steps_list = []
    
    model.train()
    
    # Train loop
    for epoch in range(num_epochs):
        for (source, target), _ in train_iter:
            mask = (source != PAD_INDEX).type(torch.uint8)

            y_pred = model(input_ids=source,  
                           attention_mask=mask)
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
            loss.backward()

            
            # Optimizer and scheduler step
            optimizer.step()    
            scheduler.step()
                
            optimizer.zero_grad()
            
            # Update train loss and global step
            train_loss += loss.item()
            global_step += 1

            # Validation loop. Save progress and evaluate model performance.
            if global_step % valid_period == 0:
                model.eval()
                
                with torch.no_grad():                    
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)

                        y_pred = model(input_ids=source, 
                                       attention_mask=mask)
                        
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        valid_loss += loss.item()

                # Store train and validation loss history
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)
                train_loss_list.append(train_loss)
                valid_loss_list.append(valid_loss)
                global_steps_list.append(global_step)

                # print summary
                print('Epoch [{}/{}], global step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss))
                
                # checkpoint
                if best_valid_loss > valid_loss:
                    best_valid_loss = valid_loss
                    save_checkpoint(output_path + '/RoBERTa.pkl', model, best_valid_loss)
                    save_metrics(output_path + '/metric.pkl', train_loss_list, valid_loss_list, global_steps_list)
                        
                train_loss = 0.0                
                valid_loss = 0.0
                model.train()
    
    save_metrics(output_path + '/metric.pkl', train_loss_list, valid_loss_list, global_steps_list)
    print('Training done!')

In [25]:
# Main training loop
NUM_EPOCHS = 6
steps_per_epoch = len(train_iter)

model = ROBERTAClassifier(0.4)
model = model.to(device)


optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=steps_per_epoch*1, 
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)

print("======================= Start pretraining ==============================")

pretrain(model,optimizer,train_iter,valid_iter,scheduler,NUM_EPOCHS)

NUM_EPOCHS = 12
print("======================= Start training =================================")
optimizer = AdamW(model.parameters(), lr=2e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=steps_per_epoch*2, 
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)

train(model, optimizer,train_iter, valid_iter, output_path, scheduler, NUM_EPOCHS)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch [1/6], global step [743/4458], PT Loss: 1.0932, Val Loss: 1.0546
Epoch [2/6], global step [1486/4458], PT Loss: 1.0486, Val Loss: 1.0463
Epoch [3/6], global step [2229/4458], PT Loss: 1.0412, Val Loss: 1.0430
Epoch [4/6], global step [2972/4458], PT Loss: 1.0353, Val Loss: 1.0384
Epoch [5/6], global step [3715/4458], PT Loss: 1.0335, Val Loss: 1.0326
Epoch [6/6], global step [4458/4458], PT Loss: 1.0361, Val Loss: 1.0275
Pre-training done!
Epoch [1/12], global step [743/8916], Train Loss: 0.9268, Valid Loss: 0.5672
Epoch [2/12], global step [1486/8916], Train Loss: 0.5584, Valid Loss: 0.4899
Epoch [4/12], global step [2972/8916], Train Loss: 0.4622, Valid Loss: 0.4606
Epoch [5/12], global step [3715/8916], Train Loss: 0.4465, Valid Loss: 0.4590
Epoch [6/12], global step [4458/8916], Train Loss: 0.4281, Valid Loss: 0.4619
Epoch [7/12], global step [5201/8916], Train Loss: 0.4065, Valid Loss: 0.4676
Epoch [8/12], global step [5944/8916], Train Loss: 0.4007, Valid Loss: 0.4676
Epoch

In [26]:
# Evaluation Function
def evaluate(model, test_loader):
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for (source, target), _ in test_loader:
                mask = (source != PAD_INDEX).type(torch.uint8)
                
                output = model(source, attention_mask=mask)

                y_pred.extend(torch.argmax(output, axis=-1).tolist())
                y_true.extend(target.tolist())
    
    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=[0,1,2], digits=4))


In [27]:
#Evaluation
model = ROBERTAClassifier()
model = model.to(device)

load_checkpoint(output_path + '/RoBERTa.pkl', model)

evaluate(model, test_iter)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Classification Report:
              precision    recall  f1-score   support

           0     0.8568    0.8797    0.8681       748
           1     0.5357    0.5636    0.5493       346
           2     0.9642    0.9500    0.9571      2582

    accuracy                         0.8993      3676
   macro avg     0.7856    0.7978    0.7915      3676
weighted avg     0.9020    0.8993    0.9006      3676

