XLM-Roberta on multilingual text

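Copy of XLM-Roberta_LL, made to experiment with the input tokens. Note: the code in this copy currently loads `bert-base-multilingual-cased` (tokenizer and encoder); the XLM-RoBERTa lines are commented out.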

0. Install and import libraries

In [None]:
## install libraries

!pip install transformers
!pip install sentencepiece

In [None]:
import pandas as pd
import numpy as np
import random
import json
import regex as re

import matplotlib.pyplot as plt
import os
import sentencepiece
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import XLMRobertaConfig, XLMRobertaModel, XLMRobertaTokenizer

# Release cached GPU memory and choose the compute device (GPU when available).
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mount Google Drive; all project files are addressed relative to LIBRARY_PATH.
from google.colab import drive
drive.mount('/content/drive')
LIBRARY_PATH = '/content/drive/MyDrive/NLP PROJECT/Finals/'

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
# and configure cuDNN for deterministic behaviour.
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Mounted at /content/drive


## 1. Read and prepare data for training

In [None]:
def clean_text(text, fixed_length = True, length = 256, head_len = 200, tail_len = 56):
  """
  Strip URLs, line breaks and apostrophes from `text` and optionally shorten it.

  Args:
    text: input string.
    fixed_length: when truthy, a text with more than `length` whitespace-separated
      tokens is shortened to its first `head_len` and last `tail_len` tokens.
    length: token-count threshold above which the text is shortened.
    head_len: number of leading tokens kept when shortening.
    tail_len: number of trailing tokens kept when shortening (0 keeps none).

  Returns:
    The cleaned string. Only shortened texts are re-joined with single spaces;
    texts at or below the threshold keep their spacing after cleaning.
  """
  text = re.sub(r'http\S+', '', text)   # drop URLs
  text = re.sub("\n|\r", " ", text)     # line breaks -> spaces
  text = re.sub("['']", "", text)       # drop apostrophes

  if fixed_length:
    tokens = text.split()
    if len(tokens) > length:
      head = tokens[:head_len]
      # tokens[-0:] is the WHOLE list, so tail_len == 0 must be handled explicitly.
      tail = tokens[-tail_len:] if tail_len > 0 else []
      text = ' '.join(head + tail)

  return text


In [None]:
# Read the pre-processed training pairs from the project's data/processed folder.
raw_data_path = LIBRARY_PATH + 'data/processed/'
raw_data_filename = 'train_DA.csv'
train_data = pd.read_csv(os.path.join(raw_data_path, raw_data_filename))


In [None]:
## EDA

## Check distribution of missing values
# Stack the first and second article of every pair into one long frame
# (duplicates are removed within each side before stacking).
eda_columns = ['ids', 'title', 'text', 'meta_keywords', 'meta_description']
eda_data1 = train_data[['link_id1', 'title1', 'text1', 'meta_keywords1', 'meta_description1']].drop_duplicates()
eda_data1.columns = eda_columns
eda_data2 = train_data[['link_id2', 'title2', 'text2', 'meta_keywords2', 'meta_description2']].drop_duplicates()
eda_data2.columns = eda_columns

eda_df = pd.concat([eda_data1, eda_data2])

## Print the distribution of the non-blank meta_keywords
# Blank keywords appear either as the literal string "['']" or as ''.
# The earlier literal '['']' was two adjacent strings ('[' and ']') that Python
# concatenates to "[]", so the "['']" rows were never actually filtered out.
blank_keywords = ["['']", '']
tmp = eda_df[~eda_df['meta_keywords'].isin(blank_keywords)]
print(tmp['meta_keywords'].value_counts())

['']                                                                                                                                                                                                                                                             7693
[أ]                                                                                                                                                                                                                                                               243
['Schweriner Volkszeitung', 'Der Prignitzer', 'Norddeutsche Neueste Nachrichten', 'Nachrichten', 'News', 'Reportagen', 'Meldungen', 'Videos', 'Bilder']                                                                                                            31
['Nachrichten', 'Inland', 'Ausland', 'Wirtschaft', 'Sport', 'Kultur Reportage', 'Bericht', 'News', 'Tagesthemen', 'Aktuell', 'Neu', 'Neuigkeiten', 'Hintergrund', 'Hintergrund', 'Information', 'Politik', 'Innenpolit

In [None]:
## Split into train and eval

# Keep only the pairs for which both article bodies are present.
has_both_texts = train_data['text1'].notna() & train_data['text2'].notna()
processed_data = train_data[has_both_texts]

dropped_rows = train_data.shape[0] - processed_data.shape[0]
print("After removing NA text columns, we lose {0} rows".format(dropped_rows))

After removing NA text columns, we lose 95 rows


In [None]:
def merge_clean_columns(df):
    """
    Merge multiple columns into one and clean text.

    For each article of a pair (column suffix 1 and 2) the meta keywords, meta
    description, title and body text are joined with ', ' and passed through
    `clean_text`; the result is stored in `merge1` / `merge2`.

    The work is done on a copy, so the caller's frame is not modified and
    passing a filtered slice no longer raises pandas' SettingWithCopyWarning.

    Args:
        df: DataFrame containing meta_keywords{1,2}, meta_description{1,2},
            title{1,2} and text{1,2}.

    Returns:
        A new DataFrame with the additional columns `merge1` and `merge2`.
    """
    df = df.copy()
    field_order = ('meta_keywords', 'meta_description', 'title', 'text')

    for suffix in ('1', '2'):
        # NOTE(review): astype(str) renders missing values as the literal text
        # 'nan', which then becomes part of the model input - confirm this is intended.
        merged = df[field_order[0] + suffix].astype(str)
        for field in field_order[1:]:
            merged = merged + ', ' + df[field + suffix].astype(str)
        df['merge' + suffix] = merged.apply(clean_text)

    return df

In [None]:
# Add the merged, cleaned text columns (merge1 / merge2) that get_data_loader reads.
processed_data = merge_clean_columns(processed_data)
# split into train and development
# 90% train / 10% dev, with a fixed random_state so the split is reproducible.
# NOTE(review): the name `train` is rebound by `def train(...)` further down the notebook.
train, dev = train_test_split(processed_data, test_size=0.1, random_state = 42)

## 2. Model on text data

In [None]:
## set parameters
# Global settings read by get_data_loader and by the training cell.
max_len = 512  # max_length given to the tokenizer: each encoded pair is padded/truncated to this
batch_size = 5  # batch size of the training DataLoader (batch_size_flg=True)
lr = 5e-6  # learning rate passed to AdamW
weight_decay = 1e-4  # weight decay passed to AdamW
num_epochs = 5  # number of epochs passed to train()

In [None]:
def get_data_loader(data, batch_size_flg = True):
  """
  Tokenize the article pairs in `data` and wrap them in a DataLoader.

  Reads the notebook-level settings `max_len` and `batch_size`.

  Args:
    data: DataFrame with the text columns `merge1` / `merge2` and the seven
      score columns Geography, Entities, Time, Narrative, Overall, Style, Tone.
    batch_size_flg: if True, return a shuffled loader with `batch_size` samples
      per batch that drops the last incomplete batch; otherwise return a loader
      built with DataLoader's default settings.

  Returns:
    DataLoader yielding (input_ids, attention_mask, labels) tensors. `labels`
    holds the seven scores in the column order listed above (Overall at index 4).
  """
  label_columns = ['Geography', 'Entities', 'Time', 'Narrative', 'Overall', 'Style', 'Tone']

  # tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
  tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

  # Encode every (merge1, merge2) pair in a single batched call instead of one
  # tokenizer call per DataFrame row.
  encode_dict = tokenizer(data['merge1'].tolist(), data['merge2'].tolist(),
                          max_length=max_len,
                          padding='max_length',
                          truncation=True,
                          add_special_tokens=True
                          )

  input_ids = torch.tensor(encode_dict['input_ids'])
  attention_masks = torch.tensor(encode_dict['attention_mask'])
  # model is used to predict all labels?? -> should we convert to only 1 label
  labels = torch.tensor(data[label_columns].to_numpy(dtype='float32'))

  dataset = TensorDataset(input_ids, attention_masks, labels)
  if batch_size_flg:
      return DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
  return DataLoader(dataset)

In [None]:
# Training loader uses shuffled batches (batch_size_flg defaults to True);
# the dev loader is built with batch_size_flg=False.
# NOTE(review): `train` must still be the DataFrame returned by train_test_split here;
# a later cell defines `def train(...)`, which rebinds the name, so re-running this
# cell after that definition will fail.
train_data_loader = get_data_loader(train)
eval_data_loader = get_data_loader(dev, False)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [None]:
class Custom_XLMRoberta(nn.Module):
    """Encoder followed by a two-layer regression head producing 7 scores.

    Args:
        model: encoder module, called as ``model(input_ids, attention_masks)``;
            element ``[1]`` of its output is fed to the head
            (presumably the pooled sequence representation - confirm for the encoder used).
        hidden_size: feature size of that element (input width of ``fc1``).
    """

    def __init__(self, model, hidden_size):
        super().__init__()
        self.reg_model = model
        # Attribute names and creation order are kept as-is: they determine the
        # state_dict keys and the order in which the head is initialised.
        self.fc1 = nn.Linear(hidden_size, 100)
        # Dropout (p=0.2) between the layers is currently disabled.
        self.fc2 = nn.Linear(100, 7)  # one output per label type (7 in total)
        self.activation = nn.GELU()

    def forward(self, input_ids, attention_masks):
        """Return a tensor with 7 predicted scores per input sequence."""
        encoder_outputs = self.reg_model(input_ids, attention_masks)
        pooled = encoder_outputs[1]
        hidden = self.activation(self.fc1(pooled))
        return self.fc2(hidden)


In [None]:
def predict(model, data_loader):
  """
  Run `model` over `data_loader` in eval mode and collect the Overall scores.

  Args:
    model: module called as model(input_ids, attention_masks) that returns one
      row of scores per sample.
    data_loader: yields (input_ids, attention_masks, labels) batches; any batch
      size is supported.

  Returns:
    (overall_pred, overall_true): two flat lists of floats, one entry per
    sample, holding the predicted and the true value at label index 4.

  Note:
    The model is left in eval mode when this function returns.
  """
  overall_idx = 4  # position of 'Overall' in the label vector built by get_data_loader

  # Send the batches to wherever the model's parameters live (CPU if it has none).
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  model.eval()
  overall_pred, overall_true = [], []
  with torch.no_grad():
    for ids, att_msks, y in data_loader:
      ids, att_msks, y = ids.to(target_device), att_msks.to(target_device), y.to(target_device)
      y_pred = model(ids, att_msks)
      # Select the column explicitly. The previous squeeze() + [4] was only
      # correct for batch size 1; with larger batches it indexed the 5th sample.
      overall_pred.extend(y_pred[:, overall_idx].cpu().tolist())
      overall_true.extend(y[:, overall_idx].cpu().tolist())
  return overall_pred, overall_true


def weighted_loss(y_pred, y, criterion, loss_weights):
  """
  Weighted sum of per-column losses.

  Column i of `y_pred` is compared with column i of `y` using `criterion`, and
  the result is scaled by `loss_weights[i]`. The number of columns used is
  taken from `len(loss_weights)` rather than being fixed at 7.

  Args:
    y_pred: 2-D tensor of predictions, one column per target.
    y: 2-D tensor of true values with the same layout as `y_pred`.
    criterion: callable (pred, target) -> scalar loss, e.g. nn.MSELoss().
    loss_weights: sequence with one weight per target column.

  Returns:
    The weighted loss (0.0 when `loss_weights` is empty).
  """
  loss = 0.0
  for i, weight in enumerate(loss_weights):
    loss += criterion(y_pred[:, i], y[:, i]) * weight
  return loss


def train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, epochs):
  """
  Fine-tune `model` and keep the checkpoint with the best dev-set correlation.

  After every epoch the model is scored on `eval_data_loader` via `predict`,
  and its state_dict is written to `model_path` whenever the Pearson
  correlation of the Overall score improves on the best value seen so far.

  Args:
    model: module called as model(input_ids, attention_masks).
    model_path: file path the best state_dict is saved to.
    train_data_loader: yields (input_ids, attention_masks, labels) training batches.
    eval_data_loader: loader handed to `predict` after each epoch.
    optimizer: optimizer over the model's parameters.
    loss_weights: per-target weights passed to `weighted_loss`.
    epochs: number of passes over `train_data_loader`.

  Returns:
    None. Prints the summed training loss and the dev correlation per epoch.
  """
  criterion = nn.MSELoss()

  # Send the batches to wherever the model's parameters live (CPU if it has none).
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  # Start below any possible correlation so the first valid epoch is always
  # saved, even if its correlation is zero or negative. (A NaN correlation
  # never compares greater and is therefore never saved.)
  best_pearson = float('-inf')
  for i in range(epochs):
    # predict() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of EVERY epoch, not just once before the loop.
    model.train()
    train_loss_sum = 0
    for ids, att_msks, y in train_data_loader:
      ids, att_msks, y = ids.to(target_device), att_msks.to(target_device), y.to(target_device)
      optimizer.zero_grad()
      y_pred = model(ids, att_msks)
      # No squeeze(): weighted_loss indexes columns with [:, i], and squeezing
      # would drop the batch dimension for a batch of size 1.
      loss = weighted_loss(y_pred, y, criterion, loss_weights)
      loss.backward()
      optimizer.step()
      train_loss_sum += loss.item()

    print(f"Loss at epoch {i}: {train_loss_sum:.4f}")

    ## Determine best epoch model using correlation coefficient for Overall in dev data
    eval_pred_overall, eval_true_overall = predict(model, eval_data_loader)
    curr_pearson = np.corrcoef(eval_pred_overall, eval_true_overall)[0][1]
    print(curr_pearson)
    if curr_pearson > best_pearson:
      best_pearson = curr_pearson
      torch.save(model.state_dict(), model_path)
    


### Training model

In [None]:
from transformers import BertTokenizer, BertModel

## run model finetuning and save fine-tuned model
# pre_trained_model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# pre_trained_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
pre_trained_model = BertModel.from_pretrained("bert-base-multilingual-cased")
# Read the encoder width from its config instead of hard-coding 768, so the
# regression head still matches if a different checkpoint is swapped in.
hidden_size = pre_trained_model.config.hidden_size

# The Overall score (label index 4) gets 70% of the loss weight; the remaining
# 30% is shared equally by the other six labels.
overall_weight = 0.7
loss_weights = [overall_weight if i == 4 else (1-overall_weight)/6 for i in range(7)]

model = Custom_XLMRoberta(pre_trained_model, hidden_size)
model.to(device)

# model_path = f"/content/drive/MyDrive/NLP PROJECT/Finals/ModelParams/XLM_Roberta_0.70_overall.pth"
# Same location as before, now built from LIBRARY_PATH like the data paths.
model_path = LIBRARY_PATH + "ModelParams/BERT_0.70_overall.pth"

## not used if run multiple notebooks concurrently
# for iter in range(1,1000):
#     model_name = 'XLM_Roberta_base'
#     model_name_iter = f"{model_name}_iter_{iter}.pth"
#     model_path = f"/content/drive/MyDrive/NLP PROJECT/Finals/ModelParams/{model_name_iter}"
#     if not os.path.exists(model_path):
#       break

#print(f"Model name for this run: {model_name_iter}")

optimizer = AdamW(model.parameters(), lr=lr, weight_decay = weight_decay)
train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, num_epochs)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loss at epoch 0: 1927.8117
0.8185398703768292
Loss at epoch 1: 704.4753
0.8587556091152351
Loss at epoch 2: 478.5713
0.8761291345170551
Loss at epoch 3: 366.2130
0.8742642047474773
Loss at epoch 4: 314.4971
0.8856423707260226


## 3. Evaluation on test data

In [None]:
#### Get data
data_path = LIBRARY_PATH + 'data/processed/'
filename = 'paired_eval.csv'
path = data_path + filename
tmp_file = pd.read_csv(path)

tmp_file_new = tmp_file.drop_duplicates()
# Drop na for text 1 and text 2
test_dropna_text1 = tmp_file_new[tmp_file_new['text1'].notna()]
# .copy(): merge_clean_columns adds columns to the frame it receives; handing it
# a filtered slice is what triggered pandas' SettingWithCopyWarning.
test_dropna_text1_2 = test_dropna_text1[test_dropna_text1['text2'].notna()].copy()
# Merge data
processed_test_data = merge_clean_columns(test_dropna_text1_2)

# Rename the score columns of the eval file to the names get_data_loader reads.
processed_test_data = processed_test_data.rename(columns = {'GEO': 'Geography', \
                                                            'ENT': 'Entities', \
                                                            'TIME': 'Time', \
                                                            'NAR': 'Narrative', \
                                                            'STYLE': 'Style', \
                                                            'TONE': 'Tone'})


test_data_loader = get_data_loader(processed_test_data, False)
# row['Geography'],row['Entities'],row['Time'],row['Narrative'],row['Overall'],row['Style'],row['Tone']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

In [None]:
# config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")
pre_trained_model = BertModel.from_pretrained("bert-base-multilingual-cased")
# hidden_size = config.hidden_size
# Read the encoder width from its config instead of hard-coding 768.
hidden_size = pre_trained_model.config.hidden_size
# pre_trained_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
model = Custom_XLMRoberta(pre_trained_model, hidden_size)
#model.load_state_dict(torch.load(model_path))
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
state_dict = torch.load("/content/drive/MyDrive/NLP PROJECT/Finals/ModelParams/BERT_0.70_overall.pth", map_location=device)
# strict=False tolerates key differences; report them instead of ignoring them
# silently, otherwise a partly-loaded model would be evaluated without notice.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print("WARNING: checkpoint and model keys differ.")
    print("  missing keys:", load_result.missing_keys)
    print("  unexpected keys:", load_result.unexpected_keys)
model.to(device)

test_pred_overall, test_true_overall = predict(model, test_data_loader)
test_pearson_score = np.corrcoef(test_pred_overall, test_true_overall)[0][1]

print("Pearson score on test dataset is {:.3f}".format(test_pearson_score))

# Score on all of processed_data, i.e. the rows used for training and the dev rows.
train_all = get_data_loader(processed_data, False)
train_pred_overall, train_true_overall = predict(model, train_all)
train_pearson_score = np.corrcoef(train_pred_overall, train_true_overall)[0][1]
print("Pearson score on entire train dataset is {:.3f}".format(train_pearson_score))



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pearson score on test dataset is 0.727
Pearson score on entire train dataset is 0.972


In [None]:
# Score the currently loaded model on all of processed_data (training and dev rows).
train_all = get_data_loader(processed_data, batch_size_flg=False)
train_pred_overall, train_true_overall = predict(model, train_all)
correlation_matrix = np.corrcoef(train_pred_overall, train_true_overall)
train_pearson_score = correlation_matrix[0, 1]
print("Pearson score on entire train dataset is {:.3f}".format(train_pearson_score))

Pearson score on entire train dataset is 0.967
