In [None]:
 import tensorflow as tf
# Ask TensorFlow for the name of the GPU device it can see and abort early
# when it is not the expected first GPU.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch
# Choose the torch device used by the rest of the notebook: CUDA when a GPU
# is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
#Importing necessary libraries
!pip install transformers

import re
import scipy
import pandas as pd
import io
import numpy as np
import copy
import seaborn as sns

import transformers
from transformers import RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import class_weight

from torch import nn, optim
from torch.utils import data
from sklearn.decomposition import PCA

#Seeding for deterministic results
# The same seed is applied to NumPy and to torch (CPU and, when present, CUDA).
RANDOM_SEED = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED) 
    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = False

# CLASS_NAMES = ['None','MajorClaim','Claim','Premise']
# Index 0 is Non-ADU and index 1 is ADU, matching label_to_int / int_to_label;
# the list is also passed as target_names to classification_report.
CLASS_NAMES =['Non-ADU','ADU']

# MAX_LENGTH   - max_length handed to the tokenizer by the datasets
# BATCH_SIZE   - batch size of every DataLoader built in this notebook
# EPOCHS       - number of passes of the fine-tuning loop
# HIDDEN_UNITS - width of the Tfidf_Nn hidden layer; also used to size the
#                classifier heads that concatenate RoBERTa with TF-IDF/PCA features
MAX_LENGTH = 100                                    
BATCH_SIZE = 4
EPOCHS = 7
HIDDEN_UNITS = 128

# Notebook-wide tokenizer; it is passed to every data loader created below.
tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-large')  #Use roberta-large or roberta-base

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 38.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [None]:
# Collapse the four numeric ADU-type codes into the binary target.
def label_to_int(label):
    """Map a numeric ADU-type code to the binary class index.

    Args:
        label: numeric code; 0, 1, 2 and 3 are recognised.

    Returns:
        0 for code 0, 1 for codes 1, 2 and 3, and None for anything else.
    """
    if label == 0:
        return 0
    if label in (1, 2, 3):
        return 1
    return None

In [None]:
# Collapse the four textual ADU types into the two class names.
def adu_nonadu(label):
    """Map a textual ADU type to 'ADU' or 'Non-ADU'.

    Args:
        label: one of 'None', 'MajorClaim', 'Claim' or 'Premise'.

    Returns:
        'Non-ADU' for 'None', 'ADU' for the three argumentative types,
        and None for any other value.
    """
    if label == 'None':
        return 'Non-ADU'
    if label in ('MajorClaim', 'Claim', 'Premise'):
        return 'ADU'
    return None

In [None]:
def processStanceData(df):
    """Build the model-ready frame from a raw annotation frame.

    NaN cells are replaced by empty strings, a binary ``labelValue`` and a
    two-class ``ADU_Type`` are derived from the original ``ADU_Type`` column,
    ``Text`` is duplicated into ``TextSrcInre`` and the sentence and paragraph
    labels are joined (space separated) into ``Features``.

    NOTE(review): ``label_to_int`` recognises numeric codes while
    ``adu_nonadu`` recognises the textual names, yet both read the same
    original ``ADU_Type`` column - confirm which encoding the input uses,
    since the other derived column would come out as None.

    Args:
        df: frame with the columns Text, Topic, Para_No, ADU_Type,
            Sentence_Label and Paragraph_Label.

    Returns:
        A copy holding Text, Topic, TextSrcInre, Para_No, ADU_Type,
        labelValue and Features, in that order.
    """
    # Getting rid of NaN values
    cleaned = df.replace(np.nan, '', regex=True)

    # Both derived columns are computed from the untouched ADU_Type, so the
    # numeric label has to be produced before ADU_Type is overwritten.
    numeric_labels = cleaned.ADU_Type.apply(label_to_int)
    binary_names = cleaned.ADU_Type.apply(adu_nonadu)
    cleaned['labelValue'] = numeric_labels
    cleaned['ADU_Type'] = binary_names

    cleaned['TextSrcInre'] = cleaned['Text']
    # Sentence- and paragraph-level labels joined into one feature string
    # (persing 2010 paper).
    cleaned['Features'] = cleaned['Sentence_Label'].str.cat(cleaned['Paragraph_Label'], sep=" ")

    wanted = ['Text', 'Topic', 'TextSrcInre', 'Para_No', 'ADU_Type', 'labelValue', 'Features']
    return cleaned[wanted].copy()

In [None]:
# Reading data from AAE premise and claims file as dataFrames
# Mounts Google Drive inside the Colab runtime so the data files are reachable.

from google.colab import drive
drive.mount('/content/gdrive')

# Directory holding the data files; file names are appended to it when the
# train / dev / test CSVs are read.
path= F"/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/Data/" 

Mounted at /content/gdrive


In [None]:
# trainDf=pd.read_csv(path+'model1_train_CMV+AAE.csv',sep="\t",index_col=False)
# trainDf=trainDf.drop(columns='Unnamed: 0')
# devDf=pd.read_csv(path+'model1_dev_CMV+AAE.csv',sep="\t",index_col=False)
# devDf=devDf.drop(columns='Unnamed: 0')
# testDf=pd.read_csv(path+'model1_test_CMV+AAE.csv',sep="\t",index_col=False)
# testDf=testDf.drop(columns='Unnamed: 0')

# trainDf.labelValue=trainDf.labelValue.apply(label_to_int)
# devDf.labelValue=devDf.labelValue.apply(label_to_int)
# testDf.labelValue=testDf.labelValue.apply(label_to_int)


### ---------- MODEL 1 modified
def _read_split(file_name):
    """Read one tab-separated split from `path` and binarise its labels.

    The 'Unnamed: 0' column (presumably a saved index) is dropped and
    `labelValue` is passed through `label_to_int`.
    """
    frame = pd.read_csv(path + file_name, sep="\t", index_col=False)
    frame = frame.drop(columns='Unnamed: 0')
    frame.labelValue = frame.labelValue.apply(label_to_int)
    return frame


# Train and dev come from the CMV+AAE files, test from the AAE-only file.
trainDf = _read_split('model1_train_CMV+AAE.csv')
devDf = _read_split('model1_dev_CMV+AAE.csv')
testDf = _read_split('model_2_3_testDf_aae.csv')

### ---------- MODEL 2

# trainDf=pd.read_csv(path+'model_2_train_CMV+AAE.csv',sep="\t",index_col=False)
# trainDf=trainDf.drop(columns='Unnamed: 0')
# devDf=pd.read_csv(path+'model_2_dev_CMV+AAE.csv',sep="\t",index_col=False)
# devDf=devDf.drop(columns='Unnamed: 0')
# testDf=pd.read_csv(path+'model_2_3_testDf_aae.csv',sep="\t",index_col=False)
# testDf=testDf.drop(columns='Unnamed: 0')

# trainDf.labelValue=trainDf.labelValue.apply(label_to_int)
# devDf.labelValue=devDf.labelValue.apply(label_to_int)
# testDf.labelValue=testDf.labelValue.apply(label_to_int)

### ---------- MODEL 3

# trainDf=pd.read_csv(path+'model3_trainDf_aae.csv',sep="\t",index_col=False)
# trainDf=trainDf.drop(columns='Unnamed: 0')
# devDf=pd.read_csv(path+'model3_devDf_aae.csv',sep="\t",index_col=False)
# devDf=devDf.drop(columns='Unnamed: 0')
# testDf=pd.read_csv(path+'model_2_3_testDf_aae.csv',sep="\t",index_col=False)
# testDf=testDf.drop(columns='Unnamed: 0')

# trainDf.labelValue=trainDf.labelValue.apply(label_to_int)
# devDf.labelValue=devDf.labelValue.apply(label_to_int)
# testDf.labelValue=testDf.labelValue.apply(label_to_int)

In [None]:
# "balanced" class weights computed from the training labels, so that the
# rarer class receives the larger weight in the loss.
y_train = trainDf['labelValue'].tolist()
class_weights = class_weight.compute_class_weight(
    class_weight="balanced", classes=np.unique(y_train), y=y_train
)

class_weights

array([2.87362086, 0.60532432])

In [None]:
# Number of rows in the train, development and test frames.
print(len(trainDf), len(devDf),len(testDf))

5730 1012 888


In [None]:
# Class distribution (ADU vs Non-ADU) of each split, in train / dev / test order.
print(trainDf.ADU_Type.value_counts())
print(devDf.ADU_Type.value_counts())
print(testDf.ADU_Type.value_counts())

ADU        4733
Non-ADU     997
Name: ADU_Type, dtype: int64
ADU        840
Non-ADU    172
Name: ADU_Type, dtype: int64
ADU        720
Non-ADU    168
Name: ADU_Type, dtype: int64


In [None]:
#Creates a dataset which will be used to feed to RoBERTa
class StanceDataset(data.Dataset):
    """Labelled sentence-pair dataset that tokenises examples on demand.

    Item ``i`` encodes ``TextSrcInre[i]`` as the first sequence and
    ``Features[i]`` as the second, and carries ``labelValue[i]`` as target.
    """

    def __init__(self, TextSrcInre, Features, labelValue, tokenizer, max_len):
        """Store the parallel sequences and the tokenisation settings.

        Args:
            TextSrcInre: indexable collection of the main texts.
            Features: indexable collection of the second-sequence strings.
            labelValue: indexable collection of integer class labels.
            tokenizer: object exposing ``encode_plus`` (a RoBERTa tokenizer
                in this notebook).
            max_len: length every encoded example is truncated/padded to.
        """
        self.TextSrcInre = TextSrcInre
        self.Features = Features
        self.labelValue = labelValue
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labelValue)

    def __getitem__(self, item):
        """Return the encoded example at position ``item``.

        Returns:
            dict with the raw ``TextSrcInre`` and ``Features`` strings, the
            flattened ``input_ids`` and ``attention_mask`` tensors and the
            ``labelValue`` as a long tensor.
        """
        TextSrcInre = str(self.TextSrcInre[item])
        Features = str(self.Features[item])

        # Tokenise with the tokenizer given to the constructor rather than a
        # notebook-level global, so the dataset works with any tokenizer.
        encoding = self.tokenizer.encode_plus(
            TextSrcInre,
            Features,
            max_length=self.max_len,
            add_special_tokens=True,
            truncation=True,
            # 'max_length' padding is the supported spelling of the
            # deprecated pad_to_max_length=True flag.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'TextSrcInre': TextSrcInre,
            'Features': Features,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labelValue': torch.tensor(self.labelValue[item], dtype=torch.long)
        }


In [None]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size, shuffle=True, num_workers=2):
    """Wrap a labelled frame in a ``StanceDataset`` and return its DataLoader.

    Args:
        dataframe: frame providing the TextSrcInre, Features and labelValue
            columns.
        tokenizer: tokenizer forwarded to the dataset.
        max_len: padded/truncated sequence length forwarded to the dataset.
        batch_size: number of examples per batch.
        shuffle: whether batches are drawn in random order. Defaults to True,
            which is what every positional call gets; pass False for
            evaluation loaders when a stable example order is wanted.
        num_workers: number of loader worker processes (default 2).

    Returns:
        torch.utils.data.DataLoader over the dataset.
    """
    ds = StanceDataset(
        TextSrcInre=dataframe.TextSrcInre.to_numpy(),
        Features=dataframe.Features.to_numpy(),
        labelValue=dataframe.labelValue.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return data.DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

In [None]:
# All three loaders share the notebook-wide tokenizer, MAX_LENGTH and BATCH_SIZE.
#Creating data loader for training data
trainDataLoader        = createDataLoader(trainDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for development data
developmentDataLoader  = createDataLoader(devDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for test data
testDataLoader         = createDataLoader(testDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [None]:
# Number of batches per loader (train, development, test).
print(len(trainDataLoader))
print(len(developmentDataLoader))
print(len(testDataLoader))

1433
253
222


In [None]:
#Instantiating the tf-idf vectorizer object
# Unigrams and bigrams that occur in at least 10 training texts and in at
# most half of them.
tfidf = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))

x_train = trainDf['TextSrcInre'].tolist()
# fit() hands back the fitted vectorizer, so x_train_feats and tfidf can be
# used interchangeably by the later cells.
x_train_feats = tfidf.fit(x_train)
print('x_train_feats: ',x_train_feats)
# vocabulary_ holds one entry per retained feature. It is used instead of
# get_feature_names(), which newer scikit-learn releases no longer provide.
print('length: ',len(x_train_feats.vocabulary_))


x_train_transform = x_train_feats.transform(x_train)
# Dense float32 copy of the sparse TF-IDF matrix.
tfidf_transform_tensor = torch.tensor(x_train_transform.toarray()).float()
print('x_train_transform.shape: ',x_train_transform.shape)


# PCA projection of the TF-IDF vectors. The 128 components match
# HIDDEN_UNITS, which StanceClassifier uses to size its PCA head.
pca = PCA(n_components=128)
p = pca.fit(tfidf_transform_tensor)
print(p)
X = p.transform(tfidf_transform_tensor)
X = torch.from_numpy(X)
print(X.type())
print(X.shape)
print(X)


x_train_feats:  TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2))
length:  2255




x_train_transform.shape:  (5730, 2255)
PCA(n_components=128)
torch.DoubleTensor
torch.Size([5730, 128])
tensor([[-0.0486,  0.0472,  0.0394,  ...,  0.0370,  0.0552, -0.0186],
        [ 0.0255, -0.0752,  0.0100,  ...,  0.0109, -0.0187,  0.0272],
        [ 0.0091, -0.2001,  0.0163,  ..., -0.0071, -0.0624,  0.0087],
        ...,
        [-0.0468, -0.1236,  0.0096,  ...,  0.0614,  0.0498, -0.0152],
        [ 0.0719,  0.0375, -0.0198,  ..., -0.0673,  0.0073,  0.0326],
        [-0.0205,  0.0989, -0.0049,  ...,  0.0531, -0.0612, -0.0165]],
       dtype=torch.float64)


In [None]:
#This class defines the model that was used to pre-train a SNN on TF-IDF features
class Tfidf_Nn(nn.Module):
    """Single-hidden-layer network over TF-IDF vectors with a 2-class output.

    Args:
        n_features: width of the input vectors. When omitted it is taken from
            the size of the fitted notebook-level ``tfidf`` vocabulary.
        hidden_units: width of the hidden layer. When omitted the
            notebook-level ``HIDDEN_UNITS`` constant is used.
    """

    def __init__(self, n_features=None, hidden_units=None):
        super().__init__()

        if n_features is None:
            # vocabulary_ has one entry per feature and, unlike
            # get_feature_names(), exists in every scikit-learn release.
            n_features = len(tfidf.vocabulary_)
        if hidden_units is None:
            hidden_units = HIDDEN_UNITS

        # Layer names and creation order are kept so that saved state dicts
        # keep loading.
        # Inputs to hidden layer linear transformation
        self.hidden = nn.Linear(n_features, hidden_units)
        # Output layer
        self.output = nn.Linear(hidden_units, 2)
        self.dropout = nn.Dropout(0.1)

        # Defining tanh activation and softmax output
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run the network on a batch of TF-IDF vectors.

        Args:
            x: tensor whose last dimension equals the input width.

        Returns:
            Tuple ``(hidden, probs)``: the tanh-activated hidden layer (taken
            before dropout) and the softmax over the two output units.
        """
        hidden = self.tanh(self.hidden(x))
        # Dropout only feeds the output layer; the returned hidden
        # activations are the undropped ones.
        scores = self.output(self.dropout(hidden))
        probs = self.softmax(scores)

        #Returning the ouputs from the hidden layer and the final output layer
        return hidden, probs
    

In [None]:
# Rebuild the TF-IDF network and restore the weights it was pre-trained with.
snnmodel = Tfidf_Nn()

# model_save_name = 'Mlp_Adu_NonAdu_AAE.pt'
model_save_name = 'Mlp_step1_model_1_modified.pt'
# The checkpoint location gets its own variable: `path` is the data directory
# used by the CSV-loading cell, and overwriting it makes that cell fail when
# it is re-run.
snn_checkpoint_path = F"/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/{model_save_name}"

# snnmodel still lives on the CPU here, so the tensors are mapped to the CPU;
# this also lets a checkpoint saved from a GPU load on a CPU-only runtime.
snnmodel.load_state_dict(torch.load(snn_checkpoint_path, map_location='cpu'))
snnmodel.eval()



Tfidf_Nn(
  (hidden): Linear(in_features=2255, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (tanh): Tanh()
  (softmax): Softmax(dim=1)
)

In [None]:
class StanceClassifier(nn.Module):
    '''Model used for training and testing on the dataset.

    Adapted from huggingFace.
    This RoBERTa model from huggingface outputs the last hidden states
    and the pooled output by default. Pooled output is the classification
    token (1st token of the last hidden state) further processed by a Linear
    layer and a Tanh activation function.

    The pre-trained RoBERTa model is used as the primary model.
    This class experiments with RoBERTa and its ensemble with TF-IDF features;
    the variant is selected per call through ``modelType``:

    roberta-only :         No ensembling. This just fine-tunes the RoBERTa
                           model. The pooled output is passed through dropout
                           and a linear layer.

    roberta-tfIdf :        Concatenates the pooled output from RoBERTa with
                           the raw TF-IDF features.

    roberta-pcaTfidf :     Concatenates the pooled output from RoBERTa with
                           the PCA transformed TF-IDF vector.

    roberta-TrainedTfIdf : Concatenates the pooled output from RoBERTa with
                           the hidden layer output of the pre-trained SNN
                           that was trained on TF-IDF features.

    Dropout is used to prevent over-fitting. ``forward`` returns softmax
    probabilities, not raw scores.
    '''

    def __init__(self, n_classes):
        """Create the encoder and one linear head per ensemble variant.

        Reads the notebook-level ``snnmodel``, ``tfidf`` and ``HIDDEN_UNITS``.

        Args:
            n_classes: number of output classes.
        """
        super(StanceClassifier, self).__init__()
        # Layers are created in a fixed order so that, for a given seed,
        # their random initialisation stays the same.
        self.robertaModel              = RobertaModel.from_pretrained('roberta-large')    #use roberta-large or roberta-base
        self.model_TFIDF               = snnmodel                                        #Pre-trained SNN trained with TF-IDF features

        self.drop                      = nn.Dropout(p = 0.3)

        self.output                    = nn.Linear(self.robertaModel.config.hidden_size, n_classes)

        # vocabulary_ has one entry per TF-IDF feature and, unlike
        # get_feature_names(), exists in every scikit-learn release.
        self.input_size_tfidf_only     = self.robertaModel.config.hidden_size + len(tfidf.vocabulary_)
        self.input_size_tfidf_pca      = self.robertaModel.config.hidden_size + HIDDEN_UNITS

        self.dense                     = nn.Linear( self.input_size_tfidf_only,  self.input_size_tfidf_only)
        self.out_proj                  = nn.Linear( self.input_size_tfidf_only, n_classes)
        self.out_pca                   = nn.Linear( self.input_size_tfidf_pca, n_classes)

        self.input_size_preTrain_tfidf = self.robertaModel.config.hidden_size +  HIDDEN_UNITS
        self.out                       = nn.Linear(self.input_size_preTrain_tfidf, n_classes)

        self.softmax                   = nn.Softmax(dim = 1)

    def forward(self, input_ids, attention_mask, inputs_tfidf_feats, pca_transformed_feats, modelType):
        """Classify a batch with the head selected by ``modelType``.

        Args:
            input_ids: input sequence tokens.
            attention_mask: mask to avoid performing attention on padding
                tokens.
            inputs_tfidf_feats: TF-IDF vectors of the batch (used by
                'roberta-tfIdf' and 'roberta-TrainedTfIdf').
            pca_transformed_feats: PCA-reduced TF-IDF vectors of the batch
                (used by 'roberta-pcaTfidf').
            modelType: one of 'roberta-only', 'roberta-tfIdf',
                'roberta-pcaTfidf' or 'roberta-TrainedTfIdf'.

        Returns:
            Softmax class probabilities, one row per example.

        Raises:
            ValueError: if ``modelType`` is not one of the supported names.
        """
        roberta_output = self.robertaModel(
            input_ids      = input_ids,
            attention_mask = attention_mask)

        if modelType == 'roberta-only':
            pooled_output = roberta_output[1]           #Using pooled output
            output        = self.drop(pooled_output)
            output        = self.output(output)

        elif modelType == 'roberta-tfIdf':
            soutput = roberta_output[1]                  #experimenting with pooled output
            #soutput = roberta_output[0][:, 0, :]        #taking <s> token (equivalent to [CLS] token in BERT)
            x       = torch.cat((soutput, inputs_tfidf_feats), dim=1)
            x       = self.drop(x)
            output  = self.out_proj(x)

        elif modelType == 'roberta-pcaTfidf':
            soutput = roberta_output[1]
            x       = torch.cat((soutput, pca_transformed_feats), dim=1)
            x       = self.drop(x)
            output  = self.out_pca(x)

        elif modelType == 'roberta-TrainedTfIdf':
            tfidf_hidddenLayer, tfidf_output = self.model_TFIDF(inputs_tfidf_feats)

            #Concatenating pooled output from RoBERTa with the hidden layer from the pre-trained SNN using TF-IDF features.
            #pooled_output = torch.cat((roberta_output[1], tfidf_output) , dim=1)-------- Experimenting with Output of pre-trained SNN
            pooled_output = torch.cat((roberta_output[1], tfidf_hidddenLayer), dim=1)
            output        = self.drop(pooled_output)
            output        = self.out(output)

        else:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError(
                "Unknown modelType {!r}; expected 'roberta-only', 'roberta-tfIdf', "
                "'roberta-pcaTfidf' or 'roberta-TrainedTfIdf'".format(modelType)
            )

        return self.softmax(output)



In [None]:
#Instantiating a StanceClassifier object as our model and loading the model onto the GPU.
# One output unit per entry of CLASS_NAMES. `device` is the CPU when no GPU
# was found, in which case the model stays there.
model = StanceClassifier(len(CLASS_NAMES))
model = model.to(device)
#print(model)

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Using the same optimiser as used in the BERT paper (AdamW), with a
# different learning rate.
optimizer = AdamW(model.parameters(),
                  lr = 2e-6,
                  # lr = 1e-5,
                  correct_bias= False)

# One scheduler step is taken per training batch, so the linear decay spans
# every batch of every epoch.
totalSteps = len(trainDataLoader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps = totalSteps
)

# Class weights accommodate the heavily imbalanced data. They are the
# "balanced" weights computed from the training labels earlier in the
# notebook (class_weights), not hand-tuned values.
# weights      = [1.0, 1.0, 1.0, 1.0]
weights = class_weights
# torch.tensor with an explicit dtype replaces the legacy FloatTensor
# constructor and converts the float64 NumPy weights to float32.
classWeights = torch.tensor(weights, dtype=torch.float)
lossFunction = nn.CrossEntropyLoss(weight = classWeights).to(device)



In [None]:
#This function is used for training the model with 'roberta-TrainedTfIdf'.
def train_epoch(
    model,
    dataLoader,
    lossFunction,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``dataLoader``.

    TF-IDF and PCA features are computed per batch from the raw text with the
    notebook-level fitted ``x_train_feats`` vectorizer and ``p`` PCA object.

    Args:
        model: classifier called with input_ids, attention_mask,
            inputs_tfidf_feats, pca_transformed_feats and modelType; it is
            expected to return class probabilities (StanceClassifier ends in
            a softmax).
        dataLoader: yields dict batches with the keys "input_ids",
            "attention_mask", "labelValue" and "TextSrcInre".
        lossFunction: criterion that takes unnormalised scores, such as
            nn.CrossEntropyLoss.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: number of examples in the loader, used as the accuracy
            denominator.

    Returns:
        Tuple of (mean batch loss, accuracy as a tensor).
    """
    model = model.train()
    losses = []
    correctPredictions = 0

    for d in dataLoader:

        input_ids              = d["input_ids"].to(device)                           #Loading input ids to GPU
        attention_mask         = d["attention_mask"].to(device)                      #Loading attention mask to GPU
        labelValues            = d["labelValue"].to(device)                          #Loading label value to GPU
        textSrcInre            = d["TextSrcInre"]
        tfidf_transform        = x_train_feats.transform(textSrcInre)
        tfidf_transform_tensor = torch.tensor(tfidf_transform.toarray()).float()
        # PCA runs on the CPU copy, so the tensors are moved to the device
        # only afterwards.
        pca_tensor             = p.transform(tfidf_transform_tensor)

        pca_tensor = torch.from_numpy(pca_tensor).float()
        pca_tensor = pca_tensor.to(device)
        tfidf_transform_tensor = tfidf_transform_tensor.to(device)

        #Getting the output from our model (Object of StanceClassification class) for train data
        outputs = model(
            input_ids             = input_ids,
            attention_mask        = attention_mask,
            inputs_tfidf_feats    = tfidf_transform_tensor,
            pca_transformed_feats = pca_tensor,
            modelType             = 'roberta-TrainedTfIdf'
        )

        #Determining the model predictions
        _, predictionIndices = torch.max(outputs, dim=1)

        # The model already applied softmax, while the criterion applies
        # log-softmax itself. Passing log-probabilities avoids a second
        # softmax: log_softmax(log p) == log p when p sums to one. The clamp
        # keeps log() finite for probabilities that underflow to zero.
        loss = lossFunction(torch.log(outputs.clamp_min(1e-12)), labelValues)

        #Calculating the correct predictions for accuracy
        correctPredictions += torch.sum(predictionIndices == labelValues)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses), correctPredictions.double() / n_examples


In [None]:
#This function is used for evaluating the model on the development and test set
def eval_model(
    model,
    dataLoader,
    lossFunction,
    device,
    n_examples
):
    """Compute loss and accuracy over ``dataLoader`` without updating weights.

    TF-IDF and PCA features are computed per batch from the raw text with the
    notebook-level fitted ``x_train_feats`` vectorizer and ``p`` PCA object.

    Args:
        model: classifier called with input_ids, attention_mask,
            inputs_tfidf_feats, pca_transformed_feats and modelType; it is
            expected to return class probabilities (StanceClassifier ends in
            a softmax).
        dataLoader: yields dict batches with the keys "input_ids",
            "attention_mask", "labelValue" and "TextSrcInre".
        lossFunction: criterion that takes unnormalised scores, such as
            nn.CrossEntropyLoss.
        device: device the batch tensors are moved to.
        n_examples: number of examples in the loader, used as the accuracy
            denominator.

    Returns:
        Tuple of (mean batch loss, accuracy as a tensor).
    """
    model = model.eval()
    losses = []
    correctPredictions = 0

    with torch.no_grad():
        for d in dataLoader:
            input_ids              = d["input_ids"].to(device)                          #Loading input ids to GPU
            attention_mask         = d["attention_mask"].to(device)                     #Loading attention mask to GPU
            labelValues            = d["labelValue"].to(device)                         #Loading label values to GPU
            textSrcInre            = d["TextSrcInre"]
            tfidf_transform        = x_train_feats.transform(textSrcInre)
            tfidf_transform_tensor = torch.tensor(tfidf_transform.toarray()).float()

            # PCA runs on the CPU copy, so the tensors are moved to the
            # device only afterwards.
            pca_tensor             = p.transform(tfidf_transform_tensor)

            pca_tensor = torch.from_numpy(pca_tensor).float()
            pca_tensor = pca_tensor.to(device)
            tfidf_transform_tensor = tfidf_transform_tensor.to(device)

            #Getting the softmax output from model for dev data
            outputs = model(
                input_ids             = input_ids,
                attention_mask        = attention_mask,
                inputs_tfidf_feats    = tfidf_transform_tensor,
                pca_transformed_feats = pca_tensor,
                modelType             = 'roberta-TrainedTfIdf'
            )

            #Determining the model predictions
            _, predictionIndices = torch.max(outputs, dim=1)

            # The model already applied softmax, while the criterion applies
            # log-softmax itself. Passing log-probabilities avoids a second
            # softmax: log_softmax(log p) == log p when p sums to one. The
            # clamp keeps log() finite for probabilities that underflow to
            # zero.
            loss = lossFunction(torch.log(outputs.clamp_min(1e-12)), labelValues)

            #Calculating the correct predictions for accuracy
            correctPredictions += torch.sum(predictionIndices == labelValues)
            losses.append(loss.item())

    return np.mean(losses), correctPredictions.double() / n_examples


In [None]:
#fine tuning ROBERTa and validating it 
# Each epoch runs one training pass followed by an evaluation on the
# development set; loss and accuracy of both are printed.

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}')
    trainLoss, trainAccuracy = train_epoch(
        model,
        trainDataLoader,
        lossFunction,
        optimizer,
        device,
        scheduler,
        len(trainDf)
      )
    print(f'Training loss {trainLoss} Training accuracy {trainAccuracy}')
    devLoss, devAccuracy = eval_model(
        model,
        developmentDataLoader,
        lossFunction,
        device,
        len(devDf)
      )
    print(f'Development loss {devLoss} Development accuracy {devAccuracy}')
    # Two blank lines separate consecutive epochs in the output.
    print()
    print()

Epoch 1


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Training loss 0.49331903172720587 Training accuracy 0.8668411867364747




Development loss 0.47815606516340503 Development accuracy 0.875494071146245


Epoch 2


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Training loss 0.4728093569876011 Training accuracy 0.8792321116928448




Development loss 0.48174248464965064 Development accuracy 0.875494071146245


Epoch 3


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Training loss 0.48200922345499514 Training accuracy 0.8877835951134381




Development loss 0.47776027100359497 Development accuracy 0.8962450592885375


Epoch 4


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Training loss 0.46928191164200406 Training accuracy 0.899825479930192




Development loss 0.45007069461901666 Development accuracy 0.91699604743083


Epoch 5


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Training loss 0.4407467870594152 Training accuracy 0.9197207678883073




Development loss 0.4377692511430371 Development accuracy 0.9229249011857706


Epoch 6


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Training loss 0.4206650354233307 Training accuracy 0.9289703315881327




Development loss 0.4312137328824507 Development accuracy 0.924901185770751


Epoch 7


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Training loss 0.4193476529405904 Training accuracy 0.9338568935427575




Development loss 0.44239405987291 Development accuracy 0.9229249011857706




In [None]:
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader):
    """Run the trained model over a labelled loader and collect its outputs.

    Relies on the notebook-level ``device``, the fitted ``tfidf`` vectorizer
    and the fitted PCA object ``p``.

    Args:
        model: classifier called in 'roberta-TrainedTfIdf' mode.
        data_loader: yields dict batches with the keys "TextSrcInre",
            "input_ids", "attention_mask" and "labelValue".

    Returns:
        Tuple of (texts, predictions, prediction_probs, real_values): the raw
        input strings, the predicted class indices, the model outputs and the
        gold labels. The three tensors are returned on the CPU.
    """
    model = model.eval()
    texts = []
    pred_batches = []
    prob_batches = []
    label_batches = []

    with torch.no_grad():
        for batch in data_loader:
            batch_texts = batch["TextSrcInre"]
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labelValue"].to(device)

            # Dense TF-IDF vectors of the batch and their PCA projection.
            tfidf_feats = torch.tensor(
                scipy.sparse.csr_matrix.todense(tfidf.transform(batch_texts))
            ).float()
            pca_feats = torch.from_numpy(p.transform(tfidf_feats)).float().to(device)
            tfidf_feats = tfidf_feats.to(device)

            #Getting the softmax output from model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_tfidf_feats=tfidf_feats,
                pca_transformed_feats=pca_feats,
                modelType='roberta-TrainedTfIdf'
            )
            # Index of the highest-scoring class for every example.
            preds = torch.max(outputs, dim=1)[1]

            texts.extend(batch_texts)
            pred_batches.append(preds)
            prob_batches.append(outputs)
            label_batches.append(labels)

    # Joining whole batches gives the same tensors as stacking the examples
    # one by one.
    predictions = torch.cat(pred_batches).cpu()
    prediction_probs = torch.cat(prob_batches).cpu()
    real_values = torch.cat(label_batches).cpu()

    return texts, predictions, prediction_probs, real_values


In [None]:
#Getting model predictions on dev dataset
# firstSeq_dev, secondSeq_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
#   model,
#   developmentDataLoader
# )

# Texts, predicted classes, model outputs and gold labels for the
# development set.
firstSeq_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  model,
  developmentDataLoader
)



In [None]:
 #Printing classification report for dev dataset (Evaluating the model on Dev set)
# Per-class precision/recall/F1 and the confusion matrix for the development
# set (gold labels first, predictions second).
print(classification_report(yTest_dev, yHat_dev, target_names= CLASS_NAMES,digits=4))
print(confusion_matrix(yTest_dev, yHat_dev))

              precision    recall  f1-score   support

     Non-ADU     0.8219    0.6977    0.7547       172
         ADU     0.9400    0.9690    0.9543       840

    accuracy                         0.9229      1012
   macro avg     0.8809    0.8334    0.8545      1012
weighted avg     0.9199    0.9229    0.9204      1012

[[120  52]
 [ 26 814]]


In [None]:
# torch.save(model.state_dict(),f'RoBERTaLarge_TFIDFV2.pt')

# #for collab 
# #Saving the model onto the drive
# from google.colab import drive
# drive.mount('/content/gdrive')

# model_save_name = 'RoBERTaLarge_TFIDF_cmv_Step1.pt'
# path = F"/content/gdrive/My Drive/Colab Notebooks/{model_save_name}" 
# torch.save(model.state_dict(), path)

In [None]:
#Getting model predictions on test dataset
# Returns the texts, predicted classes, model outputs and gold labels.
firstSeq_test, yHat_test, predProbs_test, yTest_test = get_predictions(
  model,
  testDataLoader
)



In [None]:
#Printing classification report for test dataset (Evaluating the model on test set)
# Gold labels are passed first and predictions second.
print(classification_report(yTest_test, yHat_test, target_names= CLASS_NAMES,digits=4))
print(confusion_matrix(yTest_test, yHat_test))

              precision    recall  f1-score   support

     Non-ADU     0.8543    0.7679    0.8088       168
         ADU     0.9471    0.9694    0.9581       720

    accuracy                         0.9313       888
   macro avg     0.9007    0.8687    0.8835       888
weighted avg     0.9295    0.9313    0.9299       888

[[129  39]
 [ 22 698]]


**Prediction Starts here**

In [None]:
#Creates a dataset which will be used to feed to RoBERTa
class StanceIcleDataset(data.Dataset):
    """Unlabelled single-sequence dataset used at prediction time.

    Item ``i`` is the encoding of ``TextSrcInre[i]`` alone; no label is
    attached.
    """

    def __init__(self, TextSrcInre, tokenizer, max_len):
        """Store the texts and the tokenisation settings.

        Args:
            TextSrcInre: indexable collection of the texts to classify.
            tokenizer: object exposing ``encode_plus`` (a RoBERTa tokenizer
                in this notebook).
            max_len: length every encoded example is truncated/padded to.
        """
        self.TextSrcInre = TextSrcInre
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.TextSrcInre)

    def __getitem__(self, item):
        """Return the encoded text at position ``item``.

        Returns:
            dict with the raw ``TextSrcInre`` string and the flattened
            ``input_ids`` and ``attention_mask`` tensors.
        """
        TextSrcInre = str(self.TextSrcInre[item])

        # Tokenise with the tokenizer given to the constructor rather than a
        # notebook-level global, so the dataset works with any tokenizer.
        encoding = self.tokenizer.encode_plus(
            TextSrcInre,
            max_length=self.max_len,
            add_special_tokens=True,
            truncation=True,
            # 'max_length' padding is the supported spelling of the
            # deprecated pad_to_max_length=True flag.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'TextSrcInre': TextSrcInre,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }


In [None]:
#Creates a data loader
def createIcleDataLoader(dataframe, tokenizer, max_len, batch_size):
    """Build an ordered DataLoader over the ``Text`` column of ``dataframe``.

    Args:
        dataframe: frame providing a ``Text`` column.
        tokenizer: tokenizer forwarded to the dataset.
        max_len: padded/truncated sequence length forwarded to the dataset.
        batch_size: number of examples per batch.

    Returns:
        torch.utils.data.DataLoader that keeps the row order (no shuffling)
        and uses two worker processes.
    """
    texts = dataframe.Text.to_numpy()
    icle_dataset = StanceIcleDataset(TextSrcInre=texts, tokenizer=tokenizer, max_len=max_len)
    return data.DataLoader(icle_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [None]:
#This function gets the predictions from the model after it is trained.
def get_icle_predictions(model, data_loader):
    """Run the trained model over an unlabelled loader and collect its outputs.

    Relies on the notebook-level ``device``, the fitted ``tfidf`` vectorizer
    and the fitted PCA object ``p``.

    Args:
        model: classifier called in 'roberta-TrainedTfIdf' mode.
        data_loader: yields dict batches with the keys "TextSrcInre",
            "input_ids" and "attention_mask".

    Returns:
        Tuple of (texts, predictions, prediction_probs): the raw input
        strings, the predicted class indices and the model outputs. Both
        tensors are returned on the CPU.
    """
    model = model.eval()
    texts = []
    pred_batches = []
    prob_batches = []

    with torch.no_grad():
        for batch in data_loader:
            batch_texts = batch["TextSrcInre"]
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Dense TF-IDF vectors of the batch and their PCA projection.
            tfidf_feats = torch.tensor(
                scipy.sparse.csr_matrix.todense(tfidf.transform(batch_texts))
            ).float()
            pca_feats = torch.from_numpy(p.transform(tfidf_feats)).float().to(device)
            tfidf_feats = tfidf_feats.to(device)

            #Getting the softmax output from model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_tfidf_feats=tfidf_feats,
                pca_transformed_feats=pca_feats,
                modelType='roberta-TrainedTfIdf'
            )
            # Index of the highest-scoring class for every example.
            preds = torch.max(outputs, dim=1)[1]

            texts.extend(batch_texts)
            pred_batches.append(preds)
            prob_batches.append(outputs)

    # Joining whole batches gives the same tensors as stacking the examples
    # one by one.
    predictions = torch.cat(pred_batches).cpu()
    prediction_probs = torch.cat(prob_batches).cpu()

    return texts, predictions, prediction_probs


In [None]:
def int_to_label(label):
    """Translate a binary class index back to its class name.

    Args:
        label: predicted class index.

    Returns:
        'Non-ADU' for 0, 'ADU' for 1 and None for any other value.
    """
    if label == 0:
        return 'Non-ADU'
    return 'ADU' if label == 1 else None

In [None]:
#Creating data loader for the ICLE data to be annotated
# The active workbook is the ICLE_1000 annotation file; the ICLE_5085 path is
# kept commented out as the alternative input.
# path2= F"/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/ICLE_5085.xlsx" 
path2= F"/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/Annotations_persing_ICLE_1000.xlsx" 
# Only the 'Text' column of the 'ADU_Classification' sheet is read.
icle_df=pd.read_excel(path2,sheet_name='ADU_Classification',usecols=['Text'])

ICLE_DataLoader  = createIcleDataLoader(icle_df, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [None]:
# Texts, predicted class indices and model outputs for the ICLE data.
firstSeq_ICLE, yHat_icle, predProbs_icle = get_icle_predictions(
  model,
  ICLE_DataLoader
)



In [None]:
# firstSeq_ICLE
# Sanity check: which class indices occur among the ICLE predictions.
yHat_icle.unique()

tensor([0, 1])

In [None]:
from google.colab import files 

# Pair every ICLE text with its predicted class index, turn the indices into
# class names, write the table to an Excel file and download it from Colab.
predictions=pd.DataFrame(list(zip(firstSeq_ICLE, yHat_icle.numpy())),columns=['Text','Ann_Adu'])
predictions['Ann_Adu']=predictions.Ann_Adu.apply(int_to_label)
# NOTE(review): this is not the last expression of the cell, so the counts
# are computed but never displayed.
predictions['Ann_Adu'].value_counts()
predictions.to_excel('ICLE_ADU_NonADU.xlsx')
files.download('ICLE_ADU_NonADU.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>