In [1]:
import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy

import torch

from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.utils                    import class_weight


from torch                            import nn, optim
from torch.utils                      import data

In [2]:
#Seeding for deterministic results
RANDOM_SEED = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
HIDDEN_LAYER_UNITS = 128

# CLASS_NAMES = ['support', 'deny', 'query', 'comment']
CLASS_NAMES = ['MajorClaim','Claim','Premise']

EPOCHS      = 50

In [3]:
#Converting labels to numbers
def label_to_int(label):
    if label == 1:
        return 0
    elif label == 2:
        return 1
    elif label == 3:
        return 2

In [4]:
def processStanceData(df):
    result1  = df.replace(np.nan, '', regex=True)                               #Getting rid of NaN values
    result1['labelValue'] = result1.ADU_Type.apply(label_to_int)                  #Converting labels to numbers             
    # result1['SrcInre']    = result1['Text'].str.cat(result1['Topic'],sep=" ")        #Change
    result1['TextSrcInre']    = result1['Text']
    result1['Features']    = result1['Sentence_Label'].str.cat(result1['Paragraph_Label'],sep=" , ") 
    data = result1[['Text','Topic','TextSrcInre','Para_No','ADU_Type','labelValue','Features']].copy()    

# #      replyText           - the reply post (whose stance towards the target needs to be learnt)
# #      replyTextId         - the ID of the reply post
# #      previousText        - the text to which replyText was replied
# #      sourceText          - the source post of the conversation thread
# #      label               - the label value assigned to each post
# #      previoysPlusSrctext - the concatenation of the previousText and the sourceText
# #      labelValue          - the numberic value assigned to each label

    data.columns = ['Text','Topic','TextSrcInre','Para_No','ADU_Type','labelValue','Features']
    return data

In [5]:
# Reading data from AAE premise and claims file as dataFrames
from google.colab import drive
drive.mount('/content/gdrive')
path = F"/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/Data/" 

Mounted at /content/gdrive


In [6]:
# trainDf=pd.read_csv(path+'model1_train_CMV+AAE.csv',sep="\t",index_col=False)
# trainDf=trainDf.loc[trainDf.labelValue!=0]
# trainDf=trainDf.drop(columns='Unnamed: 0')
# devDf=pd.read_csv(path+'model1_dev_CMV+AAE.csv',sep="\t",index_col=False)
# devDf=devDf.drop(columns='Unnamed: 0')
# devDf=devDf.loc[devDf.labelValue!=0]
# testDf=pd.read_csv(path+'model1_test_CMV+AAE.csv',sep="\t",index_col=False)
# testDf=testDf.drop(columns='Unnamed: 0')
# testDf=testDf.loc[testDf.labelValue!=0]

# trainDf.labelValue=trainDf.labelValue.apply(label_to_int)
# devDf.labelValue=devDf.labelValue.apply(label_to_int)
# testDf.labelValue=testDf.labelValue.apply(label_to_int)

####--------- MODEL 1 modified 

trainDf=pd.read_csv(path+'model1_train_CMV+AAE.csv',sep="\t",index_col=False)
trainDf=trainDf.loc[trainDf.labelValue!=0]
trainDf=trainDf.drop(columns='Unnamed: 0')
devDf=pd.read_csv(path+'model1_dev_CMV+AAE.csv',sep="\t",index_col=False)
devDf=devDf.drop(columns='Unnamed: 0')
devDf=devDf.loc[devDf.labelValue!=0]
testDf=pd.read_csv(path+'model_2_3_testDf_aae.csv',sep="\t",index_col=False)
testDf=testDf.drop(columns='Unnamed: 0')
testDf=testDf.loc[testDf.labelValue!=0]

trainDf.labelValue=trainDf.labelValue.apply(label_to_int)
devDf.labelValue=devDf.labelValue.apply(label_to_int)
testDf.labelValue=testDf.labelValue.apply(label_to_int)



####--------------- MODEL2

# trainDf=pd.read_csv(path+'model_2_train_CMV+AAE.csv',sep="\t",index_col=False)
# trainDf=trainDf.loc[trainDf.labelValue!=0]
# trainDf=trainDf.drop(columns='Unnamed: 0')
# devDf=pd.read_csv(path+'model_2_dev_CMV+AAE.csv',sep="\t",index_col=False)
# devDf=devDf.drop(columns='Unnamed: 0')
# devDf=devDf.loc[devDf.labelValue!=0]
# testDf=pd.read_csv(path+'model_2_3_testDf_aae.csv',sep="\t",index_col=False)
# testDf=testDf.drop(columns='Unnamed: 0')
# testDf=testDf.loc[testDf.labelValue!=0]

# trainDf.labelValue=trainDf.labelValue.apply(label_to_int)
# devDf.labelValue=devDf.labelValue.apply(label_to_int)
# testDf.labelValue=testDf.labelValue.apply(label_to_int)

####--------------- MODEL3

# trainDf=pd.read_csv(path+'model3_trainDf_aae.csv',sep="\t",index_col=False)
# trainDf=trainDf.loc[trainDf.labelValue!=0]
# trainDf=trainDf.drop(columns='Unnamed: 0')
# devDf=pd.read_csv(path+'model3_devDf_aae.csv',sep="\t",index_col=False)
# devDf=devDf.drop(columns='Unnamed: 0')
# devDf=devDf.loc[devDf.labelValue!=0]
# testDf=pd.read_csv(path+'model_2_3_testDf_aae.csv',sep="\t",index_col=False)
# testDf=testDf.drop(columns='Unnamed: 0')
# testDf=testDf.loc[testDf.labelValue!=0]

# trainDf.labelValue=trainDf.labelValue.apply(label_to_int)
# devDf.labelValue=devDf.labelValue.apply(label_to_int)
# testDf.labelValue=testDf.labelValue.apply(label_to_int)

In [7]:
trainDf.labelValue.value_counts()

2    2515
1    1698
0     520
Name: labelValue, dtype: int64

In [8]:
devDf.labelValue.value_counts()

2    441
1    306
0     93
Name: labelValue, dtype: int64

In [9]:
y_train=trainDf.labelValue

In [10]:
print(len(trainDf), len(devDf),len(testDf))
print(trainDf.labelValue.value_counts(), devDf.labelValue.value_counts(),testDf.labelValue.value_counts())

4733 840 720
2    2515
1    1698
0     520
Name: labelValue, dtype: int64 2    441
1    306
0     93
Name: labelValue, dtype: int64 2    450
1    190
0     80
Name: labelValue, dtype: int64


In [11]:
class_weights = class_weight.compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(y_train),
                                        y = y_train                                                   
                                    )
class_weights

array([3.03397436, 0.92913231, 0.62730285])

In [12]:
x_train = trainDf['TextSrcInre'].tolist()
y_train = trainDf['labelValue'].tolist()
x_dev  = devDf['TextSrcInre'].tolist()
y_dev  = devDf['labelValue'].tolist()
x_test = testDf['TextSrcInre'].tolist()
y_test = testDf['labelValue'].tolist()

#Instantiating TfidfVectorizer object and fitting it on the training set
tfidf = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))
print(tfidf)
x_train_feats = tfidf.fit(x_train)
print('x_train_feats: ',x_train_feats)
print('length: ',len(x_train_feats.get_feature_names()))

x_train_transform = x_train_feats.transform(x_train)
#Converting the TF-IDF matrix to tensor
tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(x_train_transform)).float()
print('x_train_transform.shape: ',x_train_transform.shape)

#Tranforming the development and test data to tf-idf matrix
x_dev  = tfidf.transform(x_dev)
x_test = tfidf.transform(x_test)

x_dev  = torch.tensor(scipy.sparse.csr_matrix.todense(x_dev)).float()
x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()

TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2))
x_train_feats:  TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2))
length:  1872




x_train_transform.shape:  (4733, 1872)


In [13]:
# x_train

In [14]:
#Converting prections for train, dev and test data to tensors
y_train = torch.tensor(y_train)
y_dev   = torch.tensor(y_dev)
y_test  = torch.tensor(y_test)

In [15]:
class Tfidf_Nn(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(len(tfidf.get_feature_names()), HIDDEN_LAYER_UNITS)
        # Output layer
        self.output  =  nn.Linear(HIDDEN_LAYER_UNITS, len(CLASS_NAMES))
        self.dropout = nn.Dropout(0.1)
        
        # Defining tanh activation and softmax output 
        self.tanh    = nn.Tanh()                                     #Using tanh as it performed better than ReLu during hyper-param optimisation
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Pass the input tensor through each of the below operations
        x = self.hidden(x)
        #print(x.shape)
        y = self.tanh(x)
        #print(y.shape)
        z = self.dropout(y)
        #print(z.shape)
        z = self.output(z)
        #print(z.shape)
        z = self.softmax(z)
        
        #returning the output from hidden layer and the output layer
        return  y, z
    

In [16]:
#Defining the model
model = Tfidf_Nn()

# Defining the loss
'''Using class-weights to accomodate heavily imbalanced data. 
These weights were learnt by running several experiments using 
other weights and the weights that produced the best results have
 finally been used here'''

weights       = class_weights
class_weights = torch.FloatTensor(weights)
criterion     = nn.CrossEntropyLoss(weight = class_weights)


# Forward pass, get our logits
hidden_state_output, classfier_output = model(tfidf_transform_tensor)
print(classfier_output)
print(classfier_output[0].shape)

loss = criterion(classfier_output, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.02)



tensor([[0.3472, 0.3503, 0.3025],
        [0.3493, 0.3506, 0.3001],
        [0.3491, 0.3507, 0.3001],
        ...,
        [0.3445, 0.3522, 0.3033],
        [0.3487, 0.3511, 0.3001],
        [0.3460, 0.3532, 0.3008]], grad_fn=<SoftmaxBackward0>)
torch.Size([3])


In [17]:
#Training the model on training data and evaluating it on development set
#%%time
def train_model():
    train_losses = []
    dev_losses = []
    dev_accuracies = []

    for e in range(EPOCHS):
        correct_predictions = 0
        optimizer.zero_grad()

        hidden_layer_output, classifier_output = model.forward(tfidf_transform_tensor)

        loss = criterion(classifier_output, y_train)
        loss.backward()
        train_loss = loss.item()
        train_losses.append(train_loss)

        optimizer.step()
        with torch.no_grad():
            model.eval()

            #Getting hidden layer and softmax output from model for dev data
            hidden_layer_output, classifier_output = model(x_dev)

            #Calculating loss
            dev_loss = criterion(classifier_output, y_dev)
            dev_losses.append(dev_loss)

            #Calculating values predicted by the model
            _, preds = torch.max(classifier_output, dim=1)
            correct_predictions += torch.sum(preds == y_dev)

            #Calculating accuracy
            dev_accuracy = correct_predictions.double() / len(y_dev)
            dev_accuracies.append(dev_accuracy)

        model.train()

        print(f"Epoch: {e+1}/{EPOCHS}.. ",
              f"Training Loss: {dev_loss:.3f}.. ",
              f"Dev Loss: {dev_loss:.3f}.. ",
              f"Dev Accuracy: {dev_accuracy:.3f}")


train_model()

Epoch: 1/50..  Training Loss: 1.076..  Dev Loss: 1.076..  Dev Accuracy: 0.614
Epoch: 2/50..  Training Loss: 1.038..  Dev Loss: 1.038..  Dev Accuracy: 0.556
Epoch: 3/50..  Training Loss: 0.994..  Dev Loss: 0.994..  Dev Accuracy: 0.565
Epoch: 4/50..  Training Loss: 0.955..  Dev Loss: 0.955..  Dev Accuracy: 0.617
Epoch: 5/50..  Training Loss: 0.928..  Dev Loss: 0.928..  Dev Accuracy: 0.635
Epoch: 6/50..  Training Loss: 0.911..  Dev Loss: 0.911..  Dev Accuracy: 0.636
Epoch: 7/50..  Training Loss: 0.901..  Dev Loss: 0.901..  Dev Accuracy: 0.625
Epoch: 8/50..  Training Loss: 0.895..  Dev Loss: 0.895..  Dev Accuracy: 0.623
Epoch: 9/50..  Training Loss: 0.893..  Dev Loss: 0.893..  Dev Accuracy: 0.625
Epoch: 10/50..  Training Loss: 0.895..  Dev Loss: 0.895..  Dev Accuracy: 0.632
Epoch: 11/50..  Training Loss: 0.899..  Dev Loss: 0.899..  Dev Accuracy: 0.642
Epoch: 12/50..  Training Loss: 0.904..  Dev Loss: 0.904..  Dev Accuracy: 0.639
Epoch: 13/50..  Training Loss: 0.909..  Dev Loss: 0.909..  De

In [18]:
'''This function gets the predictions for each data point 
in the deevelopment and the training set'''

def get_predictions(model, x_test, y_test):
    predictions = []
    prediction_probs = []
    real_values = []
    with torch.no_grad():
        model.eval()
        labels = y_test

        #Currently, not interested in the hidden layer outputs.
        _,classifier_output = model(x_test)

        #Not interested in the maximum values, interested with the indices of these max values
        _, preds = torch.max(classifier_output, dim=1)

        predictions.extend(preds)
        prediction_probs.extend(classifier_output)
        real_values.extend(labels)
    predictions = torch.stack(predictions)

    prediction_probs = torch.stack(prediction_probs)
    real_values = torch.stack(real_values)
    return  predictions, prediction_probs, real_values

In [19]:
#Getting predictions for the development set
y_pred_dev, y_pred_probs, y_true_dev = get_predictions(
  model,
  x_dev, 
  y_dev
)

In [20]:
#Printing the classifictaion report for the Development set
print(classification_report(y_true_dev, y_pred_dev ,digits =4, target_names=CLASS_NAMES))

              precision    recall  f1-score   support

  MajorClaim     0.5977    0.5591    0.5778        93
       Claim     0.5651    0.5817    0.5733       306
     Premise     0.7146    0.7098    0.7122       441

    accuracy                         0.6464       840
   macro avg     0.6258    0.6169    0.6211       840
weighted avg     0.6472    0.6464    0.6467       840



In [21]:
#Getting the predictions for the test set
y_pred_test, y_pred_probs, y_true_test = get_predictions(
  model,
  x_test, 
  y_test
)

In [22]:
print(classification_report(y_true_test, y_pred_test , digits = 4,  target_names=CLASS_NAMES))

              precision    recall  f1-score   support

  MajorClaim     0.7447    0.8750    0.8046        80
       Claim     0.6389    0.7263    0.6798       190
     Premise     0.8951    0.8156    0.8535       450

    accuracy                         0.7986       720
   macro avg     0.7596    0.8056    0.7793       720
weighted avg     0.8108    0.7986    0.8022       720



In [23]:
# torch.save(model.state_dict(), f'Mlp_AAE_1682.pt')
from google.colab import drive
drive.mount('/content/gdrive')

model_save_name = 'Mlp_Step2_model_1_modified.pt'
path = F"/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/{model_save_name}" 
torch.save(model.state_dict(), path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
