In [1]:
import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy

import torch

from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.utils                    import class_weight

from sklearn.model_selection import train_test_split
from torch                            import nn, optim
from torch.utils                      import data

In [2]:
# One seed shared by PyTorch and NumPy so repeated runs draw the same numbers.
RANDOM_SEED = 16
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Network size and training length.
HIDDEN_LAYER_UNITS = 128
EPOCHS = 50

# Display names of the two target classes.
# Label sets used by earlier experiments, kept for reference:
#   ['support', 'deny', 'query', 'comment']
#   ['None', 'MajorClaim', 'Claim', 'Premise']
CLASS_NAMES = ['Non-ADU', 'ADU']

In [3]:
# Collapse the four integer labels (0-3) into a binary 0/1 target.
def label_to_int(label):
    """Map an integer label onto the binary target.

    Args:
        label: value compared against 0, 1, 2 and 3.

    Returns:
        0 when ``label == 0``; 1 when ``label`` equals 1, 2 or 3;
        ``None`` for anything else.
    """
    if label == 0:
        return 0
    if label in (1, 2, 3):
        return 1
    # Unrecognised labels fall through, exactly like an unmatched if/elif chain.
    return None

In [4]:
# Collapse the four ADU type names into the binary 'Non-ADU' / 'ADU' naming.
def adu_nonadu(label):
    """Map an ADU type name onto its binary class name.

    Args:
        label: value compared against 'None', 'MajorClaim', 'Claim', 'Premise'.

    Returns:
        'Non-ADU' for 'None'; 'ADU' for 'MajorClaim', 'Claim' or 'Premise';
        ``None`` for any other value.
    """
    if label == 'None':
        return 'Non-ADU'
    return 'ADU' if label in ('MajorClaim', 'Claim', 'Premise') else None

In [5]:
# Columns (and their order) returned by both pre-processing entry points.
_STANCE_COLUMNS = ['Text', 'Topic', 'TextSrcInre', 'Para_No', 'ADU_Type', 'labelValue', 'Features']


def _build_stance_frame(df, use_paragraph_labels):
    """Shared implementation behind processStanceData and processStanceData_cmv.

    Args:
        df: raw DataFrame; must provide 'Text', 'Topic', 'ADU_Type' and
            'Sentence_Label', plus 'Paragraph_Label' and 'Para_No' when
            ``use_paragraph_labels`` is True.
        use_paragraph_labels: True  -> 'Features' is 'Sentence_Label' joined
            with 'Paragraph_Label' by " , " and 'Para_No' is taken from ``df``;
            False -> 'Features' is 'Sentence_Label' alone and 'Para_No' is
            set to the empty string.

    Returns:
        A new DataFrame holding exactly the columns in ``_STANCE_COLUMNS``.
    """
    # replace() returns a new frame, so the caller's DataFrame is not modified.
    cleaned = df.replace(np.nan, '', regex=True)                 # NaN -> ''

    # Both derived columns are computed from the *raw* ADU_Type values, so
    # capture them before the column is overwritten.
    raw_adu_type = cleaned['ADU_Type']
    # NOTE(review): label_to_int compares against the ints 0-3 while adu_nonadu
    # compares against the strings 'None'/'MajorClaim'/'Claim'/'Premise'. Fed
    # the same column, one of the two results will be all None - confirm which
    # encoding ADU_Type actually uses.
    cleaned['labelValue'] = raw_adu_type.apply(label_to_int)
    cleaned['ADU_Type'] = raw_adu_type.apply(adu_nonadu)
    cleaned['TextSrcInre'] = cleaned['Text']

    if use_paragraph_labels:
        cleaned['Features'] = cleaned['Sentence_Label'].str.cat(cleaned['Paragraph_Label'], sep=" , ")
    else:
        cleaned['Features'] = cleaned['Sentence_Label']
        cleaned['Para_No'] = ''

    return cleaned[_STANCE_COLUMNS].copy()


def processStanceData(df):
    """Pre-process a frame that carries paragraph information.

    'Features' combines 'Sentence_Label' and 'Paragraph_Label'; 'Para_No' is
    passed through from ``df``. See ``_build_stance_frame`` for the details.
    """
    return _build_stance_frame(df, use_paragraph_labels=True)


def processStanceData_cmv(df):
    """Pre-process a frame without paragraph information.

    'Features' is 'Sentence_Label' alone and 'Para_No' is filled with ''.
    See ``_build_stance_frame`` for the details.
    """
    return _build_stance_frame(df, use_paragraph_labels=False)

In [None]:
# Mount Google Drive, then point `path` at the folder that holds the dataset
# CSV files (later cells build file names as `path + <file name>`).
from google.colab import drive

drive.mount('/content/gdrive')
path = "/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/Data/"

In [None]:
# (train, dev, test) CSV file names for every experiment set-up.
# Switch experiments by changing DATASET_VARIANT instead of (un)commenting code.
DATASET_FILES = {
    'model1':          ('model1_train_CMV+AAE.csv',  'model1_dev_CMV+AAE.csv',  'model1_test_CMV+AAE.csv'),
    'model1_modified': ('model1_train_CMV+AAE.csv',  'model1_dev_CMV+AAE.csv',  'model_2_3_testDf_aae.csv'),
    'model2':          ('model_2_train_CMV+AAE.csv', 'model_2_dev_CMV+AAE.csv', 'model_2_3_testDf_aae.csv'),
    'model3':          ('model3_trainDf_aae.csv',    'model3_devDf_aae.csv',    'model_2_3_testDf_aae.csv'),
}
DATASET_VARIANT = 'model1_modified'


def load_split(file_name):
    """Load one tab-separated split from the data folder and binarise its labels.

    Args:
        file_name: CSV file name, appended to the global ``path``.

    Returns:
        DataFrame without the 'Unnamed: 0' column and with 'labelValue'
        passed through ``label_to_int``.
    """
    frame = pd.read_csv(path + file_name, sep="\t", index_col=False)
    frame = frame.drop(columns='Unnamed: 0')
    frame['labelValue'] = frame['labelValue'].apply(label_to_int)
    return frame


trainDf, devDf, testDf = (load_split(file_name) for file_name in DATASET_FILES[DATASET_VARIANT])

In [None]:
# Training labels as a pandas Series.
y_train = trainDf['labelValue']

In [None]:
# Row count of each split, then the per-class label counts of each split.
print(*map(len, (trainDf, devDf, testDf)))
print(*(frame['labelValue'].value_counts() for frame in (trainDf, devDf, testDf)))

In [None]:
# Preview the first five training rows.
trainDf.head(5)

In [None]:
# Raw sentences and labels of every split as plain Python lists.
x_train = trainDf['TextSrcInre'].tolist()
y_train = trainDf['labelValue'].tolist()
x_dev  = devDf['TextSrcInre'].tolist()
y_dev  = devDf['labelValue'].tolist()
x_test = testDf['TextSrcInre'].tolist()
y_test = testDf['labelValue'].tolist()

# Fit the TF-IDF vocabulary on the training set only (uni- and bi-grams seen in
# at least 10 documents and in at most half of them).
tfidf = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))
print(tfidf)
x_train_feats = tfidf.fit(x_train)          # fit() returns the vectorizer itself
print('x_train_feats: ',x_train_feats)
# vocabulary_ has one entry per feature. get_feature_names() was removed in
# scikit-learn 1.2, whereas vocabulary_ is available in every version.
print('length: ',len(x_train_feats.vocabulary_))

x_train_transform = x_train_feats.transform(x_train)
# Densify the sparse TF-IDF matrix (toarray() gives a plain ndarray rather than
# the np.matrix produced by todense()) and convert it to a float32 tensor.
tfidf_transform_tensor = torch.tensor(x_train_transform.toarray()).float()
print('x_train_transform.shape: ',x_train_transform.shape)

# Transform the development and test sentences with the vocabulary fitted above.
x_dev  = torch.tensor(tfidf.transform(x_dev).toarray()).float()
x_test = torch.tensor(tfidf.transform(x_test).toarray()).float()

In [None]:
# Per-class weights from scikit-learn's "balanced" scheme, computed on the
# training labels.
class_weights = class_weight.compute_class_weight(class_weight="balanced", y=y_train, classes=np.unique(y_train))
class_weights

In [None]:
# Wrap the label lists of the train, dev and test splits as tensors.
y_train, y_dev, y_test = map(torch.tensor, (y_train, y_dev, y_test))

In [None]:
class Tfidf_Nn(nn.Module):
    """Single-hidden-layer MLP: Linear -> tanh -> dropout -> Linear -> softmax.

    Args:
        input_dim: size of the input feature vector. Defaults to the number of
            entries in the fitted global ``tfidf`` vocabulary.
        hidden_units: width of the hidden layer. Defaults to the global
            ``HIDDEN_LAYER_UNITS``.
        num_classes: number of output classes. Defaults to
            ``len(CLASS_NAMES)``.
        dropout: dropout probability applied to the hidden activation.

    Calling ``Tfidf_Nn()`` with no arguments builds the same network as before;
    the arguments make it possible to rebuild the model (e.g. to load a saved
    state dict) without a fitted vectorizer in scope.
    """

    def __init__(self, input_dim=None, hidden_units=None, num_classes=None, dropout=0.1):
        super().__init__()

        # Defaults are resolved here rather than in the signature so the class
        # can be defined before the notebook globals exist.
        if input_dim is None:
            # vocabulary_ has one entry per TF-IDF feature; get_feature_names()
            # was removed in scikit-learn 1.2.
            input_dim = len(tfidf.vocabulary_)
        if hidden_units is None:
            hidden_units = HIDDEN_LAYER_UNITS
        if num_classes is None:
            num_classes = len(CLASS_NAMES)

        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(input_dim, hidden_units)
        # Output layer
        self.output  = nn.Linear(hidden_units, num_classes)
        self.dropout = nn.Dropout(dropout)

        # tanh hidden activation and softmax over the class dimension
        self.tanh    = nn.Tanh()                                     #Using tanh as it performed better than ReLu during hyper-param optimisation
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run the network on a 2-D batch ``x`` of shape (batch, input_dim).

        Returns:
            (hidden_activation, class_probabilities):
            ``hidden_activation`` is the tanh output *before* dropout;
            ``class_probabilities`` is the softmax output, i.e. each row sums
            to 1 - these are probabilities, not logits.
        """
        hidden_activation = self.tanh(self.hidden(x))
        class_scores = self.output(self.dropout(hidden_activation))
        return hidden_activation, self.softmax(class_scores)
    

In [None]:
#Defining the model
model = Tfidf_Nn()

# Loss weights: the "balanced" per-class weights computed earlier with
# compute_class_weight, used to offset the class imbalance.
# as_tensor() accepts both the original ndarray and an existing tensor, so this
# cell can be re-run after class_weights has already been converted.
weights       = class_weights
class_weights = torch.as_tensor(weights, dtype=torch.float32)

# Tfidf_Nn.forward ends in a softmax, so the classifier output is already a
# probability distribution. nn.CrossEntropyLoss expects raw logits and would
# apply log-softmax a second time. Taking the log of the probabilities and
# feeding it to NLLLoss gives the same weighted cross-entropy without the
# double softmax.
_weighted_nll = nn.NLLLoss(weight = class_weights)


def criterion(class_probabilities, targets):
    """Class-weighted cross-entropy for inputs that are softmax probabilities.

    Args:
        class_probabilities: (batch, num_classes) tensor whose rows sum to 1.
        targets: (batch,) tensor of integer class ids.

    Returns:
        Scalar loss tensor (weighted mean over the batch).
    """
    # clamp_min keeps log() finite if a probability underflows to 0.
    log_probabilities = torch.log(class_probabilities.clamp_min(1e-12))
    return _weighted_nll(log_probabilities, targets)


# Smoke test: one full-batch forward/backward pass to check shapes and that the
# loss is differentiable. The classifier output holds probabilities.
hidden_state_output, classfier_output = model(tfidf_transform_tensor)
print(classfier_output)
print(classfier_output[0].shape)

loss = criterion(classfier_output, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.02)

In [None]:
#Training the model on training data and evaluating it on development set
#%%time
def train_model():
    """Train the global ``model`` full-batch and evaluate it on the dev set.

    Uses the notebook globals ``model``, ``optimizer``, ``criterion``,
    ``tfidf_transform_tensor``, ``y_train``, ``x_dev``, ``y_dev`` and
    ``EPOCHS``. Every epoch performs one optimisation step on the whole
    training set, then measures loss and accuracy on the development set.

    Returns:
        dict with the keys 'train_loss', 'dev_loss' and 'dev_accuracy', each a
        list holding one float per epoch.
    """
    history = {'train_loss': [], 'dev_loss': [], 'dev_accuracy': []}

    # Make sure dropout is active even if an earlier cell (e.g. a prediction
    # run) left the model in eval mode.
    model.train()

    for e in range(EPOCHS):
        # ---- one full-batch optimisation step ----
        optimizer.zero_grad()
        _, train_output = model(tfidf_transform_tensor)
        loss = criterion(train_output, y_train)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()

        # ---- evaluation on the development set (no dropout, no gradients) ----
        model.eval()
        with torch.no_grad():
            _, dev_output = model(x_dev)
            dev_loss = criterion(dev_output, y_dev).item()

            # Predicted class = index of the highest output per row.
            _, preds = torch.max(dev_output, dim=1)
            dev_accuracy = (preds == y_dev).double().mean().item()
        model.train()

        # Store plain floats so the history does not keep tensors alive.
        history['train_loss'].append(train_loss)
        history['dev_loss'].append(dev_loss)
        history['dev_accuracy'].append(dev_accuracy)

        print(f"Epoch: {e+1}/{EPOCHS}.. ",
              f"Training Loss: {train_loss:.3f}.. ",
              f"Dev Loss: {dev_loss:.3f}.. ",
              f"Dev Accuracy: {dev_accuracy:.3f}")

    return history


# Assigned so the returned history is kept and not echoed as cell output.
training_history = train_model()

In [None]:
def get_predictions(model, x_test, y_test):
    """Run ``model`` on a whole feature matrix and return its predictions.

    The model is switched to eval mode (and left there) and is called once on
    the full batch with gradients disabled.

    Args:
        model: module whose call returns ``(hidden_output, classifier_output)``.
        x_test: 2-D feature tensor, one row per example.
        y_test: gold labels, one per row of ``x_test``.

    Returns:
        (predictions, prediction_probs, real_values):
        ``predictions`` - index of the highest classifier output per row;
        ``prediction_probs`` - the classifier output itself;
        ``real_values`` - a copy of ``y_test`` as a tensor.
        An empty batch yields empty tensors rather than an error.
    """
    model.eval()
    with torch.no_grad():
        #Currently, not interested in the hidden layer outputs.
        _, classifier_output = model(x_test)

        #Not interested in the maximum values, only in the indices of these max values
        _, predictions = torch.max(classifier_output, dim=1)

    # The batch results are already tensors, so no per-element list round-trip
    # is needed. clone() keeps the returned labels independent of the caller's
    # tensor.
    real_values = torch.as_tensor(y_test).clone()
    return predictions, classifier_output, real_values

In [None]:
# Development set: predicted classes, classifier outputs and gold labels.
y_pred_dev, y_pred_probs, y_true_dev = get_predictions(model=model, x_test=x_dev, y_test=y_dev)

In [None]:
# Classification report for the development set, to four decimal places.
print(classification_report(y_true_dev, y_pred_dev, target_names=CLASS_NAMES, digits=4))

In [None]:
# Test set: predicted classes, classifier outputs and gold labels.
y_pred_test, y_pred_probs, y_true_test = get_predictions(model=model, x_test=x_test, y_test=y_test)

In [None]:
# Classification report for the test set, to four decimal places.
print(classification_report(y_true_test, y_pred_test, target_names=CLASS_NAMES, digits=4))

In [None]:
# Save the trained weights (state dict only) to Google Drive.
from datetime import datetime
from google.colab import drive
drive.mount('/content/gdrive')

model_save_name = F"Mlp_step1_model_1_modified.pt"
# Use a dedicated variable: re-binding `path` here would overwrite the data
# directory that the data-loading cell reads, breaking that cell on a re-run.
model_save_path = F"/content/gdrive/My Drive/Colab Notebooks/1. ADU_Classification/{model_save_name}"
torch.save(model.state_dict(), model_save_path)