<a href="https://colab.research.google.com/github/udbhav-chugh/TextCoherenceOnSocialMedia/blob/master/TextCoherenceModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook contains the code for training and testing the text coherence model

*   Note: you must first run the preprocessing notebook to generate the word embeddings for each of the datasets. This notebook trains and tests models on three datasets:
  * One is the generated Social Media dataset which considers posts and comments as separate documents
  * One is the generated Social Media dataset which considers posts and top 10 comments as single documents
  * One is the Accidents Report dataset which is the standard dataset used for text coherence analysis.
*   We train our model on all three datasets, and for each model, we test on all three sets. See the report for result analysis.
* Ensure the word embeddings for the training and testing data are present in your Google Drive. For our notebook, we kept a folder named `nlp` in Google Drive with three sub-folders: `separate`, `combined` and `accident`. Each of the three folders holds two files: the word embeddings for the training dataset and for the testing dataset.
* Once the word embeddings are present in the required folder with names mentioned in the notebook, you can easily run the notebook.
* It takes a few hours for training and testing to be complete.



In [None]:
import inspect
import os
from os import listdir
from os.path import isfile, join
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.autograd import Function, Variable
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torchvision.models as models
from torch.optim import Adam, SGD
import numpy as np
import time
import cv2
import math
import pickle

torch.manual_seed(191009)




<torch._C.Generator at 0x7f45c2109588>

# The proposed architecture code to get coherence score for a window of 3 sentences.

In [None]:
class coherenceModel(nn.Module):
  """Scores a window of three consecutive sentences with a probability in (0, 1).

  Each sentence goes through its own Conv2d (16 filters, kernel 3x50), is
  averaged over the word axis into a 16-value vector, and the three vectors
  are concatenated into 48 features. A 1x1 Conv1d rescales those features,
  an average pool reduces them to one value, and a sigmoid turns that value
  into the returned probability.

  Inputs are 4-D tensors shaped ``(batch, 1, num_words, 50)``: the kernel
  width of 50 has to span the whole embedding axis so the convolution output
  has width 1. The three sentences may contain different numbers of words.
  """

  def __init__(self):

    super(coherenceModel, self).__init__()
    # Layer 1: one convolution per sentence position (weights are not shared).
    self.conv2DSent1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3,50), stride=1, padding=(1, 0))
    self.conv2DSent2 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3,50), stride=1, padding=(1, 0))
    self.conv2DSent3 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3,50), stride=1, padding=(1, 0))

    # Average over the word axis so sentences of any length give 16 features.
    self.avgPool2DSent1 = nn.AdaptiveAvgPool1d(1)
    self.avgPool2DSent2 = nn.AdaptiveAvgPool1d(1)
    self.avgPool2DSent3 = nn.AdaptiveAvgPool1d(1)

    self.hiddenLayer = nn.Conv1d(in_channels = 1, out_channels = 1, kernel_size=1,stride=1)

    # 48 = 3 sentences x 16 conv channels: averages the whole joined vector.
    self.finalLayer = nn.AvgPool1d(kernel_size=48)
    self.endSigmoid = nn.Sigmoid()

  @staticmethod
  def _encodeSentence(conv, pool, sentence):
    """Turn one ``(batch, 1, num_words, 50)`` sentence into ``(batch, 1, 16)``."""
    convOut = conv(sentence).squeeze(3)   # (batch, 16, num_words)
    return pool(convOut).transpose(1, 2)  # (batch, 16, 1) -> (batch, 1, 16)

  def forward(self,sentence1, sentence2, sentence3):
    """Return one probability per batch element.

    Args:
      sentence1, sentence2, sentence3: the three sentences of the window,
        each shaped ``(batch, 1, num_words, 50)``.

    Returns:
      Tensor of shape ``(batch,)`` holding the sigmoid scores. With a batch
      of one this is the same ``(1,)`` tensor the model has always returned.
    """
    # Intermediates are kept in locals (not on ``self``) so the module does
    # not hold on to the previous call's autograd graph.
    features1 = self._encodeSentence(self.conv2DSent1, self.avgPool2DSent1, sentence1)
    features2 = self._encodeSentence(self.conv2DSent2, self.avgPool2DSent2, sentence2)
    features3 = self._encodeSentence(self.conv2DSent3, self.avgPool2DSent3, sentence3)

    # Join along the feature axis and keep the batch axis first, so Conv1d
    # sees (batch, channels=1, 48) for any batch size.
    joined = torch.cat((features1, features2, features3), 2)

    hidden = self.hiddenLayer(joined)   # (batch, 1, 48)
    pooled = self.finalLayer(hidden)    # (batch, 1, 1)

    return self.endSigmoid(pooled.squeeze(2).squeeze(1))

# Code to train the model and backpropagate based on the loss defined in the report.

In [None]:
"""# Training the model """


def customLoss(probTensor, outVal):
  """Negative log-likelihood of the predicted probabilities for one label.

  Args:
    probTensor: floating-point tensor of probabilities in [0, 1].
    outVal: label of the document the window came from. ``1`` rewards
      probabilities close to 1; any other value rewards probabilities
      close to 0.

  Returns:
    0-dim tensor: ``-sum(log(p))`` when ``outVal == 1``, otherwise
    ``-sum(log(1 - p))``.
  """
  # A saturated sigmoid yields exactly 0.0 or 1.0, and log(0) would make the
  # loss infinite and its gradient NaN. Clamping by one machine epsilon keeps
  # both finite and leaves every non-saturated probability untouched.
  eps = torch.finfo(probTensor.dtype).eps
  safeProb = torch.clamp(probTensor, min=eps, max=1 - eps)

  # ``1 - safeProb`` stays on probTensor's device, so no global ``device``
  # is needed here.
  likelihood = safeProb if outVal == 1 else 1 - safeProb
  return -torch.log(likelihood).sum()

def train_model(model, model_name, optimizer, fileName, num_epochs=25, model_dir=None):
    """Train ``model`` on sliding three-sentence windows and checkpoint every epoch.

    Args:
        model: network called as ``model(sentence1, sentence2, sentence3)``.
        model_name: suffix of the default checkpoint directory.
        optimizer: optimizer already bound to ``model``'s parameters.
        fileName: the training data itself (not a path): an iterable of posts
            where ``post[0]`` is a sequence of per-sentence NumPy arrays and
            ``post[1]`` is the label passed to ``customLoss``.
        num_epochs: number of passes over ``fileName``.
        model_dir: directory for the checkpoints; defaults to
            ``'/content/ckpts_' + model_name``.

    Returns:
        None. After each epoch a checkpoint ``net_<epoch>.pth`` is written
        with keys ``model_state``, ``optim_state`` and ``ep_loss``.
    """
    since = time.time()
    print("Start Training")
    if model_dir is None:
        model_dir = '/content/ckpts_'+model_name
    os.makedirs(model_dir, exist_ok=True)

    # Feed the inputs to wherever the model already lives instead of relying
    # on a ``device`` variable defined in some other notebook cell.
    device = next(model.parameters()).device
    model.train()  # Set model to training mode

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        epoch_loss = 0.0
        for post in fileName:
            sentences = post[0]
            outVal = int(post[1])
            # A post needs at least three sentences to form one window.
            if len(sentences) < 3:
                continue

            # Convert every sentence once; each one is reused by up to three
            # overlapping windows. Shape becomes (1, 1, *sentence.shape).
            sentenceTensors = [
                torch.from_numpy(sentence).unsqueeze(0).unsqueeze(0).to(device)
                for sentence in sentences
            ]

            for i in range(len(sentenceTensors) - 2):
                # One optimizer step per window.
                optimizer.zero_grad()
                probCur = model(sentenceTensors[i], sentenceTensors[i+1], sentenceTensors[i+2])
                loss = customLoss(probCur, outVal)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

        print('Epoch Loss: ', epoch_loss)
        states = {
            'model_state': model.state_dict(),
            'optim_state': optimizer.state_dict(),
            'ep_loss': epoch_loss
        }
        torch.save(states, os.path.join(model_dir, 'net_'+str(epoch)+'.pth'))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    return None

# Code to run test on the trained model

In [None]:
def test_model(model, model_name, index, testFile):
    since = time.time()
    print("Start Testing")
    model.eval()

    lastepoch=299
    model_dir = '/content/ckpts_'+model_name
    checkpoint = torch.load(os.path.join(model_dir,'net_'+ str(lastepoch)+".pth"))
    model.load_state_dict(checkpoint['model_state'])

    scores = []
    for post in testFile:
      tempPost = post[index]
      inputSize = len(tempPost)
      score = 1
      for i in range(0,inputSize-2):
        sentence1 = tempPost[i]
        sentence2 = tempPost[i+1]
        sentence3 = tempPost[i+2]

        sentence1Tensor = torch.from_numpy(sentence1)
        sentence1Tensor = torch.unsqueeze(sentence1Tensor,0)
        sentence1Tensor = torch.unsqueeze(sentence1Tensor,0).to(device)

        sentence2Tensor = torch.from_numpy(sentence2)
        sentence2Tensor = torch.unsqueeze(sentence2Tensor,0)
        sentence2Tensor = torch.unsqueeze(sentence2Tensor,0).to(device)

        sentence3Tensor = torch.from_numpy(sentence3)
        sentence3Tensor = torch.unsqueeze(sentence3Tensor,0)
        sentence3Tensor = torch.unsqueeze(sentence3Tensor,0).to(device)
        probCur = model(sentence1Tensor,sentence2Tensor,sentence3Tensor)
        detachTensor = probCur.detach().clone()
        score *= (probCur.item())
      scores.append(score)
    
    return scores


# Training dataset1: Social Media dataset which considers posts and comments as separate documents

In [None]:
# Read the pickled training embeddings for the "separate" dataset from Google Drive.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/separate/separate_train_embeddings.txt", "rb") as fp:
  separate_train_embeddings = pickle.load(fp)

In [None]:
# Model 1: train on the "separate" dataset, on the GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = coherenceModel().to(device)

_lr = 2e-4
optimizer = Adam(model.parameters(), lr=_lr)

print("Start")
model_ft = train_model(
    model,
    "coherenceModelSeparate",
    optimizer,
    separate_train_embeddings,
    num_epochs=300,
)
print("End")

Start
Start Training
Epoch 0/299
----------
Epoch Loss:  6082.253101885319

Epoch 1/299
----------
Epoch Loss:  5927.7052547335625

Epoch 2/299
----------
Epoch Loss:  5925.112095355988

Epoch 3/299
----------
Epoch Loss:  5923.072252601385

Epoch 4/299
----------
Epoch Loss:  5921.2936980724335

Epoch 5/299
----------
Epoch Loss:  5919.718302875757

Epoch 6/299
----------
Epoch Loss:  5918.3052043914795

Epoch 7/299
----------
Epoch Loss:  5917.02326208353

Epoch 8/299
----------
Epoch Loss:  5915.8499675393105

Epoch 9/299
----------
Epoch Loss:  5914.768180042505

Epoch 10/299
----------
Epoch Loss:  5913.764505922794

Epoch 11/299
----------
Epoch Loss:  5912.827797174454

Epoch 12/299
----------
Epoch Loss:  5911.949794650078

Epoch 13/299
----------
Epoch Loss:  5911.123561650515

Epoch 14/299
----------
Epoch Loss:  5910.343365341425

Epoch 15/299
----------
Epoch Loss:  5909.60381641984

Epoch 16/299
----------
Epoch Loss:  5908.9012697041035

Epoch 17/299
----------
Epoch Loss

# Testing model 1 on all three test datasets

In [None]:
# Fresh network for evaluation; test_model() loads the saved weights into it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = coherenceModel().to(device)
model

coherenceModel(
  (conv2DSent1): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (conv2DSent2): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (conv2DSent3): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (avgPool2DSent1): AdaptiveAvgPool1d(output_size=1)
  (avgPool2DSent2): AdaptiveAvgPool1d(output_size=1)
  (avgPool2DSent3): AdaptiveAvgPool1d(output_size=1)
  (hiddenLayer): Conv1d(1, 1, kernel_size=(1,), stride=(1,))
  (finalLayer): AvgPool1d(kernel_size=(48,), stride=(48,), padding=(0,))
  (endSigmoid): Sigmoid()
)

In [None]:
# Model 1 ("separate") evaluated on the "separate" test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/separate/separate_test_embeddings.txt", "rb") as fp:
  separate_test_embeddings = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelSeparate", 0, separate_test_embeddings)
incoherentScores = test_model(model, "coherenceModelSeparate", 1, separate_test_embeddings)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  71.10714285714285


In [None]:
# Model 1 ("separate") evaluated on the "combined" test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/combined/combined_test_embeddings.txt", "rb") as fp:
  combined_test_embeddings = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelSeparate", 0, combined_test_embeddings)
incoherentScores = test_model(model, "coherenceModelSeparate", 1, combined_test_embeddings)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  70.78571428571429


In [None]:
# Model 1 ("separate") evaluated on the accident-report (NTSB) test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/accident/datasetNTSBTesting.txt", "rb") as fp:
  datasetNTSBTesting = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelSeparate", 0, datasetNTSBTesting)
incoherentScores = test_model(model, "coherenceModelSeparate", 1, datasetNTSBTesting)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  64.85


# Training dataset2: Social Media dataset which considers posts and top 10 comments as combined document

In [None]:
# Read the pickled training embeddings for the "combined" dataset from Google Drive.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/combined/combined_train_embeddings.txt", "rb") as fp:
  combined_train_embeddings = pickle.load(fp)

In [None]:
# Model 2: train on the "combined" dataset, on the GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = coherenceModel().to(device)

_lr = 2e-4
optimizer = Adam(model.parameters(), lr=_lr)

print("Start")
model_ft = train_model(
    model,
    "coherenceModelCombined",
    optimizer,
    combined_train_embeddings,
    num_epochs=300,
)
print("End")

Start
Start Training
Epoch 0/299
----------
Epoch Loss:  5567.785762965679

Epoch 1/299
----------
Epoch Loss:  5561.648887813091

Epoch 2/299
----------
Epoch Loss:  5561.022981405258

Epoch 3/299
----------
Epoch Loss:  5560.647276878357

Epoch 4/299
----------
Epoch Loss:  5560.299709022045

Epoch 5/299
----------
Epoch Loss:  5559.959327697754

Epoch 6/299
----------
Epoch Loss:  5559.621977746487

Epoch 7/299
----------
Epoch Loss:  5559.286205351353

Epoch 8/299
----------
Epoch Loss:  5558.951483666897

Epoch 9/299
----------
Epoch Loss:  5558.618040502071

Epoch 10/299
----------
Epoch Loss:  5558.28657323122

Epoch 11/299
----------
Epoch Loss:  5557.9582443237305

Epoch 12/299
----------
Epoch Loss:  5557.634288609028

Epoch 13/299
----------
Epoch Loss:  5557.315944075584

Epoch 14/299
----------
Epoch Loss:  5557.004436373711

Epoch 15/299
----------
Epoch Loss:  5556.700870633125

Epoch 16/299
----------
Epoch Loss:  5556.405989289284

Epoch 17/299
----------
Epoch Loss:  

# Testing model 2 on all three test datasets

In [None]:
# Fresh network for evaluation; test_model() loads the saved weights into it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = coherenceModel().to(device)
model

coherenceModel(
  (conv2DSent1): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (conv2DSent2): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (conv2DSent3): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (avgPool2DSent1): AdaptiveAvgPool1d(output_size=1)
  (avgPool2DSent2): AdaptiveAvgPool1d(output_size=1)
  (avgPool2DSent3): AdaptiveAvgPool1d(output_size=1)
  (hiddenLayer): Conv1d(1, 1, kernel_size=(1,), stride=(1,))
  (finalLayer): AvgPool1d(kernel_size=(48,), stride=(48,), padding=(0,))
  (endSigmoid): Sigmoid()
)

In [None]:
# Model 2 ("combined") evaluated on the "combined" test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/combined/combined_test_embeddings.txt", "rb") as fp:
  combined_test_embeddings = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelCombined", 0, combined_test_embeddings)
incoherentScores = test_model(model, "coherenceModelCombined", 1, combined_test_embeddings)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  77.07142857142857


In [None]:
# Model 2 ("combined") evaluated on the "separate" test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/separate/separate_test_embeddings.txt", "rb") as fp:
  separate_test_embeddings = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelCombined", 0, separate_test_embeddings)
incoherentScores = test_model(model, "coherenceModelCombined", 1, separate_test_embeddings)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  74.53571428571429


In [None]:
# Model 2 ("combined") evaluated on the accident-report (NTSB) test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/accident/datasetNTSBTesting.txt", "rb") as fp:
  datasetNTSBTesting = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelCombined", 0, datasetNTSBTesting)
incoherentScores = test_model(model, "coherenceModelCombined", 1, datasetNTSBTesting)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  69.95


# Training dataset3: Accident reports dataset which is a standard dataset for text coherence tasks.

In [None]:
# Read the pickled training embeddings for the accident-report (NTSB) dataset from Google Drive.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/accident/datasetNTSBTraining.txt", "rb") as fp:
  datasetNTSBTraining = pickle.load(fp)

In [None]:
# Model 3: train on the accident-report dataset, on the GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = coherenceModel().to(device)

# This run uses a learning rate ten times larger than the two social-media runs.
_lr = 2e-3
optimizer = Adam(model.parameters(), lr=_lr)

print("Start")
model_ft = train_model(
    model,
    "coherenceModelAccident",
    optimizer,
    datasetNTSBTraining,
    num_epochs=300,
)
print("End")

Start
Start Training
Epoch 0/299
----------
Epoch Loss:  1353.2238913476467

Epoch 1/299
----------
Epoch Loss:  1359.3757936060429

Epoch 2/299
----------
Epoch Loss:  1354.6053730845451

Epoch 3/299
----------
Epoch Loss:  1351.1912798285484

Epoch 4/299
----------
Epoch Loss:  1348.2134356796741

Epoch 5/299
----------
Epoch Loss:  1345.5197977870703

Epoch 6/299
----------
Epoch Loss:  1343.112528219819

Epoch 7/299
----------
Epoch Loss:  1340.989823386073

Epoch 8/299
----------
Epoch Loss:  1339.1265727132559

Epoch 9/299
----------
Epoch Loss:  1337.4860010445118

Epoch 10/299
----------
Epoch Loss:  1336.030847594142

Epoch 11/299
----------
Epoch Loss:  1334.729093492031

Epoch 12/299
----------
Epoch Loss:  1333.5550366342068

Epoch 13/299
----------
Epoch Loss:  1332.488572359085

Epoch 14/299
----------
Epoch Loss:  1331.5140054523945

Epoch 15/299
----------
Epoch Loss:  1330.6189009323716

Epoch 16/299
----------
Epoch Loss:  1329.7931881174445

Epoch 17/299
----------
E

# Testing model 3 on all three test datasets

In [None]:
# Fresh network for evaluation; test_model() loads the saved weights into it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = coherenceModel().to(device)
model

coherenceModel(
  (conv2DSent1): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (conv2DSent2): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (conv2DSent3): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1), padding=(1, 0))
  (avgPool2DSent1): AdaptiveAvgPool1d(output_size=1)
  (avgPool2DSent2): AdaptiveAvgPool1d(output_size=1)
  (avgPool2DSent3): AdaptiveAvgPool1d(output_size=1)
  (hiddenLayer): Conv1d(1, 1, kernel_size=(1,), stride=(1,))
  (finalLayer): AvgPool1d(kernel_size=(48,), stride=(48,), padding=(0,))
  (endSigmoid): Sigmoid()
)

In [None]:
# Model 3 ("accident") evaluated on the accident-report (NTSB) test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/accident/datasetNTSBTesting.txt", "rb") as fp:
  datasetNTSBTesting = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelAccident", 0, datasetNTSBTesting)
incoherentScores = test_model(model, "coherenceModelAccident", 1, datasetNTSBTesting)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  83.2


In [None]:
# Model 3 ("accident") evaluated on the "separate" test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/separate/separate_test_embeddings.txt", "rb") as fp:
  separate_test_embeddings = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelAccident", 0, separate_test_embeddings)
incoherentScores = test_model(model, "coherenceModelAccident", 1, separate_test_embeddings)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  64.03571986251


In [None]:
# Model 3 ("accident") evaluated on the "combined" test set.
# pickle can execute code while loading, so only open files you produced yourself.
with open("/content/drive/My Drive/nlp/combined/combined_test_embeddings.txt", "rb") as fp:
  combined_test_embeddings = pickle.load(fp)

# Entry 0 of each test item is scored as the coherent version, entry 1 as the incoherent one.
coherentScores = test_model(model, "coherenceModelAccident", 0, combined_test_embeddings)
incoherentScores = test_model(model, "coherenceModelAccident", 1, combined_test_embeddings)

# A pair is correct when the coherent score is at least the incoherent one (ties count).
total = len(coherentScores)
correct = sum(
    1 for coherent, incoherent in zip(coherentScores, incoherentScores)
    if coherent >= incoherent
)
accuracy = (correct/total) * 100
print("Test accuracy: ", accuracy)


Start Testing
Start Testing
Test accuracy:  67.821
