# NLP classification task (human vs machine translation)

In [250]:
import io
import os

with io.open("test.txt", encoding='utf8') as real_file:
  test_raw = real_file.read().split("\n")
with io.open("train.txt", encoding='utf8') as real_file:
  train_raw = real_file.read().split("\n")

In [None]:
import nltk
nltk.download('punkt')

In [31]:
from nltk import word_tokenize
def extract_data(data):
    targets = []
    refs = []
    cands = []
    scores = []
    labels = []
    for i in range(int(len(data)/6)):
      shift = i*6
      targets.append(data[shift])
      refs.append(data[shift+1])
      cands.append(data[shift+2])
      scores.append([float(data[shift+3])])
      labels.append(0 if data[shift+4] == 'M' else 1) #0 machine/ 1 human
      
    return targets, refs, cands, scores, labels

In [252]:
targets, refs, cands, scores, labels = extract_data(train_raw)
targets_test, refs_test, cands_test, scores_test, labels_test = extract_data(test_raw)

In [38]:
def f1_metric(matching, predicted, gold):
  precision = matching/predicted
  recall = matching/gold
  f1 = 2 * precision * recall / (precision + recall)
  return f1
  
def f1_metrics(pred, gold):
  matching_1 = 0
  matching_0 = 0
  n_1 = sum(pred)
  n_0 = len(pred) - n_1
  
  for i in range(len(pred)):
    if pred[i]==gold[i]:
      if pred[i] == 1:
        matching_1 += 1
      else:
        matching_0 += 1
  
  h_f1 = f1_metric(matching_1, n_1, len(gold))
  m_f1 = f1_metric(matching_0, n_0, len(gold))
  return h_f1, m_f1


# Logistic Regression Model


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.model_selection import train_test_split
import numpy as np

In [16]:
#Extract n-grams
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))
tfid_features = vectorizer.fit_transform(cands)

#Combine features
X = hstack([tfid_features,scores]).toarray()
Y = labels

In [17]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, random_state=42, test_size=0.2)

In [18]:
from sklearn.linear_model import LogisticRegression

#Initialize model
clf = LogisticRegression(penalty = 'l2', solver='liblinear', C=100)

#Train and validate
clf.fit(X_train, Y_train)
acc = clf.score(X_val, Y_val)
print(f"val accuracy: {acc}")

val accuracy: 0.7863247863247863


In [19]:
#TEST
tfid_features_test = vectorizer.transform(cands_test)
X_test = hstack([tfid_features_test,coo_matrix(scores_test)]).toarray()
Y_test = labels_test
acc = clf.score(X_test, Y_test)
print(f"train accuracy: {acc}")

train accuracy: 0.7528735632183908


In [39]:
#f1 metrics
predictions = clf.predict(X_test)
h_f1, m_f1 = f1_metrics(predictions, Y_test)
print(f"human f1: {h_f1}")
print(f"machine f1: {m_f1}")

human f1: 0.5136186770428016
machine f1: 0.4905660377358491


# LSTM Model

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove*.zip

In [382]:
import torch

#load glove weights and dictionary
dim = 100
words = []
idx = 0
word2idx = {}
vectors = []

with open(f'glove.6B.{dim}d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

#Handle unknown words by adding an 'UKN' token.
unk_token = 'UKN'
unk_idx = idx
words.append(unk_token)
word2idx[unk_token] = unk_idx
vectors.append(np.random.normal(scale=0.6, size=(dim, )))

#create the weight matrix and glove dictionary
glove = {w: vectors[word2idx[w]] for w in words}
weights = torch.tensor(vectors, dtype=torch.float)

In [383]:
#helper functions for preprocessing
from torch import nn
from torch.nn.utils.rnn import pad_sequence
import spacy

nlp = spacy.load('en_core_web_sm')

# For padding the input data to same length
def pad_collate(batch):
  (refs, cands, scores, labels) = zip(*batch)
  refs_lens = [len(x) for x in refs]
  cands_lens = [len(x) for x in cands]

  refs_pad = pad_sequence(refs, batch_first=True, padding_value=0)
  cands_pad = pad_sequence(cands, batch_first=True, padding_value=0)
  return refs_pad,  refs_lens, cands_pad, cands_lens, torch.LongTensor(scores), torch.LongTensor(labels)

def get_index(word):
  if word in word2idx:
    return word2idx[word]
  else:
    return word2idx[unk_token]

def tokenize(text):
  doc = nlp(text.lower())
  tokens = [get_index(token.text) for token in doc]
  return tokens


In [384]:
from torch.utils.data import Dataset, DataLoader

class DatasetRnn(Dataset):
    def __init__(self, refs, candidates, scores, labels):
      self.refs = refs
      self.candidates = candidates
      self.scores = torch.LongTensor(scores)
      self.labels = torch.LongTensor(labels)
    
    def __len__(self):
      return len(self.labels)
    
    def __getitem__(self, index):
      cand = tokenize(self.candidates[index])
      ref = tokenize(self.refs[index])
      return torch.tensor(ref), torch.tensor(cand), self.scores[index], self.labels[index]

In [402]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

#submodule to output sentence encoding using Glove + LSTM
class SeqEmbedding(nn.Module):
  def __init__(self, embedding, word_vec_dim, h):
    super(SeqEmbedding, self).__init__()
    self.embedding = embedding
    self.word_vec_dim = word_vec_dim
    self.rnn = nn.LSTM(word_vec_dim, h, batch_first=True)
  
  def forward(self, inputs, original_lengths):
    # Embed using word embeddings and form a PackedSeq object
    embeds = self.embedding(inputs)
    packed_inputs = pack_padded_sequence(embeds, original_lengths, batch_first=True, enforce_sorted=False)
		
    # Passing in the input and hidden state into the model and obtaining outputs
    out_packed, _ = self.rnn(packed_inputs)
    out_padded, out_lengths = pad_packed_sequence(out_packed, batch_first=True)
    
    # Get the hidden layer for the final word
    last_h = []
    for i in range(len(out_lengths)):
       last_h.append(out_padded[i][out_lengths[i]-1].unsqueeze(0))
    last_h = torch.cat(last_h)

    return last_h

class Model(nn.Module):
  def __init__(self, weights, h=64): 
    super(Model, self).__init__()
    vocab_size, word_vec_dim = weights.shape
    self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
    self.candEmbedding = SeqEmbedding(self.embedding, word_vec_dim, h)
    self.refEmbedding = SeqEmbedding(self.embedding, word_vec_dim, h)
    self.ffnn1 = nn.Linear(h * 2 + 1, 2) #feature vector = [ref_emb, cand_emb, score]
    self.loss = nn.CrossEntropyLoss()
  
  def compute_loss(self, predicted_vector, gold_label):
    return self.loss(predicted_vector, gold_label)	
  
  def forward(self, refs, refs_lengths, cands, cand_lengths, scores):
    cand_emb = self.candEmbedding(cands, cand_lengths)
    refs_emb = self.refEmbedding(refs, refs_lengths)
    x1 = torch.cat((cand_emb, refs_emb, scores.unsqueeze(dim=1)), dim=1)
    x2 = self.ffnn1(x1)
    return x2

In [403]:
train_dataset = DatasetRnn(refs, cands, scores, labels)
train_set, val_set = torch.utils.data.random_split(train_dataset, [460, 124])
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(val_set, batch_size=16, collate_fn=pad_collate)
model = Model(weights, h = 64)

In [404]:
import torch.optim as optim
from tqdm.notebook import tqdm, trange
import torch, gc

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train_epoch(model, train_loader, optimizer):
  model.to(device)
  model.train()
  total_loss = 0
  for (refs, refs_lengths, cands, cand_lengths, scores, y) in tqdm(train_loader, leave=False, desc="Training"):
    output = model(refs.to(device), refs_lengths, cands.to(device), cand_lengths, scores.to(device))
    loss = model.compute_loss(output, y.to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss = loss.item()
  print(f'train loss: {total_loss}')

def validate(model, val_loader):
	model.eval()
	correct = 0
	total = 0
	for (refs, refs_lengths, cands, cand_lengths, scores, y) in tqdm(val_loader, leave=False, desc="Validation Batches"):
		output = model(refs.to(device), refs_lengths, cands.to(device), cand_lengths, scores.to(device))
		total += output.size()[0]
		_, predicted = torch.max(output, 1)
		correct += (y.to("cpu") == predicted.to("cpu")).cpu().numpy().sum()
	acc = correct/total
	print(f"validation accuracy: {acc}")

def train(number_of_epochs, model, train_loader, val_loader):
  optimizer = optim.Adam(model.parameters(), lr=5e-4)
  for epoch in trange(number_of_epochs, desc="Epochs"):
    train_epoch(model, train_loader, optimizer)
    validate(model, val_loader)

In [None]:
train(5, model, train_loader, val_loader)

In [391]:
test_dataset = DatasetRnn(refs_test, cands_test, scores_test, labels_test)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=pad_collate)

In [399]:
def predict(model, test_loader):
  model.to(device)
  predictions = []
  truth = []
  correct = 0
  for (refs, refs_lengths, cands, cand_lengths, scores, y) in tqdm(test_loader, leave=False, desc="Training"):
    output = model(refs.to(device), refs_lengths, cands.to(device), cand_lengths, scores.to(device))
    _, predicted = torch.max(output, 1)
    predictions += predicted.tolist()
    truth += y.tolist()
    correct += (y.to("cpu") == predicted.to("cpu")).cpu().numpy().sum()
  acc = correct/len(truth)
  print(f"test accuracy: {acc}")
  return predictions, truth

In [400]:
predictions, truth = predict(model, test_loader)

HBox(children=(FloatProgress(value=0.0, description='Training', max=11.0, style=ProgressStyle(description_widt…

test accuracy: 0.6436781609195402


In [401]:
h_f1, m_f1 = f1_metrics(predictions, truth)
print(f"human f1: {h_f1}")
print(f"machine f1: {m_f1}")

human f1: 0.4285714285714286
machine f1: 0.4296296296296296
