In [1]:
import pandas as pd
import numpy as np
import scipy
import math
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [2]:
def load_data_train(path):
    f = open(path, "r")
    X = []
    for line in f:
        s1, s2 = line.split('\t')
        X.append((s1, s2))
    f.close()
    return X

def load_data_dev(path):
    f = open(path, "r")
    X, y = [], []
    for line in f:
        s1, s2, t = line.split('\t')
        X.append((s1, s2))
        y.append(int(t[0]))
    f.close()
    return X, y

X_train = load_data_train("hw4_data/train.tsv")
X_dev, y_dev = load_data_dev("hw4_data/dev.tsv")
X_devtest, y_devtest = load_data_dev("hw4_data/devtest.tsv")

## Preprocessing

In [3]:
import nltk
import re

STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        text = sentence.replace("‘", "'").replace("’", "'").lower()
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"\'s", " 's ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " , ", text)
        text = re.sub(r"\.\.\.", " ", text)
        text = re.sub(r"\.", " . ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r":", " : ", text)
        self.tokens = [t for t in nltk.word_tokenize(text)]
        self.tokens_without_stop = [t for t in self.tokens if t not in STOP]

## Generate Negative Example

In [4]:
import random
from nltk.corpus import wordnet 
inverse = ["are", "is", "am", "will", "would", "can"]

def to_negative(tokens):
    for i, token in enumerate(tokens):
        if token in inverse and i != len(tokens)-1:
            if tokens[i+1] == "not" or tokens[i+1] == "n't":
                return tokens[:i+1] + tokens[i+2:]
            else:
                return tokens[:i+1] + ["not"] + tokens[i+1:]
        for syn in wordnet.synsets(token): 
            for l in syn.lemmas(): 
                if l.antonyms(): 
                    tokens[i] = random.choice(l.antonyms()).name()
                    return tokens
    return []

def generate_n_negative(train, n):
    res = []
    for s1, s2 in train:
        sen1 = Sentence(s1)
        sen2 = Sentence(s2)
        if to_negative(sen1.tokens):
            res.append((' '.join(sen1.tokens), ' '.join(sen2.tokens)))
            n -= 1
        elif to_negative(sen2.tokens):
            res.append((' '.join(sen2.tokens), ' '.join(sen1.tokens)))
            n -= 1
        if not n:
            return res

In [48]:
NUM_POSITIVE = 20000
NUM_NEGATIVE = NUM_POSITIVE // 2
NUM_DATA = 2 * NUM_POSITIVE

X = X_train[:NUM_POSITIVE] \
    + list(zip([s1 for s1, _ in X_train[NUM_POSITIVE:NUM_POSITIVE+NUM_NEGATIVE]], [s2 for _, s2 in X_train[NUM_POSITIVE+NUM_NEGATIVE:NUM_POSITIVE+2*NUM_NEGATIVE]])) \
    + generate_n_negative(X_train[NUM_POSITIVE+2*NUM_NEGATIVE:], NUM_NEGATIVE)
y = [1 for _ in range(NUM_POSITIVE)] + [0 for _ in range(NUM_POSITIVE)]

In [64]:
print(X[-NUM_NEGATIVE:])



## Pre-trained Word Embeddings

In [10]:
import gensim

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api

# PATH_TO_WORD2VEC = os.path.expanduser("data/word2vec/GoogleNews-vectors-negative300.bin")
PATH_TO_GLOVE = os.path.expanduser("../glove.840B.300d.txt")

word2vec = api.load('word2vec-google-news-300')
tmp_file = "glove.840B.300d.w2v.txt"
# glove2word2vec(W2V_PATH, tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

W1209 12:04:09.554851 4562230720 utils_any2vec.py:157] duplicate word '����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������' in word2vec file, ignoring all but first


In [11]:
import csv

PATH_TO_FREQUENCIES_FILE = "data/sentence_similarity/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "data/sentence_similarity/doc_frequencies.tsv"

def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader: 
            frequencies[row[0]] = int(row[1])
        
    return frequencies
        
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
doc_frequencies["NUM_DOCS"] = 1288431

## Neural Network Model

In [6]:
import string
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [37]:
class ParaphraseIdentification(nn.Module):

    def __init__(self, num_class, embedding_dim):
        super(ParaphraseIdentification, self).__init__()
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, num_class)

    def forward(self, inputs):
        embeds = inputs.view((1, -1))
        out = F.tanh(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

def evaluate(model, X_data, y_data):
    model.eval()
    correct = all_ = 0
    for i, x in enumerate(X_data):
#         print(model(torch.Tensor(x)))
        if int(torch.argmax(model(torch.Tensor(x)))) == y_data[i]:
            correct += 1
        all_ += 1
#         result = [int(torch.argmax(model(torch.Tensor(x)))) == y for x, y in zip(X_data, y_data)]
    return correct / all_

## Sentence Embedding

### Baseline

In [21]:
from collections import Counter
import math
from sklearn import svm, metrics

def get_avg_embeddings(X, y, model=None, doc_freqs=None):
    if doc_freqs is not None:
        N = doc_freqs["NUM_DOCS"]
    
    X_embed, y_embed = [], []
    for i, (s1, s2) in enumerate(X):
        sent1 = Sentence(s1)
        sent2 = Sentence(s2)
    
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]
        
        if len(tokens1) == 0 or len(tokens2) == 0:
            continue
        
        tokfreqs1 = Counter(tokens1)
        tokfreqs2 = Counter(tokens2)
        
        weights1 = [tokfreqs1[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs1] if doc_freqs else None
        weights2 = [tokfreqs2[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs2] if doc_freqs else None
                
        embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
        embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)

        X_embed.append(np.concatenate((embedding1, embedding2), axis=1)[0])
        y_embed.append(y[i])
        
    return X_embed, y_embed

In [59]:
X_embed_train, y_embed_train = get_avg_embeddings(X, y, model=word2vec, doc_freqs=doc_frequencies)
X_embed_dev, y_embed_dev = get_avg_embeddings(X_dev, y_dev, model=word2vec, doc_freqs=doc_frequencies)
X_embed_devtest, y_embed_devtest = get_avg_embeddings(X_devtest, y_devtest, model=word2vec, doc_freqs=doc_frequencies)

EMBEDDING_DIM = len(X_embed_train[0])
NUM_DATA = len(X_embed_train)

loss_function = nn.NLLLoss()
model = ParaphraseIdentification(2, EMBEDDING_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.02)
best_acc = 0.0

In [60]:
shuffle = list(range(NUM_DATA))
random.shuffle(shuffle)
for epoch in range(30):
    model.train()
    print('training epoch %d'%(epoch))
    for i in shuffle:
        embed = torch.Tensor(X_embed_train[i])
        model.zero_grad()
#         print(embed)
        log_probs = model(embed)
#         print(log_probs)
        loss = loss_function(log_probs, torch.LongTensor([y_embed_train[i]]))
#         print(y_embed_train[i])
        loss.backward()
        optimizer.step()
        
        if i % 10000 == 0:
            model.eval()
            acc = evaluate(model, X_embed_dev, y_embed_dev)
            if acc > best_acc:
                best_acc = acc
                best_test_acc = evaluate(model, X_embed_devtest, y_embed_devtest)
    
    model.eval()
    train_acc = evaluate(model, X_embed_train, y_embed_train)
    acc = evaluate(model, X_embed_dev, y_embed_dev)
    if acc > best_acc:
        best_acc = acc
        best_test_acc = evaluate(model, X_embed_devtest, y_embed_devtest)
    print("train acc: %5.5f, best dev acc: %5.5f, best devtest acc: %5.5f"%(train_acc, best_acc, best_test_acc))
    

training epoch 0
train acc: 0.56596, best dev acc: 0.50673, best devtest acc: 0.52114
training epoch 1
train acc: 0.56355, best dev acc: 0.55324, best devtest acc: 0.54851
training epoch 2
train acc: 0.54967, best dev acc: 0.56181, best devtest acc: 0.54602
training epoch 3
train acc: 0.54045, best dev acc: 0.56181, best devtest acc: 0.54602
training epoch 4
train acc: 0.53559, best dev acc: 0.56181, best devtest acc: 0.54602
training epoch 5
train acc: 0.53208, best dev acc: 0.56181, best devtest acc: 0.54602
training epoch 6
train acc: 0.53107, best dev acc: 0.56181, best devtest acc: 0.54602
training epoch 7
train acc: 0.53045, best dev acc: 0.56181, best devtest acc: 0.54602
training epoch 8
train acc: 0.55405, best dev acc: 0.56181, best devtest acc: 0.54602
training epoch 9
train acc: 0.56977, best dev acc: 0.59731, best devtest acc: 0.59577
training epoch 10
train acc: 0.57225, best dev acc: 0.61689, best devtest acc: 0.62438
training epoch 11
train acc: 0.57661, best dev acc: 0

### Smooth Inverse Frequency

In [61]:
from sklearn.decomposition import TruncatedSVD

def remove_first_principal_component(X):   
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_
    XX = X - X.dot(pc.transpose()) * pc
    return XX


import copy

def get_sif_embeddings(X, y, model=None, freqs={}, use_stoplist=False, a=0.001):
    total_freq = sum(freqs.values())
    
    embeddings = []
    y_embed = copy.deepcopy(y)
    
    # SIF requires us to first collect all sentence embeddings and then perform 
    # common component analysis.
    for i, (s1, s2) in enumerate(X): 
        
        sent1 = Sentence(s1)
        sent2 = Sentence(s2)
    
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]
        
        if len(tokens1) == 0 or len(tokens2) == 0:
            del y_embed[i]
            continue
        
        weights1 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens1]
        weights2 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens2]
        
        embedding1 = np.average([model[token] for token in tokens1], axis=0, weights=weights1)
        embedding2 = np.average([model[token] for token in tokens2], axis=0, weights=weights2)
        
        embeddings.append(embedding1)
        embeddings.append(embedding2)
    
    embeddings = remove_first_principal_component(np.array(embeddings))
    
    X_embed = [np.concatenate((embeddings[idx*2].reshape(1, -1), 
                              embeddings[idx*2+1].reshape(1, -1))[0])
            for idx in range(int(len(embeddings)/2))]
    
    return X_embed, y_embed

In [62]:
X_embed_train, y_embed_train = get_sif_embeddings(X, y, model=word2vec, freqs=frequencies)
    
X_embed_dev, y_embed_dev = get_sif_embeddings(X_dev, y_dev, model=word2vec, freqs=frequencies)
X_embed_devtest, y_embed_devtest = get_sif_embeddings(X_devtest, y_devtest, model=word2vec, freqs=frequencies)

EMBEDDING_DIM = len(X_embed_train[0])
NUM_DATA = len(X_embed_train)

loss_function = nn.NLLLoss()
model = ParaphraseIdentification(2, EMBEDDING_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.02)
best_acc = 0.0

In [63]:
shuffle = list(range(NUM_DATA))
random.shuffle(shuffle)
for epoch in range(30):
    model.train()
    print('training epoch %d'%(epoch))
    for i in shuffle:
        embed = torch.Tensor(X_embed_train[i])
        model.zero_grad()
#         print(embed)
        log_probs = model(embed)
#         print(log_probs)
        loss = loss_function(log_probs, torch.LongTensor([y_embed_train[i]]))
#         print(y_embed_train[i])
        loss.backward()
        optimizer.step()
        
        if i % 10000 == 0:
            model.eval()
            acc = evaluate(model, X_embed_dev, y_embed_dev)
            if acc > best_acc:
                best_acc = acc
                best_test_acc = evaluate(model, X_embed_devtest, y_embed_devtest)
    
    model.eval()
    train_acc = evaluate(model, X_embed_train, y_embed_train)
    acc = evaluate(model, X_embed_dev, y_embed_dev)
    if acc > best_acc:
        best_acc = acc
        best_test_acc = evaluate(model, X_embed_devtest, y_embed_devtest)
    print("train acc: %5.5f, best dev acc: %5.5f, best devtest acc: %5.5f"%(train_acc, best_acc, best_test_acc))
    

training epoch 0




train acc: 0.53741, best dev acc: 0.53856, best devtest acc: 0.53109
training epoch 1
train acc: 0.54396, best dev acc: 0.53856, best devtest acc: 0.53109
training epoch 2
train acc: 0.54591, best dev acc: 0.53856, best devtest acc: 0.53109
training epoch 3
train acc: 0.54641, best dev acc: 0.53856, best devtest acc: 0.53109
training epoch 4
train acc: 0.54651, best dev acc: 0.53856, best devtest acc: 0.53109
training epoch 5
train acc: 0.54691, best dev acc: 0.53856, best devtest acc: 0.53109
training epoch 6
train acc: 0.54781, best dev acc: 0.53978, best devtest acc: 0.53358
training epoch 7
train acc: 0.54852, best dev acc: 0.53978, best devtest acc: 0.53358
training epoch 8
train acc: 0.54844, best dev acc: 0.53978, best devtest acc: 0.53358
training epoch 9
train acc: 0.54804, best dev acc: 0.53978, best devtest acc: 0.53358
training epoch 10
train acc: 0.54786, best dev acc: 0.53978, best devtest acc: 0.53358
training epoch 11
train acc: 0.54802, best dev acc: 0.53978, best devt

### InferSent

In [54]:
from models import InferSent

model_version = 1
MODEL_PATH = "../InferSent-master/encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = '../glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [55]:
def get_inf_embeddings(X, y):
    
    raw_sentences1 = [s1 for s1, _ in X]
    raw_sentences2 = [s2 for _, s2 in X]
    
    infersent.build_vocab(raw_sentences1 + raw_sentences2, tokenize=True)
    embeddings1 = infersent.encode(raw_sentences1, tokenize=True)
    embeddings2 = infersent.encode(raw_sentences2, tokenize=True)
    
    X_embed = []
    for (emb1, emb2) in zip(embeddings1, embeddings2): 
        X_embed.append(np.concatenate((emb1, emb2), axis=0))
        
    return X_embed, y

In [56]:
X_embed_train, y_embed_train = get_inf_embeddings(X, y)
    
X_embed_dev, y_embed_dev = get_inf_embeddings(X_dev, y_dev)
X_embed_devtest, y_embed_devtest = get_inf_embeddings(X_devtest, y_devtest)

EMBEDDING_DIM = len(X_embed_train[0])
NUM_DATA = len(X_embed_train)

loss_function = nn.NLLLoss()
model = ParaphraseIdentification(2, EMBEDDING_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.02)
best_acc = 0.0

Found 38715(/44244) words with w2v vectors
Vocab size : 38715
Found 2908(/3000) words with w2v vectors
Vocab size : 2908
Found 2910(/3005) words with w2v vectors
Vocab size : 2910


In [58]:
shuffle = list(range(NUM_DATA))
random.shuffle(shuffle)
for epoch in range(30):
    model.train()
    print('training epoch %d'%(epoch))
    for i in shuffle:
        embed = torch.Tensor(X_embed_train[i])
        model.zero_grad()
#         print(embed)
        log_probs = model(embed)
#         print(log_probs)
        loss = loss_function(log_probs, torch.LongTensor([y_embed_train[i]]))
#         print(y_embed_train[i])
        loss.backward()
        optimizer.step()
        
        if i % 10000 == 0:
            model.eval()
            acc = evaluate(model, X_embed_dev, y_embed_dev)
            if acc > best_acc:
                best_acc = acc
                best_test_acc = evaluate(model, X_embed_devtest, y_embed_devtest)
    
    model.eval()
    train_acc = evaluate(model, X_embed_train, y_embed_train)
    acc = evaluate(model, X_embed_dev, y_embed_dev)
    if acc > best_acc:
        best_acc = acc
        best_test_acc = evaluate(model, X_embed_devtest, y_embed_devtest)
    print("train acc: %5.5f, best dev acc: %5.5f, best devtest acc: %5.5f"%(train_acc, best_acc, best_test_acc))
    

training epoch 0
train acc: 0.96145, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 1
train acc: 0.97270, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 2
train acc: 0.97940, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 3
train acc: 0.98137, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 4
train acc: 0.98118, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 5
train acc: 0.98295, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 6
train acc: 0.98620, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 7
train acc: 0.98680, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 8
train acc: 0.98693, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 9
train acc: 0.98652, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 10
train acc: 0.98760, best dev acc: 0.77384, best devtest acc: 0.76894
training epoch 11
train acc: 0.98933, best dev acc: 0