In [1]:
import pandas as pd
import numpy as np
import scipy
import math
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [2]:
def load_data_train(path):
    f = open(path, "r")
    X = []
    for line in f:
        s1, s2 = line.split('\t')
        X.append((s1, s2))
    f.close()
    return X

def load_data_dev(path):
    f = open(path, "r")
    X, y = [], []
    for line in f:
        s1, s2, t = line.split('\t')
        X.append((s1, s2))
        y.append(int(t[0]))
    f.close()
    return X, y

X_train = load_data_train("hw4_data/train.tsv")
X_dev, y_dev = load_data_dev("hw4_data/dev.tsv")
X_devtest, y_devtest = load_data_dev("hw4_data/devtest.tsv")

In [3]:
X_dev[50:]

[('What, what happened?', 'what, what, what...?'),
 ('This man is a crook.', 'it means the man is not a man.'),
 ('Do you wanna know what I would write?', 'you wanna die too?'),
 ('A bear.', 'bear.'),
 ('What do you think Neal wants?', 'do you think what I think you think?'),
 ("What's the matter with me, Nancy?", "what's the matter with you?"),
 ('"Do we have any idea what that big freighter is doing riding in orbit so close to the space station?"',
  '"do we have any idea what makes a large cargo ship in orbit so close to the space station?"'),
 ("So I see. There's a great band at El Chico, in the Village.",
  "there's a great band in the Village, in El Chico."),
 ('Already cuffed! about another mile on this road, turn left on Cabrillo.',
  'one more mile on this road and turn left on Cabrillo.'),
 ('Hummin smiled slightly.', 'hummin smiled weakly.'),
 ('Why do women always do that?', 'why do women keep doing this?'),
 ("I'm glad you're here to tell us these things.", "I'm glad to ha

[('What, what happened?', 'what, what, what...?'),
 ('This man is a crook.', 'it means the man is not a man.'),
 ('Do you wanna know what I would write?', 'you wanna die too?'),
 ('A bear.', 'bear.'),
 ('What do you think Neal wants?', 'do you think what I think you think?'),
 ("What's the matter with me, Nancy?", "what's the matter with you?"),
 ('"Do we have any idea what that big freighter is doing riding in orbit so close to the space station?"',
  '"do we have any idea what makes a large cargo ship in orbit so close to the space station?"'),
 ("So I see. There's a great band at El Chico, in the Village.",
  "there's a great band in the Village, in El Chico."),
 ('Already cuffed! about another mile on this road, turn left on Cabrillo.',
  'one more mile on this road and turn left on Cabrillo.'),
 ('Hummin smiled slightly.', 'hummin smiled weakly.'),
 ('Why do women always do that?', 'why do women keep doing this?'),
 ("I'm glad you're here to tell us these things.", "I'm glad to ha

## Preprocessing

In [4]:
import nltk
import re

STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        text = sentence.replace("‘", "'").replace("’", "'").lower()
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"\'s", " 's ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " , ", text)
        text = re.sub(r"\.\.\.", " ", text)
        text = re.sub(r"\.", " . ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r":", " : ", text)
        self.tokens = [t for t in nltk.word_tokenize(text)]
        self.tokens_without_stop = [t for t in self.tokens if t not in STOP]

## Generate Negative Example

In [5]:
import random
from nltk.corpus import wordnet 
inverse = ["are", "is", "am", "will", "would", "can"]

def to_negative(tokens):
    for i, token in enumerate(tokens):
        if token in inverse and i != len(tokens)-1:
            if tokens[i+1] == "not" or tokens[i+1] == "n't":
                return tokens[:i+1] + tokens[i+2:]
            else:
                return tokens[:i+1] + ["not"] + tokens[i+1:]
        for syn in wordnet.synsets(token): 
            for l in syn.lemmas(): 
                if l.antonyms(): 
                    tokens[i] = random.choice(l.antonyms()).name()
                    return tokens
    return []

def generate_n_negative(train, n):
    res = []
    for s1, s2 in train:
        sen1 = Sentence(s1)
        sen2 = Sentence(s2)
        if to_negative(sen1.tokens):
            res.append((' '.join(sen1.tokens), ' '.join(sen2.tokens)))
            n -= 1
        elif to_negative(sen2.tokens):
            res.append((' '.join(sen2.tokens), ' '.join(sen1.tokens)))
            n -= 1
        if not n:
            return res

In [6]:
NUM_POSITIVE = 20000
NUM_NEGATIVE = NUM_POSITIVE // 2
NUM_DATA = 2 * NUM_POSITIVE

X = X_train[:NUM_POSITIVE] \
    + list(zip([s1 for s1, _ in X_train[NUM_POSITIVE:NUM_POSITIVE+NUM_NEGATIVE]], [s2 for _, s2 in X_train[NUM_POSITIVE+NUM_NEGATIVE:NUM_POSITIVE+2*NUM_NEGATIVE]])) \
    + generate_n_negative(X_train[NUM_POSITIVE+2*NUM_NEGATIVE:], NUM_NEGATIVE)
y = [1 for _ in range(NUM_POSITIVE)] + [0 for _ in range(NUM_POSITIVE)]

## Pre-trained Word-Embeddings

In [7]:
import gensim

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api

# PATH_TO_WORD2VEC = os.path.expanduser("data/word2vec/GoogleNews-vectors-negative300.bin")
PATH_TO_GLOVE = os.path.expanduser("../glove.840B.300d.txt")

word2vec = api.load('word2vec-google-news-300')
tmp_file = "glove.840B.300d.w2v.txt"
# glove2word2vec(W2V_PATH, tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

W1209 04:17:42.074544 4489231808 utils_any2vec.py:157] duplicate word '����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������' in word2vec file, ignoring all but first
W1209 04:17:42.074544 4489231808 utils_any2vec.py:157] duplicate word '����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������' in word2vec file, ignoring all but first


In [8]:
import csv

PATH_TO_FREQUENCIES_FILE = "data/sentence_similarity/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "data/sentence_similarity/doc_frequencies.tsv"

def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader: 
            frequencies[row[0]] = int(row[1])
        
    return frequencies
        
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
doc_frequencies["NUM_DOCS"] = 1288431

## Sentence-Embedding Methods

In [9]:
from sklearn import svm

def run_experiment(X, y, embed_func, model=None, kernel=None, doc_freqs=None, freqs=None, gamma='auto'): 
     
    X_embed_train, y_embed_train = embed_func(X, y)
    
    X_embed_dev, y_embed_dev = embed_func(X_dev, y_dev)
    X_embed_devtest, y_embed_devtest = embed_func(X_devtest, y_devtest)
    
    gammas = [0.1, 0.01, 0.001, 0.0001]
    for g in gammas:
        classifier = svm.SVC(kernel=kernel, gamma=g)
        classifier.fit(X_embed_train, y_embed_train)
        print((classifier.score(X_embed_dev, y_embed_dev), classifier.score(X_embed_devtest, y_embed_devtest)))
    
#     return classifier.score(X_embed_dev, y_embed_dev), classifier.score(X_embed_devtest, y_embed_devtest)

### Baseline 

In [10]:
from collections import Counter
import math
from sklearn import svm, metrics

def get_avg_embeddings(X, y, model=None, doc_freqs=None):
    if doc_freqs is not None:
        N = doc_freqs["NUM_DOCS"]
    
    X_embed, y_embed = [], []
    for i, (s1, s2) in enumerate(X):
        sent1 = Sentence(s1)
        sent2 = Sentence(s2)
    
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]
        
        if len(tokens1) == 0 or len(tokens2) == 0:
            continue
        
        tokfreqs1 = Counter(tokens1)
        tokfreqs2 = Counter(tokens2)
        
        weights1 = [tokfreqs1[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs1] if doc_freqs else None
        weights2 = [tokfreqs2[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs2] if doc_freqs else None
                
        embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
        embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)

        X_embed.append(np.concatenate((embedding1, embedding2), axis=1)[0])
        y_embed.append(y[i])
        
    return X_embed, y_embed

In [11]:
X_embed_train, y_embed_train = get_avg_embeddings(X, y, model=word2vec, doc_freqs=doc_frequencies)
    
X_embed_dev, y_embed_dev = get_avg_embeddings(X_dev, y_dev, model=word2vec, doc_freqs=doc_frequencies)
X_embed_devtest, y_embed_devtest = get_avg_embeddings(X_devtest, y_devtest, model=word2vec, doc_freqs=doc_frequencies)

gammas = [0.1, 0.01, 0.001, 0.0001]
for g in gammas:
    classifier = svm.SVC(kernel='rbf', gamma=g)
    classifier.fit(X_embed_train, y_embed_train)
    print((classifier.score(X_embed_dev, y_embed_dev), classifier.score(X_embed_devtest, y_embed_devtest)))

(0.6842105263157895, 0.681592039800995)
(0.6842105263157895, 0.681592039800995)
(0.583843329253366, 0.5746268656716418)
(0.583843329253366, 0.5746268656716418)
(0.587515299877601, 0.599502487562189)
(0.587515299877601, 0.599502487562189)
(0.3684210526315789, 0.3681592039800995)
(0.3684210526315789, 0.3681592039800995)


### Smooth Inverse Frequency

In [12]:
from sklearn.decomposition import TruncatedSVD

def remove_first_principal_component(X):   
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_
    XX = X - X.dot(pc.transpose()) * pc
    return XX


import copy

def get_sif_embeddings(X, y, model=None, freqs={}, use_stoplist=False, a=0.001):
    total_freq = sum(freqs.values())
    
    embeddings = []
    y_embed = copy.deepcopy(y)
    
    # SIF requires us to first collect all sentence embeddings and then perform 
    # common component analysis.
    for i, (s1, s2) in enumerate(X): 
        
        sent1 = Sentence(s1)
        sent2 = Sentence(s2)
    
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]
        
        if len(tokens1) == 0 or len(tokens2) == 0:
            del y_embed[i]
            continue
        
        weights1 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens1]
        weights2 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens2]
        
        embedding1 = np.average([model[token] for token in tokens1], axis=0, weights=weights1)
        embedding2 = np.average([model[token] for token in tokens2], axis=0, weights=weights2)
        
        embeddings.append(embedding1)
        embeddings.append(embedding2)
    
    embeddings = remove_first_principal_component(np.array(embeddings))
    
    X_embed = [np.concatenate((embeddings[idx*2].reshape(1, -1), 
                              embeddings[idx*2+1].reshape(1, -1))[0])
            for idx in range(int(len(embeddings)/2))]
    
    return X_embed, y_embed

In [13]:
X_embed_train, y_embed_train = get_sif_embeddings(X, y, model=word2vec, freqs=frequencies)
    
X_embed_dev, y_embed_dev = get_sif_embeddings(X_dev, y_dev, model=word2vec, freqs=frequencies)
X_embed_devtest, y_embed_devtest = get_sif_embeddings(X_devtest, y_devtest, model=word2vec, freqs=frequencies)

gammas = [0.1, 0.01, 0.001, 0.0001]
for g in gammas:
    classifier = svm.SVC(kernel='rbf', gamma=g)
    classifier.fit(X_embed_train, y_embed_train)
    print((classifier.score(X_embed_dev, y_embed_dev), classifier.score(X_embed_devtest, y_embed_devtest)))

(0.5324357405140759, 0.5783582089552238)
(0.5324357405140759, 0.5783582089552238)
(0.5348837209302325, 0.5360696517412935)
(0.5348837209302325, 0.5360696517412935)
(0.572827417380661, 0.5621890547263682)
(0.572827417380661, 0.5621890547263682)
(0.3684210526315789, 0.3681592039800995)
(0.3684210526315789, 0.3681592039800995)


### InferSent

In [14]:
import torch
from models import InferSent

model_version = 1
MODEL_PATH = "../InferSent-master/encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = '../glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [15]:
def get_inf_embeddings(X, y):
    
    raw_sentences1 = [s1 for s1, _ in X]
    raw_sentences2 = [s2 for _, s2 in X]
    
    infersent.build_vocab(raw_sentences1 + raw_sentences2, tokenize=True)
    embeddings1 = infersent.encode(raw_sentences1, tokenize=True)
    embeddings2 = infersent.encode(raw_sentences2, tokenize=True)
    
    X_embed = []
    for (emb1, emb2) in zip(embeddings1, embeddings2): 
        X_embed.append(np.concatenate((emb1, emb2), axis=0))
        
    return X_embed, y

In [None]:
X_embed_train, y_embed_train = get_inf_embeddings(X, y)
    
X_embed_dev, y_embed_dev = get_inf_embeddings(X_dev, y_dev)
X_embed_devtest, y_embed_devtest = get_inf_embeddings(X_devtest, y_devtest)

gammas = [0.1, 0.01, 0.001, 0.0001]
for g in gammas:
    classifier = svm.SVC(kernel='rbf', gamma=g)
    classifier.fit(X_embed_train, y_embed_train)
    print((classifier.score(X_embed_dev, y_embed_dev), classifier.score(X_embed_devtest, y_embed_devtest)))

Found 38715(/44244) words with w2v vectors
Vocab size : 38715
Found 38715(/44244) words with w2v vectors
Vocab size : 38715
Found 2908(/3000) words with w2v vectors
Vocab size : 2908
Found 2908(/3000) words with w2v vectors
Vocab size : 2908
Found 2910(/3005) words with w2v vectors
Vocab size : 2910
Found 2910(/3005) words with w2v vectors
Vocab size : 2910
(0.7359413202933985, 0.7453416149068323)
(0.7359413202933985, 0.7453416149068323)
(0.7029339853300733, 0.7080745341614907)
(0.7029339853300733, 0.7080745341614907)
