In [51]:
import os


def get_files(directory):
    """Return the names of every file found beneath ``directory``.

    The tree is traversed recursively with ``os.walk``. Only bare file names
    are collected (no directory component), so a file in a nested folder is
    indistinguishable from a top-level one in the result.

    Args:
        directory: path of the directory to scan.

    Returns:
        A list of file names, in the order ``os.walk`` yields them.

    Raises:
        RuntimeError: if ``directory`` does not exist.
    """
    if os.path.exists(directory):
        return [
            name
            for _root, _subdirs, names in os.walk(directory)
            for name in names
        ]

    raise RuntimeError(f"{directory} does not exist.")

# Sanity check: list every file name found under the STS training split.
get_files("../data/inference/stsb1/train")

['00-readme.txt',
 'correlation.pl',
 'STS.gs.MSRpar.txt',
 'STS.gs.MSRvid.txt',
 'STS.gs.SMTeuroparl.txt',
 'STS.input.MSRpar.txt',
 'STS.input.MSRvid.txt',
 'STS.input.SMTeuroparl.txt',
 'STS.output.MSRpar.txt']

In [83]:
import pandas as pd
import re
import sys


def load_sts_subset(input_fname, gs_fname, directory):
    """Load one STS subset (sentence pairs plus gold scores) into a frame.

    Args:
        input_fname: name of the ``STS.input.<suffix>.txt`` sentence file.
        gs_fname: name of the matching ``STS.gs.<suffix>.txt`` score file.
        directory: directory that contains both files.

    Returns:
        A DataFrame with columns ``sent1``, ``sent2``, ``score`` and
        ``data_suffix`` (the ``<suffix>`` part of the file names). The
        sentence and score frames are joined column-wise by row position.

    Raises:
        IndexError: if either file name does not match the STS naming scheme.
        AssertionError: if the two file names carry different suffixes.
    """
    # The capture group is the dataset id between "STS.(gs|input)." and ".txt".
    suffix_pattern = re.compile(r"STS\.(?:gs|input)\.((?:\w{1,}\.)?\w{1,})\.txt")
    input_suffix, gs_suffix = (
        suffix_pattern.findall(fname)[0] for fname in (input_fname, gs_fname)
    )

    assert input_suffix == gs_suffix, "Suffixes don't match, logic wrong"

    sentence_path = os.path.join(directory, input_fname)
    score_path = os.path.join(directory, gs_fname)

    # A multi-character separator is treated as a regex by the python engine.
    sentence_frame = pd.read_csv(
        sentence_path,
        sep="\\t",
        names=["sent1", "sent2"],
        engine="python",
    )
    # presumably one score per line -- verify against the data files
    score_frame = pd.read_csv(score_path, names=["score"])

    combined = pd.concat([sentence_frame, score_frame], axis=1)
    combined["data_suffix"] = input_suffix
    return combined


def load_sts_from_dir(data_dir):
    """Load every STS subset in ``data_dir`` that has a gold-standard file.

    File names matching ``STS.input.<suffix>.txt`` are treated as sentence
    files; the expected score file name is derived by swapping ``input`` for
    ``gs``. Sentence files without a score file are reported and skipped.

    Args:
        data_dir: directory holding the STS input and gold-standard files.

    Returns:
        The concatenation of all loaded subset frames. Row indices of the
        individual subsets are kept as-is (no ``ignore_index``).
    """
    all_names = get_files(data_dir)
    known_names = set(all_names)
    input_pattern = re.compile(r"STS\.input\.(\w{1,}\.)?(\w{1,})\.txt")

    frames = []
    for input_name in all_names:
        # only files that match the sts input data format are considered
        if input_pattern.search(input_name) is None:
            continue

        gold_name = re.sub(r"(?<=STS\.)input", "gs", input_name)
        if gold_name in known_names:
            frames.append(load_sts_subset(input_name, gold_name, data_dir))
        else:
            # no gold standard available for this input file
            print(f"skipping: {input_name}, cant find GS equivalent")

    return pd.concat(frames)


def load_stsb_data(base_dir="../data/inference/stsb1"):
    """Load the STS train and test splits located under ``base_dir``.

    Args:
        base_dir: directory containing the ``train`` and ``test-gold``
            sub-directories.

    Returns:
        A ``(train_frame, test_frame)`` tuple, each produced by
        ``load_sts_from_dir``.
    """
    return (
        load_sts_from_dir(f"{base_dir}/train"),
        load_sts_from_dir(f"{base_dir}/test-gold"),
    )

# Load both splits from the default base directory.
train_sentences, test_sentences = load_stsb_data()

# Display the training frame as the cell output.
train_sentences

Unnamed: 0,sent1,sent2,score,data_suffix
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,4.00,MSRpar
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...,3.75,MSRpar
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl...",2.80,MSRpar
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent...",3.40,MSRpar
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,2.40,MSRpar
...,...,...,...,...
729,"Action is needed quickly, which is why we deci...",It is urgent and that is why we have decided t...,5.00,SMTeuroparl
730,One could indeed wish for more and for improve...,"We can actually want more and better, but I th...",4.80,SMTeuroparl
731,(Parliament accepted the oral amendment),(Parliament accepted the oral amendment),5.00,SMTeuroparl
732,- My party has serious reservations about Comm...,My party serious reservations about the regula...,4.80,SMTeuroparl


In [95]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords


def tokenize_documents(sentences):
    """Lower-case each sentence and split it on whitespace.

    Args:
        sentences: iterable of strings.

    Returns:
        A list with one list of lower-cased tokens per input sentence.
    """
    tokenized = []
    for sentence in sentences:
        normalized = sentence.strip().lower()
        tokenized.append(normalized.split())
    return tokenized


def train_doc2vec(sentences, seed=0xDECAF, **doc2vec_kwargs):
    """Train a gensim ``Doc2Vec`` model with one document per sentence.

    Sentences are tokenized with ``tokenize_documents`` (lower-cased,
    whitespace split) and each document is tagged with its integer position
    in ``sentences``. No stop-word filtering is applied, which matches what
    ``infer_doc2vec`` does at inference time.

    Args:
        sentences: iterable of raw sentence strings.
        seed: seed forwarded to ``Doc2Vec``.
        **doc2vec_kwargs: any further keyword arguments for ``Doc2Vec``
            (e.g. ``vector_size``, ``window``, ``epochs``).

    Returns:
        The ``Doc2Vec`` instance built from the tagged documents.
    """
    tokenized = tokenize_documents(sentences)
    train_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized)]

    # NOTE(review): per the gensim docs a fixed seed alone is presumably not
    # enough for bit-identical runs when workers > 1 -- confirm if needed.
    return Doc2Vec(train_docs, seed=seed, **doc2vec_kwargs)

# Stack both sentence columns into a single corpus, so every sentence of every
# pair becomes its own training document.
train_documents = np.concatenate((train_sentences.sent1.values, train_sentences.sent2.values))
# NOTE(review): with workers=4 the result is presumably not exactly
# reproducible between runs despite the fixed seed -- confirm against gensim docs.
doc2vec_model = train_doc2vec(
    train_documents,
    vector_size=100,
    window=2,
    min_count=1,
    workers=4,
    epochs=200)

doc2vec_model

<gensim.models.doc2vec.Doc2Vec at 0x2d98296d490>

In [96]:
def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``. Uses the ``np``
    namespace so the function does not depend on names imported in a later
    cell.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def infer_doc2vec(d2v_model, sentences):
    """Infer one embedding per sentence using ``d2v_model``.

    Each sentence is tokenized with ``tokenize_documents`` and its token list
    is passed to ``d2v_model.infer_vector``.

    Args:
        d2v_model: object exposing ``infer_vector(tokens)``.
        sentences: iterable of raw sentence strings.

    Returns:
        A list of inferred vectors, in the same order as ``sentences``.
    """
    embeddings = []
    for tokens in tokenize_documents(sentences):
        embeddings.append(d2v_model.infer_vector(tokens))
    return embeddings

# Embed the two sentence columns separately; position i in each list comes
# from the same row of train_sentences, so the lists are pair-aligned.
sent1_embeddings = infer_doc2vec(doc2vec_model, train_sentences.sent1.values)
sent2_embeddings = infer_doc2vec(doc2vec_model, train_sentences.sent2.values)

# Spot-check the first pair: both vectors and their cosine similarity.
print(sent1_embeddings[0])
print(sent2_embeddings[0])
print(cos_sim(sent1_embeddings[0], sent2_embeddings[0]))

[-0.27336246  0.01929293 -0.43676952  0.6697727  -0.677531    1.1908083
  0.31147757 -0.6999784  -0.43487403 -0.38292193  0.5500924  -0.26284248
  0.40142235  0.01385904 -0.17844252 -1.4803333   0.5652604  -0.41115785
 -1.1323328  -0.17272663  0.22103062 -0.5254174   0.02997912 -0.6717148
  0.12126261 -0.5301277   0.31409562  0.26459464 -0.6107148  -0.11397837
 -0.02911393 -0.47717014  0.02739717  0.0538495   1.2027074  -0.14654395
 -0.19840205 -0.5300882  -0.12192903  0.32900253  0.64409846 -0.16664167
 -0.8011557  -0.355054   -0.4767804   0.7084554  -1.1959565   0.7588401
 -0.31184205 -0.5281979  -0.44490167  0.04867037  0.75815904 -0.43769515
 -0.31825233 -0.49941498 -0.10314978 -0.49450213  0.33938453  0.03741687
  0.19376202  0.13575885 -0.9750473  -1.0288007  -0.6467795  -0.702557
  0.1861674   0.4833676  -0.7172501   0.14684117  0.24359186  0.24011372
 -0.5823613   0.7358891   0.4841842  -0.58848137 -1.1988727  -0.15141842
 -0.43829936  0.25514224 -0.39504713 -1.0776379   0.4051

In [97]:
from numpy import dot, ndarray
from numpy.linalg import norm


def compare_embeddings(embedding_one, embedding_two):
    """Cosine-compare two embeddings, or collections of embeddings.

    An argument counts as a collection of embeddings when its first element
    is a ``list`` or ``ndarray``. Supported combinations:

    * collection vs collection: element-wise comparison, returned as a
      numpy array (both collections must have the same length);
    * single vs collection: ``embedding_one`` against every entry of
      ``embedding_two``, returned as a plain list;
    * single vs single: one similarity value.

    Raises:
        RuntimeError: if the collections differ in length, or if only
            ``embedding_one`` is a collection.
    """
    one_is_batch = isinstance(embedding_one[0], (list, ndarray))
    two_is_batch = isinstance(embedding_two[0], (list, ndarray))

    # embedding one is a list of embeddings but embedding two is one embedding
    if one_is_batch and not two_is_batch:
        raise RuntimeError(
            "Either Embedding one and two are lists of embeddings, else only embedding two."
        )

    # compare embedding one against embedding two
    if not two_is_batch:
        return cos_sim(embedding_one, embedding_two)

    # embedding one compared against all of embedding two
    if not one_is_batch:
        return [cos_sim(embedding_one, candidate) for candidate in embedding_two]

    # pairwise compare all embeddings
    if len(embedding_one) != len(embedding_two):
        raise RuntimeError(
            "embedding one and embedding two lists not of equal length"
        )
    return np.array(list(map(cos_sim, embedding_one, embedding_two)))


def rescale_numeric(cos_dist, cur_min=-1, cur_max=1, new_min=0, new_max=5):
    """Linearly map a value from ``[cur_min, cur_max]`` to ``[new_min, new_max]``.

    Args:
        cos_dist: scalar or numpy array of measurements on the current scale.
        cur_min: lower bound of the current scale.
        cur_max: upper bound of the current scale.
        new_min: lower bound of the target scale.
        new_max: upper bound of the target scale.

    Returns:
        The measurement(s) expressed on the target scale; ``cur_min`` maps to
        ``new_min`` and ``cur_max`` maps to ``new_max``.
    """
    # fraction of the way along the current scale
    cur_perc = (cos_dist - cur_min) / (cur_max - cur_min)

    # Stretch by the width of the new range, then shift by its lower bound.
    # The offset must be added after the multiplication; folding it into the
    # factor is only correct when new_min == 0.
    return cur_perc * (new_max - new_min) + new_min


def get_embedding_distances(s1_embeddings, s2_embeddings, scale=True):
    """Compare embeddings and optionally rescale the result.

    Despite the name, the values come from ``compare_embeddings`` and are
    cosine similarities (higher means more alike).

    Args:
        s1_embeddings: a single embedding or a collection of embeddings.
        s2_embeddings: a single embedding or a collection of embeddings.
        scale: when true, pass the similarities through ``rescale_numeric``
            with its default ranges; when false, return them untouched.

    Returns:
        The raw output of ``compare_embeddings`` if ``scale`` is false,
        otherwise the rescaled similarities.
    """
    distances = compare_embeddings(s1_embeddings, s2_embeddings)

    # return cosine similarity unscaled
    if not scale:
        return distances

    # compare_embeddings returns a plain list in the one-vs-many case, and a
    # list does not support the arithmetic rescale_numeric performs.
    return rescale_numeric(np.asarray(distances))


# Score every training sentence pair (rescaled) and preview the first 20.
embedding_scores = get_embedding_distances(sent1_embeddings, sent2_embeddings, scale=True)
embedding_scores[:20]

array([4.2903495, 3.9164648, 4.30869  , 4.4133883, 3.5119689, 4.0685463,
       4.149051 , 4.253704 , 4.111351 , 4.250328 , 4.3949933, 4.8942895,
       4.591283 , 4.8508935, 4.5573044, 4.6417017, 4.3985767, 4.24268  ,
       3.9623451, 4.7027354], dtype=float32)

In [98]:
# Number of scored sentence pairs.
len(embedding_scores)

2234

In [100]:
from scipy.stats import pearsonr


# todo: proper benchmark
def eval_embedding_correlation(t, truth_vector):
    """Return the Pearson correlation coefficient between two score vectors.

    Args:
        t: predicted scores.
        truth_vector: reference scores of the same length.

    Returns:
        The first element of the ``pearsonr`` result (the coefficient); the
        accompanying p-value is discarded.
    """
    correlation, _p_value = pearsonr(t, truth_vector)
    return correlation

# Pearson correlation between the embedding scores and the gold-standard
# scores of the training frame.
eval_embedding_correlation(embedding_scores, train_sentences.score.values)

0.3679310312104844

In [None]:
# todo: end-to-end pipeline for benchmarking models
#  def embedding_pipeline(tr, test, encode_func, preprocess_func):