In [13]:
import os
import gensim
# Set file names for train and test data
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [14]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [15]:
#train_corpus
#test_corpus

In [16]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)


In [17]:
model.build_vocab(train_corpus)

In [18]:
print(f"Word 'australia' appeared {model.wv.get_vecattr('australia', 'count')} times in the training corpus.")

Word 'australia' appeared 157 times in the training corpus.


In [19]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [20]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.1782975  -0.23964614 -0.0591245   0.26906997 -0.15845346 -0.03154698
  0.1788166   0.0704587  -0.1866261  -0.25290042  0.1130288  -0.0575174
 -0.0129105  -0.0464208  -0.08471344 -0.17739466  0.12153933  0.2483851
  0.14935376 -0.1553386   0.07355624  0.04982702  0.17672293 -0.00774762
  0.00541071 -0.00128167 -0.26569584 -0.04401922 -0.22405891  0.03789051
  0.47748864  0.08785544  0.19701801  0.09637775  0.20041974  0.16570048
  0.07937781 -0.24823871 -0.06655582  0.13243547  0.0256771   0.01165384
 -0.09564494 -0.16368495  0.25024927  0.06443783 -0.06886188 -0.12633361
  0.26553896  0.04259641]


In [21]:
sentence = ['only', 'you', 'can', 'prevent', 'forest', 'fires']
vec = 0
for element in sentence:
    element = model.infer_vector([element])
    vec = vec + element
    
vec=vec/len(sentence)
        
    

In [22]:
vec

array([-0.2894506 , -0.013359  , -0.06312678,  0.03533021, -0.0434816 ,
       -0.04575795,  0.11353608,  0.10210916, -0.08604094, -0.12737602,
        0.0329114 , -0.14699627, -0.02181017, -0.04626352, -0.08458331,
       -0.01893127,  0.12380761,  0.09419962, -0.00783238, -0.09704915,
        0.15301687,  0.07604394,  0.2701662 , -0.03957518,  0.09667271,
       -0.03059909, -0.18703473, -0.05757571, -0.12002975, -0.08114789,
        0.10336646, -0.01456281, -0.02218735,  0.1329421 , -0.02372598,
        0.06865006,  0.07907959, -0.04644291,  0.00589481,  0.03976449,
        0.13281037,  0.02488797, -0.09627198, -0.05304092,  0.20606233,
        0.08418542,  0.12716909, -0.11845258,  0.1461762 ,  0.02632777],
      dtype=float32)

In [23]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [24]:
second_ranks

[(48, 0.8956716656684875),
 (143, 0.7229611277580261),
 (21, 0.8727875351905823),
 (57, 0.692356288433075),
 (272, 0.7355289459228516),
 (84, 0.6316609382629395),
 (17, 0.8079784512519836),
 (46, 0.8275448083877563),
 (48, 0.882240355014801),
 (8, 0.7974414825439453),
 (264, 0.8062855005264282),
 (188, 0.7444909811019897),
 (26, 0.7111110091209412),
 (74, 0.7225245833396912),
 (44, 0.8715969324111938),
 (27, 0.8408768773078918),
 (139, 0.8611551523208618),
 (6, 0.8330761194229126),
 (71, 0.6782414317131042),
 (40, 0.7895879745483398),
 (150, 0.6874862313270569),
 (43, 0.8796451091766357),
 (179, 0.7734436392784119),
 (56, 0.749056339263916),
 (113, 0.6727308034896851),
 (10, 0.8503091335296631),
 (12, 0.7720584273338318),
 (15, 0.8752089738845825),
 (56, 0.7213158011436462),
 (173, 0.7903585433959961),
 (121, 0.7272587418556213),
 (251, 0.7230939269065857),
 (258, 0.7044265270233154),
 (8, 0.875551700592041),
 (12, 0.6668563485145569),
 (127, 0.7380872964859009),
 (224, 0.5419083833694

In [25]:
train_corpus[4]

TaggedDocument(words=['six', 'midwives', 'have', 'been', 'suspended', 'at', 'wollongong', 'hospital', 'south', 'of', 'sydney', 'for', 'inappropriate', 'use', 'of', 'nitrous', 'oxide', 'during', 'work', 'hours', 'on', 'some', 'occasions', 'while', 'women', 'were', 'in', 'labour', 'the', 'illawarra', 'area', 'health', 'service', 'says', 'that', 'following', 'an', 'investigation', 'of', 'unprofessional', 'conduct', 'further', 'four', 'midwives', 'have', 'been', 'relocated', 'to', 'other', 'areas', 'within', 'the', 'hospital', 'the', 'service', 'chief', 'executive', 'officer', 'tony', 'sherbon', 'says', 'no', 'one', 'was', 'put', 'at', 'risk', 'because', 'other', 'staff', 'not', 'involved', 'in', 'the', 'use', 'of', 'nitrous', 'oxide', 'were', 'able', 'to', 'take', 'over', 'caring', 'for', 'women', 'in', 'labour', 'well', 'we', 're', 'very', 'concerned', 'and', 'the', 'body', 'of', 'midwives', 'to', 'the', 'hospital', 'there', 'are', 'over', 'midwives', 'that', 'work', 'in', 'our', 'servic

In [26]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 292, 1: 8})


In [27]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [28]:
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)


SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):



In [29]:
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))


MOST (299, 0.9498270750045776): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with h

In [30]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (147): «the federal government is negotiating with the united states and other countries about the fate of an adelaide man suspected of fighting alongside the taliban in afghanistan david hicks was captured by northern alliance forces three days ago and is now in the custody of american troops the head of the victorian law institute john corcoran says mr hicks could face charge of treason under australian law the only crime punishable by death if the circumstances are permitted he could be charged with treason under the commonwealth crimes act which does have the death penalty he said if charged under the foreign incursion act the penalty is less severe carrying maximum year jail sentence but the defence minister robert hill says it is too early to say whether the government will take legal action these issues are being looked at but he was only captured few days ago and it difficult to ascertain all the facts senator hill said the government is currently holding talks w

In [31]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (46): «the river elbe surged to an all time record high friday flooding more districts of the historic city of dresden as authorities scrambled to evacuate tens of thousands of residents in the worst flooding to hit central europe in memory in the czech republic authorities were counting the cost of the massive flooding as people returned to the homes and the vlava river receded revealing the full extent of the damage to lives and landmarks»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (10, 0.755786120891571): «work is continuing this morning to restore power supplies to tens of thousands of homes that were blacked out during wild storms that struck south east queensland last night gale force winds uprooted trees and brought down power lines damaging homes and cars energex and ergon energy have had every available person working through the night to restore power at locations in and around brisbane west to toowoomba and north to the suns

### Challenge: Use the senator speeches in the folder 105-extracted-date and use doc2vec to find whose senator speech is closest to senator Biden. Use sen105kh_fix.csv and/or Wikipedia to validate your findings (i.e., understand if the most similar speeches are senators from the same state and/party).  Describe your findings. Compare with the outcome you got/will get using cosine similarity.

In [32]:
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html

In [33]:
import os
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
import warnings
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")
tqdm.pandas()
def format_congress_text(text_list):
    df = pd.DataFrame(text_list, columns=["text_raw"])

    df["text_raw"] = df["text_raw"].str.split("\n</TEXT>\n</DOC>\n\n<DOC>\n<DOCNO>")

    df = df.explode("text_raw")

    df["text_raw"] = df["text_raw"].str.replace("<DOC>\n<DOCNO>", "")

    df["meta"] = df["text_raw"].str.split("</DOCNO>\n<TEXT>\n").str[0]
    df["congress"] = df["meta"].str[:3]
    df["speaker"] = df["meta"].str.split("-").str[1]
    df["state"] = df["meta"].str.split("-").str[2]
    df["date"] = df["meta"].str.split("-").str[4]

    df["text_raw"] = (
        df["text_raw"]
        .str.split("</DOCNO>\n<TEXT>\n")
        .str[1]
        .str.strip()
        .str.split(".")
        .str[2:]
        .str.join(sep="")
        .str.strip()
    )

    df["text_raw"] = df["text_raw"] + " "

    # now join this back to congress / speaker level

    df = df.groupby(["congress", "speaker"])["text_raw"].sum().reset_index()

    # drop if a congressman did not speek
    df = df.loc[lambda x: x["text_raw"].apply(type) == str]
    return df

In [34]:
#os.chdir(r"C:\Users\molda\Downloads")
 

In [35]:
droplist = open("ML-for-NLP-main/Inputs/droplist.txt", encoding="utf-8", newline="\n").read()
droplist = [i.replace('"', "") for i in droplist.split("\n")]
stop_words = stopwords.words("english")
stopwords_final = droplist + stop_words + ['s','nt', "n't", "'s", "--"]

In [36]:
text_105 = [
    open("ML-for-NLP-main/Inputs/105-extracted-date/" + i, encoding="latin").read()
    for i in os.listdir("ML-for-NLP-main/Inputs/105-extracted-date/")
]

df = format_congress_text(text_105)

In [37]:
from nltk.tokenize import word_tokenize
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('s$|ies$')
def preprocessing_text(text):
    words = word_tokenize(text.lower())
    tokens = [word for word in words if word not in string.punctuation]
    tokens = [token for token in tokens if token not in stopwords_final]
    stemmer = RegexpStemmer('s$|ies$') 
    tokens_lematized = [stemmer.stem(word) for word in tokens]
    preprocessed_text = " ".join(tokens_lematized)
    return preprocessed_text
vectorizer = TfidfVectorizer(preprocessor = preprocessing_text)
dtm_tfidf = vectorizer.fit_transform(df["text_raw"])
df_tfidf = pd.DataFrame(dtm_tfidf.toarray(), columns=vectorizer.get_feature_names())

In [38]:
df

Unnamed: 0,congress,speaker,text_raw
0,105,abraham,"Mr President, during debate on final passage o..."
1,105,akaka,"Mr President, I am pleased that the Senate pas..."
2,105,allard,"Mr President, I rise to make a few remarks con..."
3,105,ashcroft,"Mr President, the Senate is not in order I wou..."
4,105,baucus,I understand that the House has sent the Senat...
...,...,...,...
94,105,thurmond,"Mr President, as the Senate considers HR 2263,..."
95,105,torricelli,"Mr President I thank Senator Snowe, Senator Mc..."
96,105,warner,"During the past two weeks, the Senate Armed Se..."
97,105,wellstone,"Mr President, today, I would like to call atte..."


In [39]:
"""import re

def listofwords(text):
    sentences = [re.sub(pattern=r"[\!'#$%&\*+,-./:;<=>?@^_`()|~=]", 
                        repl='', 
                        string=x
                       ).lower().strip().split(' ') for x in text.split('\n')]
    sentences = [x for x in sentences if x != ['']]
    sentences = [item for sublist in sentences for item in sublist]
    return sentences"""

'import re\n\ndef listofwords(text):\n    sentences = [re.sub(pattern=r"[\\!\'#$%&\\*+,-./:;<=>?@^_`()|~=]", \n                        repl=\'\', \n                        string=x\n                       ).lower().strip().split(\' \') for x in text.split(\'\n\')]\n    sentences = [x for x in sentences if x != [\'\']]\n    sentences = [item for sublist in sentences for item in sublist]\n    return sentences'

In [40]:
"""df['word_list']= df['text_raw'].apply(lambda x: listofwords(x))"""

"df['word_list']= df['text_raw'].apply(lambda x: listofwords(x))"

In [41]:
train=df.text_raw.values.tolist()

In [42]:
from gensim.models.doc2vec import TaggedDocument
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(train)]

In [43]:
len(tagged_data)

99

In [44]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=80)

In [45]:
model.build_vocab(tagged_data)

In [46]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=80)

In [47]:
model.save("d2v.model")

In [48]:
from gensim.models.doc2vec import Doc2Vec
model = Doc2Vec.load("d2v.model")

In [49]:
df[df.speaker=="biden"]

Unnamed: 0,congress,speaker,text_raw
6,105,biden,"Mr President, I am pleased that the Senate tod..."


In [50]:
tagged_data[6].words

['mr',
 'president',
 ',',
 'i',
 'am',
 'pleased',
 'that',
 'the',
 'senate',
 'today',
 'is',
 'passing',
 'the',
 'hatch-biden-lautenberg',
 'substitute',
 'amendment',
 'to',
 'hr',
 '4164',
 ',',
 'and',
 'i',
 'am',
 'hopeful',
 'that',
 'the',
 'other',
 'body',
 'will',
 'take',
 'up',
 'and',
 'pass',
 'the',
 'measure',
 'before',
 'congress',
 'adjourns',
 'for',
 'the',
 'year',
 'what',
 'this',
 'legislation',
 'does',
 'is',
 'simple',
 'under',
 'current',
 'federal',
 'law',
 ',',
 'states',
 'must',
 'give',
 'full',
 'faith',
 'and',
 'credit',
 'to',
 'the',
 'child',
 'custody',
 'orders',
 'of',
 'another',
 'state',
 'a',
 'custody',
 'order',
 'is',
 'defined',
 'as',
 'including',
 'a',
 'visitation',
 'order',
 'however',
 ',',
 'as',
 'evidence',
 'from',
 'around',
 'the',
 'country',
 'has',
 'shown',
 ',',
 'state',
 'courts',
 'often',
 'do',
 'not',
 'automatically',
 'recognize',
 'visitation',
 'orders',
 ',',
 'particularly',
 'when',
 'it',
 'is',
 

In [51]:
similar_doc = model.docvecs.most_similar(6)


In [52]:
print(similar_doc[0])

('27', 0.8169605135917664)


In [53]:
df.speaker.iloc[27]

'dewine'

Doc2vec suggests that the most similar speech to biden is by senator dewine.

In [54]:
print(str(model))
print(model.docvecs.most_similar(positive=[6], topn=20))

Doc2Vec(dm/m,d30,n5,w5,mc2,s0.001,t3)
[('27', 0.8169605135917664), ('48', 0.8006446957588196), ('38', 0.7826368808746338), ('34', 0.7747900485992432), ('65', 0.7662824988365173), ('15', 0.7255178093910217), ('77', 0.7114607095718384), ('3', 0.693520724773407), ('45', 0.6930956244468689), ('42', 0.6926093101501465), ('64', 0.6795579791069031), ('5', 0.6672008037567139), ('58', 0.6634320616722107), ('67', 0.6591944694519043), ('39', 0.6571681499481201), ('71', 0.6527324318885803), ('26', 0.6369012594223022), ('41', 0.6354937553405762), ('35', 0.5891538858413696), ('61', 0.5875715017318726)]


Below are the top5 most similar senator speeches by doc2vec:

In [55]:
df.speaker.iloc[[27, 48, 38, 34, 65]]

27      dewine
48       helms
38       glenn
34    feingold
65       levin
Name: speaker, dtype: object

In [56]:
top5=model.docvecs.most_similar(positive=[6], topn=5)
top5

[('27', 0.8169605135917664),
 ('48', 0.8006446957588196),
 ('38', 0.7826368808746338),
 ('34', 0.7747900485992432),
 ('65', 0.7662824988365173)]

In [57]:
D2V=model.docvecs.most_similar(positive=[6], topn=99)

In [58]:
d2vdf=pd.DataFrame(D2V, columns=["index","d2vsimilarity"])

In [59]:
import numpy as np
d2vdf = d2vdf.astype({"index": int, "d2vsimilarity": np.float16})

In [60]:
d2vdf=d2vdf.set_index("index").sort_index()

In [61]:
ddf=df.join(d2vdf)

In [62]:
ddf[ddf.speaker=="biden"]

Unnamed: 0,congress,speaker,text_raw,d2vsimilarity
6,105,biden,"Mr President, I am pleased that the Senate tod...",


In [63]:
biden=df.loc[df["speaker"] == "biden", "text_raw"].item()

In [64]:
biden = preprocessing_text(biden)

In [65]:
biden



In [66]:
from sklearn.metrics.pairwise import cosine_similarity
biden_vector = df_tfidf.loc[df["speaker"] == "biden"].values.reshape(1, -1)

docs = df_tfidf.values

similarity = []
for doc in docs:
    similarity.append(cosine_similarity(doc.reshape(1,-1), biden_vector))
ddf["text_similarity_to_biden"] = [i[0][0] for i in similarity]
df_105kh=ddf.sort_values(by=["text_similarity_to_biden"], ascending=False).head(12).merge(
    pd.read_csv("ML-for-NLP-main/Inputs/sen105kh_fix.csv", sep=";"),
    left_on="speaker",
    right_on="lname",
    how="left",
)
ddf.sort_values(by=["text_similarity_to_biden"], ascending=False).head(6).merge(
    pd.read_csv("ML-for-NLP-main/Inputs/sen105kh_fix.csv", sep=";"),
    left_on="speaker",
    right_on="lname",
    how="left",
)

Unnamed: 0,congress,speaker,text_raw,d2vsimilarity,text_similarity_to_biden,cong,lname,stateab,lstate,id,dist,party
0,105,biden,"Mr President, I am pleased that the Senate tod...",,1.0,105,biden,de,DELAWAR,14101,0,100
1,105,smith,"Mr President, I rise to indicate my strong sup...",0.470459,0.801164,105,smith,nh,NEW HAM,15116,0,200
2,105,smith,"Mr President, I rise to indicate my strong sup...",0.470459,0.801164,105,smith,or,OREGON,49705,0,200
3,105,roberts,I thank the Chair I thank the Presiding Office...,0.544922,0.790257,105,roberts,ks,KANSAS,14852,0,200
4,105,kerry,"Mr President, I would ask my distinguished col...",0.558594,0.786655,105,kerry,ma,MASSACH,14920,0,100
5,105,warner,"During the past two weeks, the Senate Armed Se...",0.583496,0.784996,105,warner,va,VIRGINI,14712,0,200
6,105,hutchison,I am happy to yield to the Senator from Indian...,0.54834,0.782028,105,hutchison,tx,TEXAS,49306,0,200


In [67]:
ddf.sort_values(by=["d2vsimilarity"], ascending=False).head(6).merge(
    pd.read_csv("ML-for-NLP-main/Inputs/sen105kh_fix.csv", sep=";"),
    left_on="speaker",
    right_on="lname",
    how="left",
)

Unnamed: 0,congress,speaker,text_raw,d2vsimilarity,text_similarity_to_biden,cong,lname,stateab,lstate,id,dist,party
0,105,dewine,"Mr President, HR 1023, the Ricky Ray Hemophili...",0.816895,0.705181,105,dewine,oh,OHIO,15020,0,200
1,105,helms,"Mr President, following Senate approval of the...",0.800781,0.379547,105,helms,nc,NORTH C,14105,0,200
2,105,glenn,"Mr President, I would like to thank my colleag...",0.782715,0.696281,105,glenn,oh,OHIO,14304,0,100
3,105,feingold,"Mr President, on October 15th, America lost it...",0.774902,0.754289,105,feingold,wi,WISCONS,49309,0,100
4,105,levin,"Mr President, on October 8th I made a statemen...",0.766113,0.76866,105,levin,mi,MICHIGA,14709,0,100
5,105,byrd,"Mr President, the United States Constitution c...",0.725586,0.778903,105,byrd,wv,WEST VI,1366,0,100


As it can be seen, top5 using d2vsimilarity and top5 using cosine similarity are completely different. There are no senators from DELAWAR in both top5 lists. They are also mostly from the other party in top5 by cosine similarity, but mostly from the same party in top5 by d2v similarity. Moreover, highly rated as similar by cosine similarity actually performed lower by d2v similarity, while highly rated as similar by d2v similarity also performed high on average (except for helms) by cosine similarity.