In [1]:
import pandas as pd
import glob
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial
from myfuncs import my_tokenizer

In [2]:
# Load every "*_fulltext.csv" export under data/ and stack them into one frame.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make the row order of `df` reproducible across machines.
paths = sorted(glob.glob("data/*_fulltext.csv"))
if not paths:
    # pd.concat([]) would fail with a generic "No objects to concatenate";
    # fail early with a message that says what is actually missing.
    raise ValueError("No files matching data/*_fulltext.csv were found")

# One DataFrame per input file; the first CSV column is used as the index and
# is discarded again by reset_index(drop=True) after concatenation.
lst = [pd.read_csv(path, index_col=0) for path in paths]

df = pd.concat(lst).reset_index(drop=True)

# Tokenize each document's full text with the project helper.
# NOTE(review): my_tokenizer is defined in myfuncs; presumably it returns a
# list of string tokens per document -- confirm against its implementation.
df["tokens"] = df["fulltext"].apply(my_tokenizer)


In [3]:
# Map each fulltext_id to its token sequence: {fulltext_id: tokens}.
# If a fulltext_id occurs more than once, the dict keeps only one entry for it.
docs = df.set_index("fulltext_id")["tokens"].to_dict()

In [4]:
# Wrap every token sequence in a TaggedDocument whose single tag is the
# stringified document id, so the trained vector can be looked up by that tag.
documents = [
    TaggedDocument(tokens, [str(doc_id)])
    for doc_id, tokens in docs.items()
]

In [5]:
# Train a Doc2Vec model on the tagged corpus, producing 5-dimensional
# document vectors.
# NOTE(review): training runs on 4 worker threads; multi-threaded gensim
# training is presumably not bit-for-bit reproducible between runs -- confirm
# whether exact reproducibility matters for the exported similarities.
model = Doc2Vec(
    documents,
    vector_size=5,
    window=2,
    min_count=1,
    workers=4,
)

In [6]:
# Collect the trained vector of every document, keyed by the original
# (non-stringified) document id: {doc_id: {"vec": vector}}.
# The model stores vectors under the string tag, hence the str() on lookup.
bigdct = {doc_id: {"vec": model.dv[str(doc_id)]} for doc_id in docs}


In [7]:
# One row per document id, with a single "vec" column holding that
# document's vector object.
vdf = pd.DataFrame.from_dict(data=bigdct, orient='index')

In [8]:
# Pairwise cosine similarity between every pair of document vectors.
# Result layout: dct[id_a][id_b] = 1 - cosine_distance(vec_a, vec_b), i.e. a
# similarity (1.0 for vectors pointing in the same direction), not a distance.
#
# The vectors are read by column label ("vec") instead of positionally via
# row[0]: using an integer key on a label-indexed Series relies on a
# positional fallback that recent pandas versions deprecate. Pulling the
# column out once also avoids building a fresh Series per row on every
# iteration of two nested iterrows() loops.
vectors = vdf["vec"].to_dict()  # {document id: vector}
dct = {
    id_a: {
        id_b: 1 - spatial.distance.cosine(vec_a, vec_b)
        for id_b, vec_b in vectors.items()
    }
    for id_a, vec_a in vectors.items()
}

In [9]:
# Square document-by-document table built from the nested dict: outer keys
# become columns, inner keys become the row index.
# NOTE(review): despite the name, the values come from `dct`, which this
# notebook fills with 1 - cosine distance (a similarity) -- confirm that
# downstream consumers expect similarities.
distance_df = pd.DataFrame(data=dct)

In [11]:
# Persist the pairwise table (index included) as CSV under data/.
distance_df.to_csv(path_or_buf="data/doc2vec_cosine_distance.csv")