# Doc2Vec on ArXiv metadata

Testing the Doc2Vec model on ArXiv metadata subset.

In [3]:
import pandas as pd
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from os.path import expanduser

In [2]:
df = pd.read_json(expanduser('~/data/arxiv_snapshot/arxiv-metadata-oai-snapshot-subset.json'), lines=True)

# Preprocessing

In [3]:
# Lowercase and split
df['title'] = df['title'].str.lower().str.split()
df['abstract'] = df['abstract'].str.lower().str.split()

# Remove empty strings
df['title'] = df['title'].apply(lambda x: [i for i in x if i != ''])
df['abstract'] = df['abstract'].apply(lambda x: [i for i in x if i != ''])

# Strip newlines
df['title'] = df['title'].apply(lambda x: [x.strip() for x in x])
df['abstract'] = df['abstract'].apply(lambda x: [x.strip() for x in x])

In [4]:
# Get our tags and documents ready
docs = df['title'].tolist() + df['abstract'].tolist()
tags = df['id'].tolist()

tagged_docs = [TaggedDocument(doc, [tag]) for doc, tag in zip(docs, tags)]

# Model training

In [5]:
# Model parameters
doc2vec_params = {
    'vector_size': 512,
    'window': 10,
    'min_count': 1,
    'workers': 10
}

# Initialize the model
model = Doc2Vec(tagged_docs, **doc2vec_params)

In [None]:
# I swear Gensim decides at random if it wants the vocabulary before training

# train the model:
model.train(tagged_docs, total_examples=model.corpus_count, epochs=15)

In [6]:
# if it complains about the vocabulary:
model.build_vocab(tagged_docs)
model.train(tagged_docs, total_examples=model.corpus_count, epochs=15)

# Model evaluation

In [12]:
# Test the model by finding similar documents
sample_doc = df.sample(1)[['id', 'title', 'abstract']]
sample_doc.T

Unnamed: 0,12669
id,1912.12397
title,"[natural, language, processing, of, mimic-iii,..."
abstract,"[coding, diagnosis, and, procedures, in, medic..."


In [13]:
# Show the vectorized representation of the sample document
model.dv[sample_doc['id'].values[0]]

array([-3.54877044e-03,  1.63125247e-02,  1.61897503e-02,  4.75315098e-03,
       -4.33648527e-02,  1.64318439e-02, -2.05208920e-03,  5.33237755e-02,
        2.31122971e-03,  2.25703903e-02, -1.79683715e-02,  1.02105420e-02,
        4.01576906e-02, -1.74945910e-02,  2.74510700e-02,  4.13589627e-02,
       -4.22500446e-02,  5.42162135e-02, -2.37965886e-03, -3.81384604e-03,
        8.50930624e-03, -5.03005721e-02, -3.18076201e-02,  4.94432449e-03,
       -7.93076009e-02,  1.30698485e-02,  1.79558266e-02,  8.96868184e-02,
        2.99306121e-02, -1.68529041e-02, -4.82734255e-02,  6.89855888e-02,
       -2.63926666e-02,  2.21179649e-02,  2.37785894e-02, -2.39662491e-02,
       -4.80487607e-02,  2.24499591e-02,  8.20685923e-02, -4.48093228e-02,
        1.76392309e-02, -2.56158803e-02,  8.98221508e-02, -2.95773670e-02,
        1.32946912e-02, -3.84656526e-02,  2.70279553e-02,  1.28889069e-01,
       -5.38015962e-02, -8.77810083e-03,  6.64130598e-02,  3.34087200e-02,
        1.91011280e-02,  

In [14]:
# Average the vectors of the title + abstract and uses cosine similarity to find similar documents
similar_docs_to_sample = model.dv.most_similar(sample_doc['id'].values[0])

similar_docs_to_sample

[('2101.05326', 0.716742753982544),
 ('1812.06613', 0.6889414191246033),
 ('2212.08821', 0.6858463883399963),
 ('1707.08401', 0.6820381283760071),
 ('1806.05798', 0.662753164768219),
 ('2105.13137', 0.6583074927330017),
 ('2312.06914', 0.6539578437805176),
 ('2107.14070', 0.649836540222168),
 ('2203.07731', 0.6452046632766724),
 ('2004.08333', 0.6438530683517456)]

In [15]:
# keep only the ids
similar_docs_to_sample_ids = [i for i, _ in similar_docs_to_sample]

# Get the actual documents
similar_docs = df[df['id'].isin(similar_docs_to_sample_ids)][['id', 'title', 'abstract']]

for i, row in similar_docs.iterrows():
    print(row['id'])
    print(' '.join(row['title']))
    # print()
    # print(' '.join(row['abstract']))
    print('====================')

1707.08401
detecting and classifying lesions in mammograms with deep learning
1806.05798
satr-dl: improving surgical skill assessment and task recognition in robot-assisted surgery with deep neural networks
1812.06613
voiceprint recognition of parkinson patients based on deep learning
2004.08333
natural language processing with deep learning for medical adverse event detection from free-text medical narratives: a case study of detecting total hip replacement dislocation
2101.05326
advancing eosinophilic esophagitis diagnosis and phenotype assessment with deep learning computer vision
2105.13137
graph-based deep learning for medical diagnosis and analysis: past, present and future
2107.14070
machine learning advances aiding recognition and classification of indian monuments and landmarks
2203.07731
evaluating bert-based pre-training language models for detecting misinformation
2212.08821
context-dependent explainability and contestability for trustworthy medical artificial intelligence:

In [18]:
# Find the most similar documents to keywords
keywords = 'brain health'.split()

# Average the vectors of the keywords and uses cosine similarity to find similar documents
similar_docs_to_keywords = model.dv.most_similar([model.infer_vector(keywords)])

# Get the actual documents
similar_docs = df[df['id'].isin([i for i, _ in similar_docs_to_keywords])][['id', 'title', 'abstract']]

# Examine titles for most similar documents
for i, row in similar_docs.iterrows():
    print(row['id'])
    print(' '.join(row['title']))
    print('=======================================\n')

1408.2009
predictive genomics: a cancer hallmark network framework for predicting tumor clinical phenotypes using genome sequencing data

1708.06578
cascade and parallel convolutional recurrent neural networks on eeg-based intention recognition for brain computer interface

1911.05661
deep learning decoding of mental state in non-invasive brain computer interface

2106.09424
interpretable machine learning classifiers for brain tumour survival prediction

2107.03220
joint embedding of structural and functional brain networks with graph neural networks for mental illness diagnosis

2204.04777
multimodal machine learning in precision health

2206.05067
model-based machine learning of critical brain dynamics

2306.07519
decoding brain motor imagery with various machine learning techniques

2310.11266
emulating human cognitive processes for expert-level medical question-answering with large language models

2405.03235
cross-modal domain adaptation in brain disease diagnosis: maximum mean di

# Save files and model

In [11]:
# Save the dataframe of processed metadata
df.to_json(expanduser('~/data/arxiv_snapshot/arxiv-subset-processed.json'), orient='records', lines=True)

# Save the tagged documents
with open(expanduser('~/data/arxiv_snapshot/arxiv-subset-tagged-docs.txt'), 'w') as f:
    for doc in tagged_docs:
        f.write(f'{doc}\n')
        
# Save the model parameters
with open(expanduser('saved_models/doc2vec_params_V1'), 'w') as f:
    for key, value in doc2vec_params.items():
        f.write(f'{key}: {value}\n')

# Save the model
model.save('saved_models/doc2vec_V1.model')

# Save embeddings to a dataframe
embeddings = pd.DataFrame(model.dv.vectors, index=model.dv.index_to_key)
embeddings.to_csv(expanduser('saved_models/doc2vec_V1_embeddings.csv'))