# Import modules

In [2]:
import os
import sys
from itertools import combinations

import gensim
import networkx as nx
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import StandardScaler

In [3]:
!python -V
!pip freeze

Python 3.10.12
absl-py==1.4.0
accelerate==1.2.1
aiohappyeyeballs==2.4.4
aiohttp==3.11.10
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.19
albumentations==1.4.20
altair==5.5.0
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.20.0
astropy==6.1.7
astropy-iers-data==0.2024.12.16.0.35.48
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==24.3.0
audioread==3.0.1
autograd==1.7.0
babel==2.16.0
backcall==0.2.0
beautifulsoup4==4.12.3
bigframes==1.29.0
bigquery-magics==0.4.0
bleach==6.2.0
blinker==1.9.0
blis==0.7.11
blosc2==2.7.1
bokeh==3.6.2
Bottleneck==1.4.2
bqplot==0.12.43
branca==0.8.1
CacheControl==0.14.1
cachetools==5.5.0
catalogue==2.0.10
certifi==2024.12.14
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
chex==0.1.88
clarabel==0.9.0
click==8.1.7
cloudpathlib==0.20.0
cloudpickle==3.1.0
cmake==3.31.2
cmdstanpy==1.2.5
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0

# Load a CSV file listing presentation abstracts
`presentations_raw.csv` contains the abstract texts. The file is not distributed with this notebook because the abstracts are confidential.

In [4]:
# Read the raw presentation table from the working directory.
# The frame is intentionally not displayed here: it still holds the
# unredacted abstract texts at this point.
df_presentations = pd.read_csv(filepath_or_buffer='presentations_raw.csv')

# Embed abstract texts into vectors

In [5]:
# Fixed seed plus a single worker thread make Doc2Vec training repeatable;
# with several workers, thread scheduling reorders updates and the vectors
# change on every run. NOTE: reproducibility across interpreter launches
# additionally requires PYTHONHASHSEED to be set before the kernel starts.
DOC2VEC_SEED = 42

# Tokenise every abstract once; the same tokens feed both training and inference
tokenized_abstracts = [
    gensim.utils.simple_preprocess(text)
    for text in df_presentations['presentation_abstract']
]
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenized_abstracts)]

# Train Doc2Vec model
doc2vec_model = Doc2Vec(
    vector_size=50,
    window=2,
    min_count=1,
    workers=1,
    epochs=40,
    seed=DOC2VEC_SEED,
)
doc2vec_model.build_vocab(documents)
doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Get vectors corresponding to each presentation
vectors = [doc2vec_model.infer_vector(tokens) for tokens in tokenized_abstracts]

# Column names come from the model's dimensionality (not from vectors[0]), and
# the frame reuses the presentations' index so the column-wise concat below
# aligns row for row whatever index df_presentations carries.
vector_df = pd.DataFrame(
    vectors,
    columns=[f'vector_{i}' for i in range(doc2vec_model.vector_size)],
    index=df_presentations.index,
)

# Drop vector columns left by a previous run of this cell so that re-running
# it replaces them instead of appending duplicate column names.
df_presentations = pd.concat(
    [df_presentations.drop(columns=vector_df.columns, errors='ignore'), vector_df],
    axis=1,
)

# Hide the raw text of abstracts and titles

In [6]:
# Overwrite both free-text columns with a placeholder so that neither the
# preview below nor anything written out later carries the confidential text.
for text_column in ('presentation_abstract', 'presentation_title'):
    df_presentations[text_column] = '***'
df_presentations.head()

Unnamed: 0,presentation_id,presentation_number,presentation_title,presentation_abstract,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,...,vector_40,vector_41,vector_42,vector_43,vector_44,vector_45,vector_46,vector_47,vector_48,vector_49
0,Poster8-01,P0270,***,***,0.522294,0.789739,-1.209882,-1.516428,1.027858,-0.618111,...,1.980572,-0.361716,-1.204104,0.751821,0.322392,0.004455,-0.069256,0.899388,1.017874,1.915452
1,Poster8-02,P0271,***,***,-0.012453,0.45863,-0.87041,-2.88884,-1.144509,0.045338,...,-0.13223,0.465507,-1.732241,0.661797,-0.294142,1.1028,1.422017,-0.104653,1.613843,1.813473
2,Poster8-03,P0272,***,***,0.293812,0.083503,1.651221,-2.299217,-0.458393,-2.4243,...,-0.378288,0.572768,-0.177905,0.059639,0.115081,-0.656539,0.34929,1.979954,-2.264795,1.400988
3,Poster8-04,P0273,***,***,-0.377429,-0.075077,-0.757414,-1.560815,0.822041,-1.624452,...,-0.780953,-1.246458,0.692005,-1.136234,1.178934,-0.344716,2.21096,1.30212,1.917406,0.835775
4,1-1-02,1-1-02,***,***,0.522241,1.485969,-0.302269,-3.227962,1.852223,-0.673345,...,-2.326968,2.053934,-1.029431,2.096001,0.850973,0.1431,1.185468,2.130687,1.546834,0.446001


# Save for the next analysis

In [7]:
# Write the redacted table together with its embedding columns to disk;
# the row index is not written to the file.
df_presentations.to_csv(path_or_buf='presentations_vectors.csv', index=False)