In [29]:
import gc
import sys
import time
import json
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from imp import reload
from scipy import sparse
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.legend as mlegend
from gensim.models import Word2Vec

path = "/home/grace/walking/accelerate_discoveries" # path to the repository accelerate-discoveries
sys.path.insert(0,path)
import literature, embedding, utils, evaluation

%matplotlib inline

In [2]:
path_to_VM = "/home/grace/walking/VertMat.npz"   #  path to the vertex matrix file
VM = sparse.load_npz(path_to_VM)
path_to_matnames = "/home/grace/walking/mats_names.txt"   # path to the text file that contains our extracted material names
mats = open(path_to_matnames,'r', encoding = "utf-8").read().splitlines()
path_to_propnames = "/home/grace/walking/props_names.txt"  # path to the text file that contains properties we considered 
props = open(path_to_propnames,'r').read().splitlines()

In [3]:
# finally, time to create a hypergraph
h = literature.hypergraph(VM, mats, props)

# Running Random Walk Sampling

In [4]:
# ignore this line for now
h.Rcsr = h.R.tocsr()

# Training a Word2Vec

Now you can start playing with the output of the random walk sampling (e.g. train a word2vec on them,etc.). I will also add more stuff to this section later and share with you. 

### Preprocessing the Data

In [5]:
# load the random walk sequences
path_to_rws_L80x5M = "/home/grace/walking/nodes_L80_5M.txt"    # this should be the path to the text file that includes the random walk sequences 
rws_L80x5M = open(path_to_rws_L80x5M,'r', encoding = "utf-8").read().splitlines()

In [6]:
# the first step, would be to remove the author nodes from the node sequences
pruned_rws_L80x5M = utils.remove_authors_from_RW(rws_L80x5M)

# after removing authors, some sequences might have only one remaining node
# we remove those so that the resulting random walks would have at least two nodes (at least one step)
pruned_rws_L80x5M = [x for x in pruned_rws_L80x5M if len(x.split(' '))>1]

In [7]:
print("Number of all random walks: {}".format(len(rws_L80x5M)))
print("Number of random walks after removing authors: {}".format(len(pruned_rws_L80x5M)))
print("\nExample random walk: {}".format(pruned_rws_L80x5M[100]))

Number of all random walks: 23278
Number of random walks after removing authors: 23071

Example random walk: functional__groups/imine functional__groups/aromatic methylphenidate Beacon Beacon functional__role/dye disodium pro rifampicin clarithromycin isoniazid estrogen thymoquinone MDA amino__acid heme Hydrogen diphenhydramine__HCl acetaminophen HeH 2PA pyrene spermine xylose mannitol atropine Calcium cGMP biflavone C18:3 NaCl fructose imidacloprid γ-aminobutyric__acid citrate


In [8]:
# save it somewhere, because we will use the path to create the embedding model
path_to_pruned_rws_L80x5M = "/home/grace/walking/pruned_rws_L80x5M.txt" #file path to save the pruned random walks (author-less sequences)
open(path_to_pruned_rws_L80x5M,'w', encoding = "utf-8").write('\n'.join(pruned_rws_L80x5M)+'\n') # returns total number of lines
#open: 'w' open for writing

6832132

### Creating the Embedding Model

In [9]:
# instantiate a deepwalk-word2vec object: 
# we need to input the path to pruned random walks, the model will itself load the file later on
M = embedding.dww2v(path_to_pruned_rws_L80x5M,depth=0)

In [10]:
# if we don't provide any optional inputs, all parameters take their default values
M.pars

{'depth': 0,
 'phrase_min_count': 10,
 'phrase_threshold': 15,
 'size': 200,
 'window': 8,
 'min_count': 5,
 'sg': True,
 'hs': True,
 'workers': 20,
 'negative': 15,
 'start_alpha': 0.001,
 'end_alpha': 0.0001,
 'subsample': 0.0001,
 'batch': 5000,
 'epochs': 5}

In [11]:
# these few lines are just to specify how pythpn will display/store the logs
M.logger.handlers = []
logging.root.handlers = [logging.StreamHandler()]
logging.root.setLevel(logging.INFO)

Now let's create the embedding model. At this step, the model will use the following parameters:
* `depth`: a parameter specifying the number of tokens that can be included in a phrase. Note that in regular texts, it makes a lot of sense to work with phrases of various lengths, e.g., "bank_of_america", which is a phrase of length 3, but in deepwalk it is not as clear if phrasing will be useful. However, in our initial experiments, we found out that phrasing with `depth=2` improves the result. Setting `depth=0` (by running `embedding.dww2v(path_to_pruned_rws, depth=0)` at the time of instantiation or running `M.pars['depth']=0` after instantiation) will skip the phrasing step.
* `phrase_min_count`: A parameter that tells the function to ignore all words and bigrams with total collected count lower than this value
* `phrase_threshold`: Represents a score threshold for forming the phrases (higher means fewer phrases). This parameter is tightly connected to the built-in functions that gensim use to perform phrasing.

In [12]:
# this will take a few minutes, and display many lines of information
M.build_model()
path_to_M_L80x5M = "/home/grace/walking/M_L80x5M.model"
M.save_model(path_to_M_L80x5M)

Parsing lines (sentences) in: /home/grace/walking/pruned_rws_L80x5M.txt: 
Parameters for parsing phrases are as follows:
	depth: 0
	phrase_min_count: 10
	phrase_threshold: 15
Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=200, alpha=0.001)', 'datetime': '2022-06-14T21:45:48.826594', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  5 2022, 06:56:58) \n[GCC 7.5.0]', 'platform': 'Linux-5.13.0-48-generic-x86_64-with-glibc2.34', 'event': 'created'}
collecting all words and their counts
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 251607 words, keeping 33617 word types
PROGRESS: at sentence #20000, processed 503066 words, keeping 48907 word types
collected 52596 word types from a corpus of 580812 raw words and 23071 sentences
Creating a fresh vocabulary
Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 10819 unique words (20.570005323598753%% of original 52596, drops 41777)', 'datetime': '2022-06-1

In [13]:
vocab_arr = np.array(list(M.model.wv.key_to_index))
print("Number of words in the model's vocabulary: {}".format(len(vocab_arr)))
print("\nExample words:\n{}".format(vocab_arr[np.random.RandomState(10).randint(0,len(vocab_arr),20)].tolist()))

Number of words in the model's vocabulary: 10819

Example words:
['As(III)', 'BTO', 'benzoic__acid', 'coumestrol', 'perchlorates', 'phosphatidylmonomethylethanolamine', 'hydroxysteroid', 'Cr(III)', 'GEP', 'ethionine', '2-methoxyethanol', 'cystamine', 'polyhedrin', 'HPA', 'PpIX', 'Vp', 'AMP', 'chlorhexidine__gluconate', 'crystalline__silica', 'acetoacetate']


At this time, the model is not trained yet. Let's train it for a few epochs. At this point, the model will use all the other parameters
* `size`: size of the embedding vector
* `window`: size of neighborhood in selecting the nearby words
* `min_count`: the model will ignore words with counts smaller than this threshold
* `sg`: to use **skipgram** word2vec or the other variant (i.e., CBOW) 
* `hs`: to use heirarchical softmax or not (this is a trick for training the model less expensively; you might not need to know details at this time, but if you are curious see the paper)
* `workers`: number of worker threads to use for training
* `negative`: number of negative samples to be used in negative sampling (this is another popular trick in training word2vec;you might not need to know details at this time, but if you are curious see the paper).
* `start_alpha`: initial learning rate
* `end_alpha`: the ending learning rate; the learning rate starts from `start_alpha` in the beginning of the training and linearly drops to `end_alpha` as the training progresses
* `subsample`: the threshold for configuring which higher-frequency words are randomly downsampled
* `batch`: number of words in the training batches of examples 
* `epochs`: number of training epochs

In [14]:
# run the training for five epochs (default value for parameter "epochs")
# (this also takes some time and prints a lot of information)
model = Word2Vec.load(path_to_M_L80x5M)
M.train()

loading Word2Vec object from /home/grace/walking/M_L80x5M.model
loading wv recursively from /home/grace/walking/M_L80x5M.model.wv.* with mmap=None
setting ignored attribute cum_table to None
Word2Vec lifecycle event {'fname': '/home/grace/walking/M_L80x5M.model', 'datetime': '2022-06-14T21:45:49.707908', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  5 2022, 06:56:58) \n[GCC 7.5.0]', 'platform': 'Linux-5.13.0-48-generic-x86_64-with-glibc2.34', 'event': 'loaded'}
Training the model using the following parameters:
	phrase_min_count: 10
	size: 200
	window: 8
	min_count: 5
	sg: True
	hs: True
	workers: 20
	negative: 15
	start_alpha: 0.001
	end_alpha: 0.0001
	subsample: 0.0001
	batch: 5000
	epochs: 5
The model will be saved in None
Word2Vec lifecycle event {'msg': 'training model with 20 workers on 10819 vocabulary and 200 features, using sg=1 hs=1 sample=0.0001 negative=15 window=8 shrink_windows=True', 'datetime': '2022-06-14T21:45:49.716115', 'gensim': '4.1.2', 'python': '3.9.12 (main,

# Evaluating the Trained Model

In [15]:
props = {'state': ['crystalline', 'semicrystalline', 'amorphous', 'liquid'],
         'composition': ['organic', 'hybrid', 'inorganic', 'metallic'],
         'conjugation': ['conjugated', 'non-conjugated'],
         'porosity': ['microporous', 'mesoporous', 'non-porous'],
         'form': ['particles', 'nanoparticles', 'solid', 'liquid'],
         'hydrophilicity': ['hydrophobic', 'hydrophilic'],
         'dispersibility': ['dispersible',
                            'poorly__dispersible',
                            'non-dispersible',
                            'soluble'],
         'conductivity': ['conducting', 'semi-conducting', 'insulating'],
         'functional__role': ['light__absorber',
                              'cocatalyst',
                              'electron__scavenger',
                              'hole__scavenger',
                              'dye',
                              'solvent',
                              'surfactant',
                              'her__catalyst',
                              'oer__catalyst',
                              'overall__ws__catalyst',
                              'ph__modifier',
                              'dielectric__modifier'],
         'water__solubility': ['soluble', 'insoluble', 'sparingly__soluble'],
         'functional__groups': ['aromatic',
                                'aliphatic',
                                'carboxylic__acid',
                                'ester',
                                'imine',
                                'sulfone',
                                'triazine']
        }

## Evaluating Effect of Parameters

### Example Property Infers

In [77]:
def dict_to_list(dictionary):
    prop_infers_list = [dictionary[x] for x in dictionary]
    return prop_infers_list
def preturb_infers(size = 200, window = 8, workers = 20, negative = 15, epochs = 5):
    M.pars['size'] = size
    M.pars['window'] = window
    M.pars['workers'] = workers
    M.pars['negative'] = negative
    M.pars['epochs'] = epochs
    M.train()
    
    infers = []
    for mat in ['NaCl','(1)H','glucose','acetone']:
        scores, prop_infers = evaluation.infer_props(M.model, mat, props)
#         print(*dict_to_list(prop_infers), sep="\n")
#         print("---")
        infers.append(dict_to_list(prop_infers))
    return(infers)

mat_prop_tuples = [(x,y) for x in ["NaCl","H","glucose","acetone"]\
                  for y in ["state","composition","conjugation","porosity","form","hydrophilicity","dispersibiity","conductivity","functional__role","water__solubility","functional__groups"]]
def infers_df(parameter, preturb_arange):
    df = pd.DataFrame(index=pd.MultiIndex.from_tuples(mat_prop_tuples, names = ["mat","prop"]))
    for i in preturb_arange:
        infers = preturb_infers(parameter = i)
        df[parameter + " = " + str(i)] = [prop for mat in infers for prop in mat] # turn list of lists into list
    return df

In [67]:
def echo(size = 2, window = 4):
    echo_size = size
    echo_window = window
    return(echo_size, echo_window)

# echo()
# echo(size = 5)


x = "size"
# echo(vars()["x"] = 3)
exec("%s = %d" % (x, 50))
print(x)
size
echo(x)

# x = 'size'

# preturb_infers(size = 2)
# preturb_infers(x = 2)
# preturb_infers(vars()["x"] = 2)
# preturb_infers(exec("%s" % (x)) = 2)

size


('size', 4)

In [71]:
mat_prop_tuples = [(x,y) for x in ["NaCl","H","glucose","acetone"]\
                  for y in ["state","composition","conjugation","porosity","form","hydrophilicity","dispersibiity","conductivity","functional__role","water__solubility","functional__groups"]]
df_size = pd.DataFrame(index=pd.MultiIndex.from_tuples(mat_prop_tuples, names = ["mat","prop"]))
for i in np.arange(100,200+50,50):
    infers = preturb_infers(size = i)
    df_size[i] = [prop for mat in infers for prop in mat] # turn lol into list
df_size

Training the model using the following parameters:
	phrase_min_count: 10
	size: 100
	window: 8
	min_count: 5
	sg: True
	hs: True
	workers: 20
	negative: 15
	start_alpha: 0.001
	end_alpha: 0.0001
	subsample: 0.0001
	batch: 5000
	epochs: 5
The model will be saved in None
Effective 'alpha' higher than previous training cycles
Word2Vec lifecycle event {'msg': 'training model with 20 workers on 10819 vocabulary and 200 features, using sg=1 hs=1 sample=0.0001 negative=15 window=8 shrink_windows=True', 'datetime': '2022-06-14T22:42:20.941171', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  5 2022, 06:56:58) \n[GCC 7.5.0]', 'platform': 'Linux-5.13.0-48-generic-x86_64-with-glibc2.34', 'event': 'train'}
EPOCH 1 - PROGRESS: at 22.47% examples, 66982 words/s, in_qsize 39, out_qsize 1
EPOCH 1 - PROGRESS: at 51.60% examples, 76674 words/s, in_qsize 39, out_qsize 0
EPOCH 1 - PROGRESS: at 82.46% examples, 81939 words/s, in_qsize 21, out_qsize 0
worker thread finished; awaiting finish of 19 more thre

Unnamed: 0_level_0,Unnamed: 1_level_0,100,150,200
mat,prop,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NaCl,state,liquid,amorphous,amorphous
NaCl,composition,organic,organic,organic
NaCl,conjugation,conjugated,conjugated,non-conjugated
NaCl,porosity,mesoporous,mesoporous,mesoporous
NaCl,form,nanoparticles,nanoparticles,nanoparticles
NaCl,hydrophilicity,hydrophobic,hydrophobic,hydrophobic
NaCl,dispersibiity,non-dispersible,non-dispersible,non-dispersible
NaCl,conductivity,semi-conducting,semi-conducting,semi-conducting
NaCl,functional__role,solvent,solvent,solvent
NaCl,water__solubility,insoluble,soluble,insoluble


In [72]:
df_window = pd.DataFrame(index=pd.MultiIndex.from_tuples(mat_prop_tuples, names = ["mat","prop"]))
for i in np.arange(8,20,2):
    infers = preturb_infers(size = i)
    df_window[i] = [prop for mat in infers for prop in mat] # turn lol into list
df_window

Training the model using the following parameters:
	phrase_min_count: 10
	size: 8
	window: 8
	min_count: 5
	sg: True
	hs: True
	workers: 20
	negative: 15
	start_alpha: 0.001
	end_alpha: 0.0001
	subsample: 0.0001
	batch: 5000
	epochs: 5
The model will be saved in None
Effective 'alpha' higher than previous training cycles
Word2Vec lifecycle event {'msg': 'training model with 20 workers on 10819 vocabulary and 200 features, using sg=1 hs=1 sample=0.0001 negative=15 window=8 shrink_windows=True', 'datetime': '2022-06-14T22:44:02.668219', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  5 2022, 06:56:58) \n[GCC 7.5.0]', 'platform': 'Linux-5.13.0-48-generic-x86_64-with-glibc2.34', 'event': 'train'}
EPOCH 1 - PROGRESS: at 21.52% examples, 65837 words/s, in_qsize 40, out_qsize 1
EPOCH 1 - PROGRESS: at 58.41% examples, 89264 words/s, in_qsize 39, out_qsize 0
worker thread finished; awaiting finish of 19 more threads
worker thread finished; awaiting finish of 18 more threads
worker thread finis

Unnamed: 0_level_0,Unnamed: 1_level_0,8,10,12,14,16,18
mat,prop,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NaCl,state,amorphous,amorphous,amorphous,liquid,amorphous,liquid
NaCl,composition,inorganic,inorganic,inorganic,inorganic,inorganic,inorganic
NaCl,conjugation,non-conjugated,non-conjugated,non-conjugated,non-conjugated,non-conjugated,non-conjugated
NaCl,porosity,mesoporous,mesoporous,mesoporous,mesoporous,microporous,microporous
NaCl,form,liquid,liquid,particles,particles,particles,nanoparticles
NaCl,hydrophilicity,hydrophobic,hydrophobic,hydrophobic,hydrophobic,hydrophobic,hydrophobic
NaCl,dispersibiity,non-dispersible,non-dispersible,non-dispersible,non-dispersible,non-dispersible,non-dispersible
NaCl,conductivity,semi-conducting,semi-conducting,semi-conducting,semi-conducting,semi-conducting,semi-conducting
NaCl,functional__role,hole__scavenger,hole__scavenger,hole__scavenger,hole__scavenger,cocatalyst,light__absorber
NaCl,water__solubility,soluble,soluble,soluble,soluble,soluble,soluble


### Property Standard Deviations

In [None]:
def preturb_scores(samples = 5, size = 200, window = 8, workers = 20, negative = 15, epochs = 5):
    M.pars['size'] = size
    M.pars['window'] = window
    M.pars['workers'] = workers
    M.pars['negative'] = negative
    M.pars['epochs'] = epochs
    all_scores = pd.DataFrame()
    for i in range(samples):
        M.train()
        scores_dict, prop_infers = evaluation.infer_props(M.model,'NaCl',props)
        scores = []
        for value in scores_dict.values(): # getting all the numbers in the dictionary
            for v in value.values():
                scores.append(v)
        all_scores[i] = scores
    return all_scores.std(axis = 1)

In [None]:
df = pd.DataFrame()
df['test'] = preturb_scores().values
df