In [None]:
import networkx as nx
import helper

train_data = helper.load_data_from_file("graph_data.json") # dictionary of SMILES and values are graphs

graph = train_data["C[CH2-].C[CH2-].[Zn+2]"]

In [3]:
graph

<networkx.classes.graph.Graph at 0x7f36eaf33be0>

In [4]:
train_data

{'[Ag]': <networkx.classes.graph.Graph at 0x7f3708100fa0>,
 'C/C(=C\\C(=O)C(F)(F)F)/O[Al](O/C(=C\\C(=O)C(F)(F)F)/C)O/C(=C\\C(=O)C(F)(F)F)/C': <networkx.classes.graph.Graph at 0x7f3708100d00>,
 'C(=C(\\O[Al](O/C(=C\\C(=O)C(F)(F)F)/C(F)(F)F)O/C(=C\\C(=O)C(F)(F)F)/C(F)(F)F)/C(F)(F)F)\\C(=O)C(F)(F)F': <networkx.classes.graph.Graph at 0x7f3708100580>,
 'C/C(=C/C(=O)C)/O[Al](O/C(=C\\C(=O)C)/C)O/C(=C\\C(=O)C)/C': <networkx.classes.graph.Graph at 0x7f3708100ca0>,
 'CC(/C(=C/C(=O)C(C)(C)C)/O[Al](O/C(=C\\C(=O)C(C)(C)C)/C(C)(C)C)O/C(=C\\C(=O)C(C)(C)C)/C(C)(C)C)(C)C': <networkx.classes.graph.Graph at 0x7f37081001f0>,
 '[Ar]': <networkx.classes.graph.Graph at 0x7f3708100b80>,
 '[As]12[As]3[As]1[As]23': <networkx.classes.graph.Graph at 0x7f3708100d60>,
 '[AsH3]': <networkx.classes.graph.Graph at 0x7f3708100340>,
 'C[As](C)C': <networkx.classes.graph.Graph at 0x7f3708101360>,
 'C1=CC=[As]C=C1': <networkx.classes.graph.Graph at 0x7f370020dc60>,
 'F[As](F)F': <networkx.classes.graph.Graph at 0x7f370020

In [5]:
print(" ")
print(f"We have {len(train_data.keys())} molecular graphs to train with.")
print("Please adhere to best practices during training.\n")

print("The networkx graphs in the graph_data.json file are lablled by SMILES strings")
print("For more details on SMILES, visit: https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system\n")
print("Below is an example of the graph nodes called by the SMILE string C[CH2-].C[CH2-].[Zn+2].\n")

print("Graph Nodes:")
for node, data in graph.nodes(data=True):
     print(f"Node {node}: {data}")

print("\nGraph Edges:")
for u, v, data in graph.edges(data=True):  
     print(f"Edge ({u}, {v}): {data}")
print(" ")

print("Let's examine the organization of the nodes by printing node 0:")
print("Remember, the index has no inherent meaning;")
print("your results should be consistent even if these indices are permuted.")
print("Permutation Invariance!\n")

print("'atom_type', 'formal_charge' and 'orbitals' are the attributes available for node featurization, 'binding_energies' are the node labels to predict:\n")
print("\t'atom_type' represents the atomic number, or the number of protons in an atom's nucleus.")
print("\t\tThis number differentiates elements, e.g., 6 for carbon and 1 for hydrogen.")
print("\t\tThis dataset contains 55 different atom types, use a one-hot encoding vector of length 55,")
print("\t\t... or try different atom representations available in the literature (SkipAtom, Mat2Vec ...)\n")

print("\t'formal_charge' is the integer charge of the atom in the molecule.\n")

print("\t'orbitals' is a vector of the orbitals types with binding energy values.")
print("\t\tThe length of this vector is same length as the binding energies vector.\n")

print("\t'binding_energies' is a vector of the output node labels to predict.\n")

print("The binding energy data is sparse, many of the orbitals and binding energies above are assigned -1 dummy values.\n")

print("Furthermore, some graph nodes will have many binding energy values.")
print("There are even some single atom graphs in the data.")
print("For example the Xe atom (SMILE string [Xe]) has many binding energies:\n")

atom_graph = train_data["[Xe]"]
print("Graph for [Xe]:")
print(atom_graph.nodes(data=True))
print(" ")

print("We suggest two ideas for handling the multidimensional nature of the outputs in the project presentation slides.\n") 

print("For the edges, the 'bond_type' feature is categorical and given as a string.\n") 
print("\tPossible types are SINGLE, DOUBLE, TRIPLE. use a one-hot encoding for the bond-types.")
print("\tIn the example above, Node 4 is not bonded to any other atoms.") 
print("\tTherefore you should turn this data into fully connected graphs and create another 'bond_type' catergory called NONE.\n")

print("One possible way to improve the predictions is to include more features to the graphs")
print("This can be done through modifying database_2_graph.py, see the comment in line 323 of this code")
print("This would will reduce the size of the traning data as the modification will induce rdkit errors,")
print("for some molecules.\n")

print("The following google colab tutorial is good starting point for molecular graph neural networks in PyTorch:\n")

print("https://colab.research.google.com/github/chaitjo/geometric-gnn-dojo/blob/main/geometric_gnn_101.ipynb")

print("\nThe conda envrionment for this project will enable you to run the code from this tutorial on the gpu's provided\n")

print("Only the intro and Part 0 of the tutorial is appliciable for this project, as the present data does not contain") 
print("molecular geometry information.\n")

 
We have 861 molecular graphs to train with.
Please adhere to best practices during training.

The networkx graphs in the graph_data.json file are lablled by SMILES strings
For more details on SMILES, visit: https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system

Below is an example of the graph nodes called by the SMILE string C[CH2-].C[CH2-].[Zn+2].

Graph Nodes:
Node 0: {'atom_type': 'C', 'formal_charge': 0, 'orbitals': [-1], 'binding_energies': [-1]}
Node 1: {'atom_type': 'C', 'formal_charge': -1, 'orbitals': [-1], 'binding_energies': [-1]}
Node 2: {'atom_type': 'C', 'formal_charge': 0, 'orbitals': [-1], 'binding_energies': [-1]}
Node 3: {'atom_type': 'C', 'formal_charge': -1, 'orbitals': [-1], 'binding_energies': [-1]}
Node 4: {'atom_type': 'Zn', 'formal_charge': 2, 'orbitals': ['3d5/2'], 'binding_energies': [16.709]}

Graph Edges:
Edge (0, 1): {'bond_type': 'SINGLE'}
Edge (2, 3): {'bond_type': 'SINGLE'}
 
Let's examine the organization of the nodes by printin