In [1]:
import openai
import dotenv
import networkx as nx

configenv = dotenv.dotenv_values(".env")
openai_api_key = configenv["OPENAI_API_KEY"]
openai.api_key = openai_api_key

In [2]:
import openai
from scipy.spatial.distance import cosine
import copy
import networkx as nx
from pyvis.network import Network

# Function to get embeddings from OpenAI
def get_embedding(text, engine="text-embedding-ada-002"):
    response = openai.Embedding.create(input=[text], engine=engine)
    return response['data'][0]['embedding']

# Function to calculate similarity score
def calculate_similarity(text1, text2):
    # Get embeddings for the texts
    embedding1 = get_embedding(text1)
    embedding2 = get_embedding(text2)

    # Calculate cosine similarity and scale to 0-100
    similarity = (1 - cosine(embedding1, embedding2))
    return round(similarity, 2)

In [3]:
seed_word = "molecular modelling"

In [4]:
prompt = "Give me a python list of 10 strings (in one single line bounded by '[' and ']' without newline characters) related to the string {}:".format(seed_word)

In [5]:
response = openai.Completion.create(
  model="gpt-3.5-turbo-instruct",
  prompt=prompt,
  max_tokens=2000,
)

In [6]:
initial_generation = list(eval(response['choices'][0]['text'].strip(" \n")))

In [7]:
initial_generation

['protein',
 'ligand',
 'docking',
 'molecular dynamics',
 'force field',
 'simulation',
 'energy minimization',
 'conformation',
 'binding affinity',
 'structure prediction']

In [8]:
G = nx.DiGraph()
G.add_node(seed_word)
for w in initial_generation:
    G.add_node(w)
    weight = calculate_similarity(seed_word, w)
    G.add_edge(seed_word, w, weight=weight)

In [9]:
# Your initial setup code
termsL = copy.deepcopy(initial_generation)
iter_depth = 5
iterL = copy.deepcopy(initial_generation)

for i in range(iter_depth):
    for j in iterL:
        generationsL = []

        prompt = "Give me a python list of 5 strings (it must be enclosed in [ and ] and each string must be seperated by ,) related to '{}' in the context of '{}':".format(j, seed_word)
        response = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=2000,
        )

        try:
            generationsL = list(eval(response['choices'][0]['text'].strip(" \n")))
        except:
            continue
        
        # print(generationsL)
        for k in generationsL:
            if k not in termsL:
                termsL.append(k)
                G.add_node(k)
                weight = calculate_similarity(j, k)
                G.add_edge(j, k, weight=weight)
            else:
                weight = calculate_similarity(j, k)
                G.add_edge(j, k, weight=weight)
        iterL = copy.deepcopy(generationsL)


In [10]:
# Network visualization with explicit addition of nodes and edges
nt = Network(notebook=True, cdn_resources='remote')

# Add nodes and edges to Pyvis network
for node in G.nodes:
    if node == seed_word:
        nt.add_node(node, label=node, color='red')
    else:
        nt.add_node(node, label=node)

for u, v, attr in G.edges(data=True):
    # weight = attr.get('weight', calculate_similarity(u, v))
    weight = attr.get('weight', 0.5)
    title = f'Similarity: {weight}'
    nt.add_edge(u, v, title=title)

nt.show("KG_vis.html")
# print(termsL)

KG_vis.html


In [11]:
len(G.nodes)

130

In [12]:
len(G.edges)

149

In [13]:
import csv
import networkx as nx

def export_graph_to_csv(G, file_path):
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['from_node', 'to_node', 'weight'])  # Header
        for u, v, data in G.edges(data=True):
            writer.writerow([u, v, data.get('weight', 0)])  # Write edge data


In [14]:
export_graph_to_csv(G, "KG.csv")

In [15]:
def import_graph_from_csv(file_path):
    G = nx.Graph()
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)
        for row in reader:
            if len(row) == 3:
                from_node, to_node, weight = row
                G.add_edge(from_node, to_node, weight=float(weight))
    return G

# G = import_graph_from_csv("KG.csv")

In [16]:
import networkx as nx
import matplotlib.pyplot as plt

# Basic Graph Information
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Degree Analysis for Directed Graph
in_degrees = [degree for node, degree in G.in_degree()]
out_degrees = [degree for node, degree in G.out_degree()]
avg_in_degree = sum(in_degrees) / num_nodes
avg_out_degree = sum(out_degrees) / num_nodes

# Centrality Measures for Directed Graph
in_degree_centrality = nx.in_degree_centrality(G)
out_degree_centrality = nx.out_degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Most central node according to each centrality measure
most_central_by_in_degree = max(in_degree_centrality, key=in_degree_centrality.get)
most_central_by_out_degree = max(out_degree_centrality, key=out_degree_centrality.get)
most_central_by_betweenness = max(betweenness_centrality, key=betweenness_centrality.get)
most_central_by_closeness = max(closeness_centrality, key=closeness_centrality.get)

# Connected Components for Directed Graph
num_strongly_connected_components = nx.number_strongly_connected_components(G)
num_weakly_connected_components = nx.number_weakly_connected_components(G)

# Printing the insights
print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)
print("Average in-degree:", avg_in_degree)
print("Average out-degree:", avg_out_degree)
print("Most central node by in-degree:", most_central_by_in_degree)
print("Most central node by out-degree:", most_central_by_out_degree)
print("Most central node by betweenness:", most_central_by_betweenness)
print("Most central node by closeness:", most_central_by_closeness)
print("Number of strongly connected components:", num_strongly_connected_components)
print("Number of weakly connected components:", num_weakly_connected_components)


Number of nodes: 130
Number of edges: 149
Average in-degree: 1.146153846153846
Average out-degree: 1.146153846153846
Most central node by in-degree: energy minimization
Most central node by out-degree: molecular modelling
Most central node by betweenness: homology modeling
Most central node by closeness: Homology modeling
Number of strongly connected components: 128
Number of weakly connected components: 1


In [17]:
import networkx as nx

# PageRank
pagerank = nx.pagerank(G)

# HITS Algorithm
hubs, authorities = nx.hits(G)

# Check if the graph is acyclic
is_acyclic = nx.is_directed_acyclic_graph(G)

# Edge Reciprocity (for directed graphs)
if G.is_directed():
    reciprocity = nx.overall_reciprocity(G)
else:
    reciprocity = "N/A"

# Assortativity
assortativity = nx.degree_assortativity_coefficient(G)

# Diameter and Average Shortest Path Length
# These are only meaningful for strongly connected components in directed graphs
# Make G a directed graph
G = G.to_directed()
if nx.is_strongly_connected(G):
    diameter = nx.diameter(G)
    avg_shortest_path_length = nx.average_shortest_path_length(G)
else:
    largest_scc = max(nx.strongly_connected_components(G), key=len)
    subgraph = G.subgraph(largest_scc)
    # Make G a directed graph
    subgraph = subgraph.to_directed()
    if nx.is_strongly_connected(subgraph):
        diameter = nx.diameter(subgraph)
        avg_shortest_path_length = nx.average_shortest_path_length(subgraph)
    else:
        diameter = "N/A"
        avg_shortest_path_length = "N/A"

# Printing the insights
print("PageRank:", pagerank)
print("Hubs:", hubs)
print("Authorities:", authorities)
print("Is the graph acyclic:", is_acyclic)
print("Edge Reciprocity:", reciprocity)
print("Assortativity:", assortativity)
print("Diameter of largest strongly connected component:", diameter)
print("Average shortest path length of largest strongly connected component:", avg_shortest_path_length)


PageRank: {'molecular modelling': 0.006324189619668186, 'protein': 0.006844001433988488, 'ligand': 0.006863741123139892, 'docking': 0.006870321019523692, 'molecular dynamics': 0.008084384087816728, 'force field': 0.007787611560793762, 'simulation': 0.006870321019523692, 'energy minimization': 0.010419136009129611, 'conformation': 0.006870321019523692, 'binding affinity': 0.006844001433988488, 'structure prediction': 0.00685716122675609, 'Receptor': 0.007542055172431117, 'Docking': 0.007471249035642575, 'Binding Site': 0.0075137327177157, 'Pharmacophore': 0.00749957149035799, 'Dissociation Constant': 0.007428765353569448, 'protein docking': 0.007497238786033169, 'ligand docking': 0.008842291167365438, 'molecular docking': 0.008805265282616905, 'docking algorithm': 0.007497238786033169, 'docking simulation': 0.008669670904736033, 'force fields': 0.008859057513041361, 'simulation software': 0.007722739568102101, 'equations of motion': 0.007740005616848199, 'integrator algorithms': 0.00763

  A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)


In [18]:
# Sort nodes by PageRank score in descending order
sorted_nodes = sorted(G.nodes(), key=lambda x: pagerank[x], reverse=True)

# File path for the CSV
file_path = 'graph_metrics.csv'

# Writing to CSV
with open(file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Node', 'PageRank', 'Hub Score', 'Authority Score'])

    for node in sorted_nodes:
        writer.writerow([node, pagerank[node], hubs[node], authorities[node]])

print(f"Metrics written to {file_path} in descending order of PageRank score")

Metrics written to graph_metrics.csv in descending order of PageRank score
