### Guideline

* nct2doctors
* nct2citations
* nct2conditions
* cit2conditions

* nct2interventions
* cit2interventions

* nct2sponsors
* doctor2affiliations

### Prepare data

In [1]:
from graph_model_utils import *
import networkx as nx

In [3]:
# Load the lookup tables that feed build_graph. All helpers come from
# graph_model_utils (star-imported above); their exact return types are not
# visible here -- presumably dicts mapping an id to a collection of related
# ids, since build_graph calls .items() on them and iterates the values.

# One call yields both the trial->doctors and doctor->affiliations mappings.
# NOTE(review): the meaning of the leading False flag is defined in graph_model_utils -- confirm.
nct2doctors, doctor2affiliations = get_nct2doctors_doctor2affiliation(False, './data/Facility_Investigators.csv')
nct2citations = get_nct2citations('./data/Study_References.csv')
nct2conditions = get_nct2conditions('./data/Browse_Conditions.csv')
# The full condition and citation collections are only used as inputs to
# get_cit2conditions below.
all_conditions = get_all_conditions('./data/Browse_Conditions.csv')
all_citations = get_all_citations('./data/Study_References.csv')
# NOTE(review): step=5000 looks like a batch/chunk size -- confirm in graph_model_utils.
cit2conditions = get_cit2conditions(all_citations, all_conditions, step=5000)
# Sponsors are currently not loaded, so `nct2sponsors` is undefined in this
# notebook; re-enable this line before passing it to build_graph.
#nct2sponsors = get_nct2sponsors('./data/Sponsors.csv')

### Build graph

In [4]:
def _add_relation(G, mapping, source_category, target_category):
    """Add every ``source -> targets`` entry of ``mapping`` to ``G`` as weight-1 edges."""
    for source, targets in mapping.items():
        # The source is added even when it has no targets, so keys with an
        # empty collection still show up as (isolated) nodes.
        G.add_node(source, category=source_category)
        for target in targets:
            G.add_node(target, category=target_category)
            G.add_edge(source, target, weight=1)


def build_graph(nct2doctors=False, nct2citations=False, nct2conditions=False, cit2conditions=False,
                nct2interventions=False, cit2interventions=False, nct2sponsors=False, doctor2affiliation=False):
    """Build an undirected graph from the supplied relation mappings.

    Each argument is either a falsy value (relation skipped; an empty mapping
    is therefore skipped as well) or a mapping ``{source_id: iterable_of_target_ids}``.
    Every id becomes a node tagged with a ``category`` attribute and every
    (source, target) pair becomes an edge with ``weight=1``.

    Categories used:
        'NCT' trial, 'DOC' doctor, 'CIT' citation, 'DIS' condition,
        'INT' intervention, 'COM' sponsor or affiliation.

    Relations are applied in signature order. If the same id occurs under two
    different categories, the relation applied last determines its final
    ``category`` attribute.

    Parameters
    ----------
    nct2doctors, nct2citations, nct2conditions, nct2interventions, nct2sponsors
        Mappings keyed by trial id.
    cit2conditions, cit2interventions
        Mappings keyed by citation id.
    doctor2affiliation
        Mapping keyed by doctor id.

    Returns
    -------
    networkx.Graph
        The assembled graph (empty when every argument is falsy).
    """
    G = nx.Graph()
    relations = (
        (nct2doctors, 'NCT', 'DOC'),
        (nct2citations, 'NCT', 'CIT'),
        (nct2conditions, 'NCT', 'DIS'),
        (cit2conditions, 'CIT', 'DIS'),
        # These two parameters used to be accepted but silently ignored.
        # NOTE(review): 'INT' is a newly introduced category label -- confirm
        # it matches the naming used elsewhere in the project.
        (nct2interventions, 'NCT', 'INT'),
        (cit2interventions, 'CIT', 'INT'),
        (nct2sponsors, 'NCT', 'COM'),
        (doctor2affiliation, 'DOC', 'COM'),
    )
    for mapping, source_category, target_category in relations:
        if mapping:
            _add_relation(G, mapping, source_category, target_category)
    return G

In [5]:
# Build the graph from the four loaded relations only; interventions, sponsors
# and doctor affiliations are switched off (passed as False).
G = build_graph(nct2doctors, nct2citations, nct2conditions, cit2conditions, False, False, False, False)

In [6]:
# Total number of nodes in the graph.
G.number_of_nodes()

650504

In [7]:
# Total number of edges in the graph.
G.number_of_edges()

1014377

In [8]:
# Tally how many nodes carry each 'category' attribute. The four categories
# produced by the current build_graph call are pre-seeded so they always appear
# (and in this order) in the displayed dict; .get() lets any other category
# (e.g. 'COM' once sponsors/affiliations are enabled) be counted instead of
# raising KeyError.
count = {'NCT': 0, 'DIS': 0, 'CIT': 0, 'DOC': 0}
for node in G.nodes():
    category = G.nodes[node]['category']
    count[category] = count.get(category, 0) + 1

In [9]:
count

{'NCT': 243658, 'DIS': 3875, 'CIT': 288917, 'DOC': 114054}

### Find similar trials

In [10]:
def filter_neighbors(graph, node, category):
    """Return the set of direct neighbours of ``node`` whose 'category' attribute equals ``category``.

    ``graph`` must support ``graph[node]`` (iteration over neighbours) and
    ``graph.nodes[n]['category']`` lookups.
    """
    attributes = graph.nodes
    return {
        neighbour
        for neighbour in graph[node]
        if attributes[neighbour]['category'] == category
    }

def disjoint(a, b):
    """Return True when ``a`` and ``b`` share no element, False otherwise.

    Both arguments may be any iterables of hashable items. Unlike a
    length-based check, duplicates inside a single input do not affect the
    result, so lists behave the same as sets.
    """
    return set(a).isdisjoint(b)

In [11]:
def find_similar_units(graph, node, n_step, category):
    """Collect the nodes of a given category reachable from ``node`` within ``n_step`` hops.

    A breadth-first expansion is run on ``graph`` starting at ``node``; every
    node reached in at most ``n_step`` edge traversals is then filtered by its
    'category' attribute.

    Parameters
    ----------
    graph
        Graph supporting ``node in graph``, ``graph[n]`` (neighbour iteration)
        and ``graph.nodes[n]['category']``.
    node
        Starting node; must be present in ``graph``.
    n_step
        Maximum number of hops to expand. Values <= 0 yield an empty set.
    category
        Category label that returned nodes must carry.

    Returns
    -------
    set
        Matching nodes. For ``n_step >= 1`` the starting node itself is
        included when its own category matches.

    Raises
    ------
    AssertionError
        If ``node`` is not in ``graph``.
    """
    assert node in graph
    if n_step <= 0:
        # No expansion requested: nothing counts as visited, not even the start.
        return set()

    visited = {node}
    frontier = [node]
    remaining = n_step
    # Stop early once the frontier is exhausted; further rounds could not
    # discover anything new.
    while remaining > 0 and frontier:
        next_frontier = []
        for unit in frontier:
            for neighbour in graph[unit]:
                if neighbour not in visited:
                    visited.add(neighbour)
                    next_frontier.append(neighbour)
        frontier = next_frontier
        remaining -= 1

    # Filter using the graph that was passed in (not a notebook-level global).
    # NOTE: an additional filter excluding units that share a 'COM' neighbour
    # with ``node`` (via disjoint/filter_neighbors) was drafted but is not active.
    return {unit for unit in visited if graph.nodes[unit]['category'] == category}

In [27]:
# Trials ('NCT' nodes) reachable from NCT03662698 within 2 hops and within 4 hops.
# NOTE(review): `step3` is computed with n_step=4, not 3 -- confirm whether the
# name or the argument is the intended one.
step2 = find_similar_units(G,'NCT03662698', 2, 'NCT')
step3 = find_similar_units(G,'NCT03662698', 4, 'NCT')

In [20]:
nct2conditions['NCT03662698']

{'head and neck neoplasms'}

In [21]:
nct2conditions['NCT03389477']

{'carcinoma', 'carcinoma, squamous cell', 'head and neck neoplasms'}