Use case test 3: Can links be found relating to predispositions for ASD co-morbidities?

1. Count how often each gene is cited in the data across all articles, highlighting the most-cited genes overall (those with the most connections)

In [1]:
import rdflib

# Load the cleaned knowledge graph from its N-Triples file into memory.
filename = "cleaned_maingraph.nt"
g = rdflib.Graph()
g.parse(filename, format="nt")

# For each object of the BIOLINK:Gene predicate, count the matching ?subject
# bindings and keep the ten largest counts. Only the BIOLINK prefix is used by
# the pattern; the other prefixes are declared but not referenced.
count_query = """
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>

    SELECT ?gene (COUNT(?subject) AS ?count)
    WHERE {
        ?subject BIOLINK:Gene ?gene .
    }
    GROUP BY ?gene
    ORDER BY DESC(?count)
    LIMIT 10
"""

results = g.query(count_query)

# Plain-text table: two header lines, then one "<gene> | <count>" line per row.
for header_line in ("Gene | Count", "-----|------"):
    print(header_line)
for row in results:
    gene, count = str(row['gene']), int(row['count'].toPython())
    print(f"{gene} | {count}")

Gene | Count
-----|------
CIRBP | 39
NRXN1 | 28
MGAT4C | 26
RAB3C | 25
BCYRN1 | 25
KCND2 | 23
MALAT1 | 23
ERBB4 | 22
PTMA | 21
KAZN | 21


In [None]:

import rdflib

# Reload the cleaned knowledge graph so this cell can be run on its own.
filename = "cleaned_maingraph.nt"
g = rdflib.Graph()
g.parse(filename, format="nt")

# Gene count with identifier fallback.
#
# The inner SELECT yields one gene value per ?subject, trying the predicates
# in priority order BIOLINK:Gene -> BIOLINK:symbol -> ENSEMBL:id -> NCBIGENE:id:
# a lower-priority branch only matches when the subject has none of the
# higher-priority predicates. SAMPLE picks one value when a subject has
# several under the same predicate. The outer SELECT then counts distinct
# subjects per gene value.
#
# Fixes relative to the previous version of this query:
#   * Inside FILTER NOT EXISTS, every UNION operand is now wrapped in its own
#     `{ ... }` group. SPARQL's UNION joins group graph patterns, so the bare
#     `triple UNION triple` form was a syntax error and the query never ran.
#   * LIMIT 10 added, matching the first count query, because the next cell
#     turns every row of `results` into Monarch API lookups. Remove the LIMIT
#     to list every gene.
count_query = """
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>

    SELECT ?gene (COUNT(DISTINCT ?subject) AS ?count)
    WHERE {
        {
            SELECT ?subject (SAMPLE(?gene_value) AS ?gene)
            WHERE {
                {
                    ?subject BIOLINK:Gene ?gene_value .
                }
                UNION
                {
                    ?subject BIOLINK:symbol ?gene_value .
                    FILTER NOT EXISTS { ?subject BIOLINK:Gene ?any_gene }
                }
                UNION
                {
                    ?subject ENSEMBL:id ?gene_value .
                    FILTER NOT EXISTS {
                        { ?subject BIOLINK:Gene ?any_gene }
                        UNION
                        { ?subject BIOLINK:symbol ?any_symbol }
                    }
                }
                UNION
                {
                    ?subject NCBIGENE:id ?gene_value .
                    FILTER NOT EXISTS {
                        { ?subject BIOLINK:Gene ?any_gene }
                        UNION
                        { ?subject BIOLINK:symbol ?any_symbol }
                        UNION
                        { ?subject ENSEMBL:id ?any_ensembl }
                    }
                }
            }
            GROUP BY ?subject
        }
    }
    GROUP BY ?gene
    ORDER BY DESC(?count)
    LIMIT 10
"""

results = g.query(count_query)

# Plain-text table: one "<gene> | <count>" line per result row.
print("Gene | Count")
print("-----|------")
for row in results:
    gene = str(row['gene'])
    count = int(row['count'].toPython())
    print(f"{gene} | {count}")
              

2. Query the external Monarch Initiative graph through its API for phenotypes related to these top-cited genes

In [2]:
import requests
import pandas as pd
from collections import defaultdict

# Gene values (as plain strings) from the rows of the count query above, in
# result order, i.e. the most common genes from the KG.
# NOTE: `results` is rebound to a dict at the end of this cell, so re-running
# this cell requires re-running the count query cell first.
top_genes = []
for row in results:
    top_genes.append(str(row['gene']))

#converting gene names to Monarch's gene number
def get_gene_id(gene_name, timeout=30):
    """Look up the Monarch identifier of the best search hit for a gene name.

    Sends one request to the Monarch v3 search endpoint, restricted to the
    ``biolink:Gene`` category and to a single result.

    Args:
        gene_name: Text to search for (a gene symbol/name).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``id`` of the first returned item, or ``None`` when the request
        fails, the response is not valid JSON, or no usable item is returned.
        Failures are reported with ``print`` rather than raised.
    """
    search_url = "https://api-v3.monarchinitiative.org/v3/api/search"
    params = {
        "q": gene_name,
        "category": "biolink:Gene",
        "limit": 1
    }
    try:
        response = requests.get(search_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while searching for {gene_name}: {e}")
        return None
    except ValueError as e:
        # Body was not JSON (older requests versions raise a plain ValueError).
        print(f"An error occurred while searching for {gene_name}: {e}")
        return None

    # A missing or null 'items' key means "no hit", not a crash.
    items = data.get('items') if isinstance(data, dict) else None
    if items:
        return items[0].get('id')
    return None

#getting all associations from monarch api
def get_associations(gene_id, association_type, page_size=100, timeout=30, max_pages=50):
    """Fetch the associations of one gene for one predicate from Monarch.

    Requests the Monarch v3 association endpoint page by page (``offset`` /
    ``limit``) until a page comes back with fewer than ``page_size`` items, so
    genes with more than one page of associations are no longer truncated to
    the first 100.

    Args:
        gene_id: Identifier sent as the ``subject`` query parameter.
        association_type: Predicate sent as the ``predicate`` query parameter,
            e.g. ``"biolink:has_phenotype"``.
        page_size: Number of items requested per call.
        timeout: Seconds to wait for each HTTP request.
        max_pages: Upper bound on the number of requests, as a guard against
            looping forever if the server were to ignore ``offset``.

    Returns:
        A list of the ``items`` entries from all fetched pages. If a request
        fails, the error is printed and the items collected so far are
        returned (an empty list when the first request fails).
    """
    api_url = "https://api-v3.monarchinitiative.org/v3/api/association"
    collected = []
    offset = 0
    for _ in range(max_pages):
        params = {
            "subject": gene_id,
            "predicate": association_type,
            "offset": offset,
            "limit": page_size
        }
        try:
            response = requests.get(api_url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"An error occurred: {e}")
            break

        # Treat a missing or null 'items' as an empty page.
        page = (data.get('items') if isinstance(data, dict) else None) or []
        collected.extend(page)
        if len(page) < page_size:
            break  # short page: nothing further to fetch
        offset += page_size
    return collected

def get_gene_info(gene_name):
    """Collect Monarch association labels for one gene, grouped by category.

    Resolves ``gene_name`` with ``get_gene_id`` and then calls
    ``get_associations`` once per predicate.

    Returns:
        A dict with the keys 'Phenotypes', 'Diseases', 'Biological Processes'
        and 'Molecular Functions', each mapping to the list of
        ``object_label`` values of the returned associations ('Unknown' when
        an item has no such key). An empty dict is returned, after printing a
        message, when no ID could be found for the gene.
    """
    gene_id = get_gene_id(gene_name)
    if not gene_id:
        print(f"Could not find ID for gene {gene_name}")
        return {}

    # Result key -> predicate used to fetch it. Dict order fixes both the
    # order of the API calls and the key order of the returned dict.
    predicate_by_category = {
        'Phenotypes': "biolink:has_phenotype",
        'Diseases': "biolink:gene_associated_with_condition",
        'Biological Processes': "biolink:participates_in",
        'Molecular Functions': "biolink:enables",
    }

    gene_info = {}
    for category, predicate in predicate_by_category.items():
        labels = []
        for association in get_associations(gene_id, predicate):
            labels.append(association.get('object_label', 'Unknown'))
        gene_info[category] = labels
    return gene_info

# Map each top gene to its association summary from get_gene_info, keeping the
# order of `top_genes`. Note that this rebinds `results`, which until now held
# the SPARQL query result.
results = {gene_name: get_gene_info(gene_name) for gene_name in top_genes}




Molecular Functions:
Molecular Functions                                         Genes
    protein binding CIRBP, ERBB4, KAZN, KCND2, NRXN1, PTMA, RAB3C
  metal ion binding                          KCND2, MGAT4C, NRXN1

Unique associations for each gene:

CIRBP:
  Molecular Functions:
    - RNA binding
    - mRNA 3'-UTR binding
    - small ribosomal subunit rRNA binding
    - translation repressor activity

NRXN1:
  Phenotypes:
    - Autosomal recessive inheritance
    - Broad-based gait
    - Constipation
    - Developmental regression
    - Drooling
    - Epileptic encephalopathy
    - Feeding difficulties
    - Gastroesophageal reflux
    - Generalized hypotonia
    - Hyperventilation
    - Hypotonia
    - Intellectual disability, severe
    - Protruding tongue
    - Pulmonic stenosis
    - Scoliosis
    - Strabismus
    - Wide mouth
  Diseases:
    - NRXN1-related severe neurodevelopmental disorder-motor stereotypies-chronic constipation-sleep-wake cycle disturbance
  Biological P

In [None]:
"""
# Create a summary of shared associations between the given genes (using set to remove duplicates)
shared_associations = defaultdict(lambda: defaultdict(set))
for category in ['Phenotypes', 'Diseases', 'Biological Processes', 'Molecular Functions']:
    all_items = set()
    for gene, data in results.items():
        all_items.update(data.get(category, []))
    
    for item in all_items:
        genes_with_item = set(gene for gene, data in results.items() if item in data.get(category, []))
        if len(genes_with_item) > 1:
            shared_associations[category][item] = genes_with_item

for category, items in shared_associations.items():
    print(f"\n{category}:")
    if items:
        df = pd.DataFrame([(item, ', '.join(sorted(genes))) for item, genes in items.items()], 
                          columns=[category, 'Genes'])
        print(df.to_string(index=False))
    else:
        print("No shared associations found.")

# Print unique associations for each gene
print("\nUnique associations for each gene:")
for gene, data in results.items():
    print(f"\n{gene}:")
    for category, items in data.items():
        unique_items = set(item for item in items if len(shared_associations[category].get(item, [])) <= 1)
        if unique_items:
            print(f"  {category}:")
            for item in sorted(unique_items):
                print(f"    - {item}")
                """

In [None]:
from collections import defaultdict

# numpy and matplotlib are used below but were not imported anywhere in the
# notebook, so this cell failed with NameError on a fresh kernel.
import matplotlib.pyplot as plt
import numpy as np

# counts[category][gene] -> number of associations returned for that gene.
# Genes whose lookup returned an empty dict have no entry and read back as 0
# through the inner defaultdict(int), so they are drawn as zero-height bars.
counts = defaultdict(lambda: defaultdict(int))
for gene, info in results.items():
    for category, items in info.items():
        counts[category][gene] = len(items)

categories = list(counts.keys())
genes = list(results.keys())

if not genes or not categories:
    # Nothing to draw; this also avoids dividing by len(genes) == 0 below.
    print("No gene associations available to plot.")
else:
    # Grouped bar chart: one group per category, one bar per gene.
    x = np.arange(len(categories))
    width = 0.8 / len(genes)  # each group spans 0.8 of a category slot

    fig, ax = plt.subplots(figsize=(12, 6))
    for i, gene in enumerate(genes):
        gene_counts = [counts[category][gene] for category in categories]
        ax.bar(x + i*width, gene_counts, width, label=gene)

    ax.set_ylabel('Count')
    ax.set_title('Gene Associations by Category')
    # Centre each tick under its group of bars.
    ax.set_xticks(x + width * (len(genes) - 1) / 2)
    ax.set_xticklabels(categories)
    ax.legend(title='Genes', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()