Usecase test 3: Can links be found relating to predispositions for ASD co-morbidities?

1. Take count of genes cited in the data across all articles highlighting the most cited overall.

In [14]:
import rdflib

filename = "main_graph.nt"
g = rdflib.Graph()
g.parse(filename, format="nt")
count_query = """
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>

    SELECT ?gene (COUNT(?subject) AS ?count)
    WHERE {
        ?subject BIOLINK:symbol ?gene .
    }
    GROUP BY ?gene
    ORDER BY DESC(?count)
    LIMIT 10
"""

results = g.query(count_query)
print("Gene | Count")
print("-----|------")
for row in results:
    gene = str(row['gene'])
    count = int(row['count'].toPython()) 
    print(f"{gene} | {count}")

Gene | Count
-----|------
CIRBP | 39
NRXN1 | 28
MGAT4C | 26
RAB3C | 25
BCYRN1 | 25
KCND2 | 23
MALAT1 | 23
ERBB4 | 22
PTMA | 21
KAZN | 21


2. Query external graph using SPARQL endpoint using these top-cited genes for related phenotypes 

In [36]:
import requests
import pandas as pd
from collections import defaultdict

def get_gene_id(gene_name):
    search_url = "https://api-v3.monarchinitiative.org/v3/api/search"
    params = {
        "q": gene_name,
        "category": "biolink:Gene",
        "limit": 1
    }
    try:
        response = requests.get(search_url, params=params)
        response.raise_for_status()
        data = response.json()
        if data['items']:
            return data['items'][0]['id']
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while searching for {gene_name}: {e}")
    return None

def get_associations(gene_id, association_type):
    api_url = "https://api-v3.monarchinitiative.org/v3/api/association"
    params = {
        "subject": gene_id,
        "predicate": association_type,
        "offset": 0,
        "limit": 100
    }
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        data = response.json()
        return data.get('items', [])
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []

def get_gene_info(gene_name):
    gene_id = get_gene_id(gene_name)
    if not gene_id:
        print(f"Could not find ID for gene {gene_name}")
        return {}
    
    phenotypes = get_associations(gene_id, "biolink:has_phenotype")
    diseases = get_associations(gene_id, "biolink:gene_associated_with_condition")
    processes = get_associations(gene_id, "biolink:participates_in")
    functions = get_associations(gene_id, "biolink:enables")
    
    return {
        'Phenotypes': [item.get('object_label', 'Unknown') for item in phenotypes],
        'Diseases': [item.get('object_label', 'Unknown') for item in diseases],
        'Biological Processes': [item.get('object_label', 'Unknown') for item in processes],
        'Molecular Functions': [item.get('object_label', 'Unknown') for item in functions]
    }

# List of genes to query
genes = ["NRXN1", "BRCA1", "MECP2", "CIRBP", "MGAT4C"]

results = {}
for gene in genes:
    results[gene] = get_gene_info(gene)

# Create a summary of shared associations
shared_associations = defaultdict(lambda: defaultdict(list))

for category in ['Phenotypes', 'Diseases', 'Biological Processes', 'Molecular Functions']:
    all_items = set()
    for gene, data in results.items():
        all_items.update(data.get(category, []))
    
    for item in all_items:
        genes_with_item = [gene for gene, data in results.items() if item in data.get(category, [])]
        if len(genes_with_item) > 1:
            shared_associations[category][item] = genes_with_item

# Print the results
for category, items in shared_associations.items():
    print(f"\n{category}:")
    if items:
        df = pd.DataFrame([(item, ', '.join(genes)) for item, genes in items.items()], 
                          columns=[category, 'Genes'])
        print(df.to_string(index=False))
    else:
        print("No shared associations found.")

# Print unique associations for each gene
print("\nUnique associations for each gene:")
for gene, data in results.items():
    print(f"\n{gene}:")
    for category, items in data.items():
        unique_items = [item for item in items if len(shared_associations[category].get(item, [])) <= 1]
        if unique_items:
            print(f"  {category}:")
            for item in unique_items:
                print(f"    - {item}")

#need to amend to remove duplicates


Phenotypes:
                     Phenotypes               Genes
                      Hypotonia        NRXN1, MECP2
Autosomal recessive inheritance        NRXN1, BRCA1
                 Cryptorchidism        BRCA1, MECP2
                   Constipation NRXN1, BRCA1, MECP2
                  Hyperreflexia        BRCA1, MECP2
                Lymphadenopathy        BRCA1, MECP2
        Gastroesophageal reflux        NRXN1, MECP2
     Global developmental delay        BRCA1, MECP2
                         Ataxia        BRCA1, MECP2

Molecular Functions:
Molecular Functions                      Genes
        RNA binding        BRCA1, MECP2, CIRBP
  metal ion binding              NRXN1, MGAT4C
    protein binding NRXN1, BRCA1, MECP2, CIRBP

Unique associations for each gene:

NRXN1:
  Phenotypes:
    - Generalized hypotonia
    - Wide mouth
    - Broad-based gait
    - Feeding difficulties
    - Developmental regression
    - Protruding tongue
    - Drooling
    - Pulmonic stenosis
    - Stra

An error occurred: byte indices must be integers or slices, not str




In [19]:
from SPARQLWrapper import SPARQLWrapper, JSON
import json

top_genes = [str(row['gene']) for row in results]
print("Top 10 genes:", top_genes)
monarch_endpoint = "https://monarchinitiative.org/sparql"
sparql = SPARQLWrapper(monarch_endpoint)

gene_filter = " || ".join([f'CONTAINS(LCASE(STR(?gene)), LCASE("{gene}"))' for gene in top_genes])

monarch_query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX obo: <http://purl.obolibrary.org/obo/>

    SELECT DISTINCT ?gene ?phenotype ?disease 
    WHERE {{
        ?gene rdfs:label ?geneName ;
        obo:RO_0002200 ?phenotype .
        ?phenotype rdfs:label ?phenotypeName .
        OPTIONAL {{
            ?gene obo:RO_0003303 ?disease .
            ?disease rdfs:label ?diseaseName .
        }}
        FILTER({gene_filter})
    }}
    LIMIT 10
"""

# http://purl.obolibrary.org/obo/RO_0002200 - has phenotype
# http://purl.obolibrary.org/obo/RO_0002331 - involved in
# http://purl.obolibrary.org/obo/RO_0003303 - causes condition
# http://purl.obolibrary.org/obo/RO_0002327 - enables
# 

sparql.setQuery(monarch_query)
sparql.setReturnFormat(JSON)

try:
    monarch_results = sparql.query().convert()
    
    # Check if monarch_results is bytes (which would cause the TypeError)
    if isinstance(monarch_results, bytes):
        monarch_results = json.loads(monarch_results.decode('utf-8'))
    
    print("\nResults from Monarch Initiative:")
    if "results" in monarch_results and "bindings" in monarch_results["results"]:
        for result in monarch_results["results"]["bindings"]:
            gene = result.get("geneName", {}).get("value", "N/A")
            phenotype = result.get("phenotypeName", {}).get("value", "N/A")
            disease = result.get("diseaseName", {}).get("value", "N/A")
            print(f"Gene: {gene} | Phenotype: {phenotype} | Disease: {disease}")
    else:
        print("No results found or unexpected result format.")
        print("Raw results:", monarch_results)
except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Raw results:", monarch_results)


Top 10 genes: ['CIRBP', 'NRXN1', 'MGAT4C', 'RAB3C', 'BCYRN1', 'KCND2', 'MALAT1', 'ERBB4', 'PTMA', 'KAZN']
An error occurred: Expecting value: line 1 column 1 (char 0)
Raw results: b'<!doctype html>\n<html lang="en-US">\n  <head>\n    <!-- basic -->\n    <meta charset="utf-8" />\n    <meta name="viewport" content="width=device-width,initial-scale=1.0" />\n\n    <!-- favicons -->\n    <link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png" />\n    <link\n      rel="icon"\n      type="image/png"\n      sizes="512x512"\n      href="/favicon-512x512.png"\n    />\n    <link rel="manifest" href="/site.webmanifest" />\n    <link rel="mask-icon" href="/safari-pinned-tab.svg" color="#00838f" />\n    <meta name="msapplication-TileColor" content="#00838f" />\n    <meta name="theme-color" content="#ffffff" />\n\n    <!-- basic -->\n    <title>Monarch Initiative</title>\n    <meta name="title" content="Monarch Initiative" />\n    <meta name="description" content="The Monarch Initia

In [None]:
"""monarch_results = sparql.query().convert()
print("Gene | Phenotype | Disease")
print("-----|-----------|--------")
for row in monarch_results:
    print(row)    
"""    
    
"""gene = str(row['gene'])
    geneName = str(row['geneName']) 
    phenotype = str(row['phenotype']) 
    phenotypeName = str(row['phenotypeName']) 
    disease = str(row['disease']) 
    diseaseName = str(row['diseaseName'])
    print(f"{gene} | {geneName} | {phenotype} | {phenotypeName} | {disease} | {diseaseName}")
"""

"""print("\nResults from Monarch Initiative:")
for result in monarch_results["results"]["bindings"]:
    gene = result["geneName"]["value"]
    phenotype = result["phenotypeName"]["value"]
    disease = result.get("diseaseName", {}).get("value", "N/A")
    print(f"Gene: {gene} | Phenotype: {phenotype} | Disease: {disease}")"""

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Define the SPARQL endpoint URL
endpoint_url = "https://monarchinitiative.org/sparql/"

# Create a SPARQLWrapper object
sparql = SPARQLWrapper(endpoint_url)

# Define your SPARQL query
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX monarch: <https://monarchinitiative.org/gene/>

SELECT ?gene ?geneName ?phenotype ?phenotypeName
WHERE {
  ?gene rdfs:label ?geneName ;
        <https://monarchinitiative.org/vocabulary/has_phenotype> ?phenotype .
  ?phenotype rdfs:label ?phenotypeName .
  FILTER(CONTAINS(LCASE(STR(?geneName)), "sox4"))
}
LIMIT 10
"""

# Set the query and response format
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

# Execute the query and convert results to JSON
results = sparql.query().convert()

# Process the results
for result in results["results"]["bindings"]:
    gene = result["gene"]["value"]
    gene_name = result["geneName"]["value"]
    phenotype = result["phenotype"]["value"]
    phenotype_name = result["phenotypeName"]["value"]
    print(f"Gene: {gene_name} ({gene})")
    print(f"Phenotype: {phenotype_name} ({phenotype})")
    print("---")


import rdflib

g = rdflib.Graph()

for result in results["results"]["bindings"]:
    gene = rdflib.URIRef(result["gene"]["value"])
    phenotype = rdflib.URIRef(result["phenotype"]["value"])
    g.add((gene, rdflib.URIRef("https://monarchinitiative.org/vocabulary/has_phenotype"), phenotype))