Usecase test 1: demonstrating that genes given in an article's abstract are found to be amoung the highest significance genes linked to that article in the KG, and that other highly significant genes are not mentioned in the full text. 

1. Retrieving all named genes from the 30 articles first retrieved from Pubmed (from the stored metadata file)

In [3]:
#import scispacy
import spacy
import csv
import re
spacy.load("en_core_web_sm")

#retrieve data from the metadata file
def get_csv_column(file_path, column_name):
    data = []
    with open(file_path, 'r') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter="\t")
        for row in csvreader:
            data.append(row[column_name])
    return data


def process_csv(file_path, search_terms, search_column, return_column):
    nlp = spacy.load("en_core_web_sm")
    results = []
    search_terms_lower = [term.lower() for term in search_terms]
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            search_text = row[search_column]
            return_value = row[return_column]
            doc = nlp(search_text)
            matched_terms = set(token.text for token in doc 
                                if token.text.lower() in search_terms_lower 
                                and not token.is_punct
                                and token.pos_ != "VERB")
            if matched_terms:
                results.append({
                    'matched_terms': list(matched_terms),
                    'return_value': return_value
                })
    return results


abstract_file = '../data/asd_article_metadata.csv'
gene_file = '../gene_ids.txt'     #list of gene names taken from Ensembl 
column_name = 'Gene name'
gene_list = get_csv_column(gene_file, column_name)
search_column = 'abstract'      #will search through the columns of abstracts
return_column = 'pmid'           #will return the associated article pmid

results = process_csv(abstract_file, gene_list, search_column, return_column)

for result in results:
    print(f"Article PMID: {result['return_value']}")
    print(f"Matched gene terms: {result['matched_terms']}")
    
    print()

Article PMID: 20868653
Matched gene terms: ['SHANK3', 'SHANK2', 'PTCHD1', 'NRXN1', 'HEY1', 'SOX9', 'NLGN3']

Article PMID: 26186191
Matched gene terms: ['FOXG1']

Article PMID: 26687839
Matched gene terms: ['Rbfox1']

Article PMID: 27439572
Matched gene terms: ['was', 'mTOR', 'SLC9A9']

Article PMID: 30111840
Matched gene terms: ['CPEB1', 'CPEB4']

Article PMID: 31500805
Matched gene terms: ['PI4KA', 'SNAP29', 'DGCR8', 'HIRA']

Article PMID: 33157009
Matched gene terms: ['NCKAP1', 'Nckap1']

Article PMID: 33160303
Matched gene terms: ['was']

Article PMID: 33262327
Matched gene terms: ['SHANK3', 'ADNP2', 'ANK2', 'CHD8', 'NRXN2', 'ARID1B']

Article PMID: 33587235
Matched gene terms: ['LRRC2', 'was', 'PRKAR2A']

Article PMID: 34535545
Matched gene terms: ['ADAR']

Article PMID: 34946850
Matched gene terms: ['set', 'TF']

Article PMID: 35710789
Matched gene terms: ['was']

Article PMID: 35962193
Matched gene terms: ['CIT']

Article PMID: 36213201
Matched gene terms: ['set']

Article PMID:

Visual check for correct outputs - many without genes; they often instead mention groups such as "circRNAs", "circQTLs" or broad categories such as "translational machinery"):

Article PMID: 20868653 - Matched gene terms: ['SHANK3', 'SHANK2', 'PTCHD1', 'NRXN1', 'HEY1', 'SOX9', 'NLGN3']
Article PMID: 26186191 - Matched gene terms: ['FOXG1']
Article PMID: 26687839 - Matched gene terms: ['Rbfox1']
Article PMID: 27439572 - Matched gene terms: ['mTOR', 'SLC9A9']
Article PMID: 30111840 - Matched gene terms: ['CPEB1', 'CPEB4']
Article PMID: 31500805 - Matched gene terms: ['PI4KA', 'SNAP29', 'DGCR8', 'HIRA']
Article PMID: 33157009 - Matched gene terms: ['NCKAP1']
Article PMID: 33262327 - Matched gene terms: ['SHANK3', 'ADNP2', 'ANK2', 'CHD8', 'NRXN2', 'ARID1B']
Article PMID: 33587235 - Matched gene terms: ['LRRC2', 'PRKAR2A']
Article PMID: 34535545 - Matched gene terms: ['ADAR']
Article PMID: 28384108 - Matched gene terms: ['CACNB1', 'CACNA1C']
Article PMID: 28485729 - Matched gene terms: ['HRH4', 'HRH1', 'HRH3', 'HDC', 'HRH2', 'HNMT']
Article PMID: 33262327 - Matched gene terms: ['NRXN2', 'ANK2', 'CHD8', 'ADNP2', 'SHANK3', 'ARID1B']


2. Querying the KG to retrieve the most significant genes within the datasets for each article: 

In [1]:
import rdflib

filename = "updated_ntriples_file.nt"
g = rdflib.Graph()
g.parse(filename, format="nt")

<Graph identifier=Nbe1a4716745642c3a725bd2acbec7c00 (<class 'rdflib.graph.Graph'>)>

In [4]:

import re

#query to return all data-rows with a gene, a pvalue <0.05 and log >1
query = """
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?subject ?gene ?pvalue ?log
    WHERE {
        ?subject BIOLINK:Gene ?gene .
        ?subject EDAM:data_2082 ?pvalue .
        ?subject EDAM:data_3754 ?log .
        FILTER(datatype(?pvalue) = xsd:double && ?pvalue < 0.05)
        FILTER(datatype(?log) = xsd:double && ABS(?log) > 1)
    }
"""
results = g.query(query)

In [7]:
#Retreiving the significant genes with greatest log fold change from the relevant datasets (all genes included)

from collections import defaultdict

# List of valid PMIDs with genes names in the abstract
valid_roots = ['33262327']

def find_root(g, node, visited=None):
    if visited is None:
        visited = set()
    visited.add(node)
    parents = list(g.subjects(predicate=None, object=node))
    if not parents:
        return node
    for parent in parents:
        if parent not in visited:
            return find_root(g, parent, visited)
    return node

# store genes and p-values for each root(article)
root_data = defaultdict(list)

for row in results:
    subject = row.subject
    root = find_root(g, subject)
    # Only add data if the article is in the valid_roots list
    if any(valid_root in str(root) for valid_root in valid_roots):
        genes = []
        if row.gene:
            genes.append(("BIOLINK:Gene", str(row.gene)))
        root_data[root].append((genes, row.pvalue, row.log))

for root, gene_data in root_data.items():
    print(f"Article: {root}")
    # Sort by absolute log value (descending) and return first 10
    for genes, pvalue, log in sorted(gene_data, key=lambda x: abs(x[2]), reverse=True)[:20]:
        gene_str = '; '.join([f"{predicate}: {gene}" for predicate, gene in genes])
        print(f"  Genes: {gene_str}")
        print(f"  P-value: {pvalue}, Log: {log}")
        print()
    print()



Article: https://pubmed.ncbi.nlm.nih.gov/33262327
  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16142
  P-value: 2.06e-149, Log: 3.154305941

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:25700
  P-value: 3.9876936469371696e-62, Log: 2.83301343344385

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16155
  P-value: 6.61e-48, Log: 2.785484554

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:29576
  P-value: 3.69e-56, Log: 2.7497335

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:25420
  P-value: 4.16089096317224e-65, Log: 2.6750822361374

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:20753
  P-value: 3.38268075696075e-58, Log: 2.60907437415787

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16838
  P-value: 6.26389232809516e-97, Log: 2.58938552852585

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:6914
  P-value: 8.384530234413741e-45, Log: 2.54442640458582

  Genes: BIOLINK:Gene: https://monarchini

In [11]:
#confirming number of rows of data matches length of original xlsx file
# rowcount_query = """
    PREFIX EDAM: <http://edamontology.org/>

    SELECT (COUNT(?hasOutput) AS ?count)
    WHERE {
        ?article ?p1 ?intermediary .
        ?intermediary EDAM:has_output ?hasOutput .
        FILTER (?article = <https://pubmed.ncbi.nlm.nih.gov/33262327>)
    }
"""

result = g.query(rowcount_query)

for row in result:
    print(f"Number of predicates: {row['count']}")

#Number of predicates: 2370

Number of predicates: 2370


In [9]:
#Retreiving the significant genes with greatest log fold change ONLY including SFARI Genes

from collections import defaultdict
import csv
import re

sfari_status = {}
with open('../sfari_gene.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        hgnc_id = row['HGNC ID'].strip()
        gene_score = row['gene-score'].strip()
        if gene_score and gene_score.isdigit():
            sfari_status[hgnc_id] = int(gene_score)
        else:
            sfari_status[hgnc_id] = None 

def extract_hgnc_id(gene_url):
    match = re.search(r'HGNC:\d+', gene_url)
    return match.group(0) if match else None

def is_valid_gene(gene_url):
    hgnc_id = extract_hgnc_id(gene_url)
    if hgnc_id in sfari_status:
        return hgnc_id in sfari_status and sfari_status[hgnc_id] not in [None, ""] and sfari_status[hgnc_id] in [1, 2]
    return False


for root, gene_data in root_data.items():
    print(f"Article: {root}")
    print(f"Number of gene entries: {len(gene_data)}")
    # Sort by absolute log value (descending) and return first 10
    for genes, pvalue, log in sorted(gene_data, key=lambda x: abs(x[2]), reverse=True)[:15]:
        gene_str = '; '.join([f"{predicate}: {gene}" for predicate, gene in genes])
        print(f"  Genes: {gene_str}")
        print(f"  P-value: {pvalue}, Log: {log}")
        print()
    print()

Article: https://pubmed.ncbi.nlm.nih.gov/33262327
Number of gene entries: 2028
  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16142
  P-value: 2.06e-149, Log: 3.154305941

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:25700
  P-value: 3.9876936469371696e-62, Log: 2.83301343344385

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16155
  P-value: 6.61e-48, Log: 2.785484554

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:29576
  P-value: 3.69e-56, Log: 2.7497335

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:25420
  P-value: 4.16089096317224e-65, Log: 2.6750822361374

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:20753
  P-value: 3.38268075696075e-58, Log: 2.60907437415787

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16838
  P-value: 6.26389232809516e-97, Log: 2.58938552852585

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:6914
  P-value: 8.384530234413741e-45, Log: 2.54442640458582

  Genes: BIO

In [11]:
import csv
from collections import defaultdict


gene_scores = {}
with open('../sfari_gene.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        gene_id = row[10]  
        gene_score = row[6]  
        gene_scores[gene_id] = gene_score

filtered_results = []

for root, gene_data in root_data.items():
    for genes, pvalue, log in sorted(gene_data, key=lambda x: abs(x[2]), reverse=True)[:20]:
        for predicate, gene in genes:
            # Extract gene ID if it starts with 'HGNC:'
            if gene.startswith('HGNC:'):
                gene_id = gene
                # Check if the gene ID is in the gene_scores dictionary
                if gene_id in gene_scores:
                    gene_score = int(gene_scores[gene_id])
                    # Add to filtered_results if gene_score is 1 or 2
                    if gene_score in [1, 2]:
                        filtered_results.append({
                            'Article': root,
                            'Gene': gene,
                            'P-value': pvalue,
                            'Log': log,
                            'Gene-Score': gene_score
                        })

# Print filtered results
for result in filtered_results:
    print(f"Article: {result['Article']}")
    print(f"  Gene: {result['Gene']}")
    print(f"  P-value: {result['P-value']}")
    print(f"  Log: {result['Log']}")
    print(f"  Gene-Score: {result['Gene-Score']}")
    print()

In [12]:
import csv
from collections import defaultdict

# Load gene-list.csv into a dictionary
gene_scores = {}
with open('../sfari_gene.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        gene_id = row[10] 
        gene_score = row[6]
        gene_scores[gene_id] = gene_score

# List of valid PMIDs with genes names in the abstract
valid_roots = ['33262327']

def find_root(g, node, visited=None):
    if visited is None:
        visited = set()
    visited.add(node)
    parents = list(g.subjects(predicate=None, object=node))
    if not parents:
        return node
    for parent in parents:
        if parent not in visited:
            return find_root(g, parent, visited)
    return node

# Store genes and p-values for each root (article)
root_data = defaultdict(list)

for row in results:
    subject = row.subject
    root = find_root(g, subject)
    # Only add data if the article is in the valid_roots list
    if any(valid_root in str(root) for valid_root in valid_roots):
        genes = []
        if row.gene:
            genes.append(("BIOLINK:Gene", str(row.gene)))
        root_data[root].append((genes, row.pvalue, row.log))

# Create a new list to store results based on gene scores
filtered_results = []

for root, gene_data in root_data.items():
    for genes, pvalue, log in sorted(gene_data, key=lambda x: abs(x[2]), reverse=True)[:20]:
        for predicate, gene in genes:
            # Extract gene ID if it starts with 'HGNC:'
            if gene.startswith('HGNC:'):
                gene_id = gene
                # Check if the gene ID is in the gene_scores dictionary
                if gene_id in gene_scores:
                    gene_score = int(gene_scores[gene_id])
                    # Add to filtered_results if gene_score is 1 or 2
                    if gene_score in [1, 2]:
                        filtered_results.append({
                            'Article': root,
                            'Gene': gene,
                            'P-value': pvalue,
                            'Log': log,
                            'Gene-Score': gene_score
                        })

# Print filtered results
for result in filtered_results:
    print(f"Article: {result['Article']}")
    print(f"  Gene: {result['Gene']}")
    print(f"  P-value: {result['P-value']}")
    print(f"  Log: {result['Log']}")
    print(f"  Gene-Score: {result['Gene-Score']}")
    print()
