Use case test 1: demonstrating that genes named in an article's abstract are among the most significant genes linked to that article in the KG.

1. Retrieving all named genes from the abstracts of the first 10 articles retrieved from PubMed (read from the stored metadata file)

In [9]:
#import scispacy
import spacy
import csv
import re
spacy.load("en_core_web_sm")

# Read a single named column out of a CSV file (used below to load the gene list).
def get_csv_column(file_path, column_name):
    """Return every value stored under ``column_name`` in a CSV file.

    Args:
        file_path: Path of the CSV file; its first row must be the header.
        column_name: Header of the column to extract.

    Returns:
        A list of strings, one per data row, in file order.

    Raises:
        KeyError: If ``column_name`` is not one of the file's headers and the
            file has at least one data row.
    """
    # Explicit UTF-8 keeps the result independent of the platform's default
    # encoding and matches how process_csv opens its input. newline='' is what
    # the csv module requires so that line breaks inside quoted fields are
    # handed to the parser untouched.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        return [row[column_name] for row in csv.DictReader(csvfile)]


def process_csv(file_path, search_terms, search_column, return_column,
                case_sensitive=False):
    """Find which search terms occur as tokens in one text column of a CSV file.

    Each row's ``search_column`` text is tokenised with the spaCy
    ``en_core_web_sm`` pipeline. A token counts as a match when its text is one
    of ``search_terms``, it is not punctuation, and it is not tagged as a verb.

    Args:
        file_path: Path of the UTF-8 CSV file to scan (first row is the header).
        search_terms: Iterable of strings to look for (e.g. gene names).
        search_column: Header of the column whose text is searched.
        return_column: Header of the column whose value identifies the row in
            the output (e.g. the article PMID).
        case_sensitive: When False (the default, and the original behaviour)
            terms and tokens are compared in lower case. With that setting
            ordinary words such as "set" or "impact" match terms that are
            spelled the same way; pass True to require an exact-case match.

    Returns:
        A list with one dict per row that had at least one match:
        ``{'matched_terms': [...], 'return_value': ...}``. ``matched_terms``
        holds the distinct token texts as they appear in the text, in no
        particular order.

    Raises:
        KeyError: If ``search_column`` or ``return_column`` is not a header of
            the file.
    """
    nlp = spacy.load("en_core_web_sm")

    def normalise(text):
        return text if case_sensitive else text.lower()

    # A set gives constant-time membership tests; the term list can be very
    # long and is checked once per token.
    wanted = {normalise(term) for term in search_terms}

    results = []
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            search_text = row[search_column]
            return_value = row[return_column]
            matched_terms = {
                token.text
                for token in nlp(search_text)
                if normalise(token.text) in wanted
                and not token.is_punct
                and token.pos_ != "VERB"
            }
            if matched_terms:
                results.append({
                    'matched_terms': list(matched_terms),
                    'return_value': return_value,
                })
    return results


# Inputs: the article metadata (abstract text + PMID) and the gene-name list
# taken from Ensembl.
abstract_file = 'data/asd_article_metadata.csv'
search_column = 'abstract'      # column whose text is searched
return_column = 'pmid'          # column reported for each matching article
gene_file = 'gene_list.csv'
column_name = 'Gene name'

# Load the gene names, then look for them in every abstract.
gene_list = get_csv_column(gene_file, column_name)
results = process_csv(abstract_file, gene_list, search_column, return_column)

# One report per matching article, followed by a blank separator line.
for result in results:
    print(
        f"Article PMID: {result['return_value']}",
        f"Matched gene terms: {result['matched_terms']}",
        "",
        sep="\n",
    )

Article PMID: 36213201
Matched gene terms: ['set']

Article PMID: 36688057
Matched gene terms: ['CD14']

Article PMID: 37381037
Matched gene terms: ['ATRX', 'set', 'TEs', 'impact']

Article PMID: 38781372
Matched gene terms: ['GRN']



Visual check: the only correct match in this output is PMID 37381037, whose abstract contains ATRX.


2. Querying the KG to retrieve the most significant genes within the datasets for each article: 

In [3]:
import rdflib
import re

# Load the knowledge graph from its N-Triples file.
filename = "main_graph.nt"
g = rdflib.Graph()
g.parse(source=filename, format="nt")

# SPARQL query: every data row (?subject) that has a p-value (EDAM:data_2082)
# and a gene given as a Biolink symbol, an Ensembl id, or an NCBI gene id.
query = """
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>

    SELECT ?subject ?gene ?value
    WHERE {
        ?subject EDAM:data_2082 ?value .
        ?subject BIOLINK:symbol | ENSEMBL:id | NCBIGENE:id ?gene .
    }
"""
# Planned extension: also select the logfold value by adding this pattern:
#        ?subject EDAM:data_3754 ?logfold .

# Run the query; the rows are consumed in a later cell.
results = g.query(query)
#print(f"Number of results: {len(list(results))}")


Number of results: 29082


In [9]:
from collections import defaultdict
from decimal import Decimal, getcontext

# NOTE: the context precision applies to the results of Decimal arithmetic
# only. It does not round values built with Decimal(<str>), so p-values parsed
# from the query rows keep their full stored precision.
getcontext().prec = 6   # 6 significant digits for Decimal arithmetic results

def find_root(g, node, visited=None):
    """Walk incoming edges upwards from ``node`` until a root is reached.

    A "parent" of a node is any subject of a triple whose object is that node
    (any predicate). A root is a node with no parents. Parents are explored
    depth-first in the order the graph yields them.

    Args:
        g: Graph object exposing ``subjects(predicate=None, object=...)``.
        node: Node to start from.
        visited: Optional set of nodes to treat as already explored; it is
            updated in place. A new set is created when omitted.

    Returns:
        The first parentless ancestor found, or ``node`` itself when it has no
        parents. If every upward path ends in already-visited nodes (a cycle),
        the first such dead-end node is returned.
    """
    if visited is None:
        visited = set()
    visited.add(node)

    parents = list(g.subjects(predicate=None, object=node))
    if not parents:
        return node

    # An explicit stack replaces recursion so that long ancestor chains cannot
    # exceed the interpreter's recursion limit. Each frame is a node together
    # with an iterator over its parents that have not been tried yet.
    first_dead_end = None
    stack = [(node, iter(parents))]
    while stack:
        current, pending = stack[-1]
        for parent in pending:
            if parent in visited:
                continue
            visited.add(parent)
            grandparents = list(g.subjects(predicate=None, object=parent))
            if not grandparents:
                return parent
            stack.append((parent, iter(grandparents)))
            break
        else:
            # Every parent of ``current`` was already visited. Remember the
            # first such node as a fallback, then backtrack and try the
            # remaining parents of earlier nodes instead of giving up.
            if first_dead_end is None:
                first_dead_end = current
            stack.pop()
    return first_dead_end

# InvalidOperation is what Decimal raises for unparsable text and for ordering
# comparisons involving NaN. It has to be imported here because the import
# line above only brings in Decimal and getcontext.
from decimal import InvalidOperation

# store genes and p-values for each root(article)
root_data = defaultdict(list)
significance_threshold = Decimal('0.05')
skipped_rows = 0    # rows whose p-value could not be parsed or compared

for row in results:
    try:
        value = Decimal(row.value)
        is_significant = value < significance_threshold
    except (ValueError, InvalidOperation):
        # Best effort: leave the row out, but count it so the loss is visible.
        skipped_rows += 1
        continue
    if is_significant:
        subject = row.subject
        root = find_root(g, subject)
        root_data[root].append((row.gene, value))

if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) whose p-value could not be parsed")

for root, gene_data in root_data.items():
    print(f"Article: {root}")
    # NOTE(review): fixed-point formatting shows any p-value below 5e-7 as
    # 0.000000, so the most significant genes can look identical in the output.
    for gene, pvalue in sorted(gene_data, key=lambda x: x[1])[:10]:  # Sort by p-value and return first 10
        print(f"  Gene: {gene}, P-value: {pvalue:.6f}")  # Format to 6 decimal places
    print()

Article: https://www.ncbi.nlm.nih.gov/pmc/articles/PMID36323788
  Gene: RNF157, P-value: 0.000000
  Gene: TANC2.4, P-value: 0.000000
  Gene: PCP4, P-value: 0.000000
  Gene: HCN1, P-value: 0.000000
  Gene: SNCA, P-value: 0.000000
  Gene: SCN1B, P-value: 0.000000
  Gene: IQCJ-SCHIP1.2, P-value: 0.000000
  Gene: WNK2, P-value: 0.000000
  Gene: AC113383.1, P-value: 0.000000
  Gene: ARHGAP10, P-value: 0.000000

Article: https://www.ncbi.nlm.nih.gov/pmc/articles/PMID31097668
  Gene: ENSG00000074410, P-value: 0.005398
  Gene: ENSG00000169515, P-value: 0.005398
  Gene: ENSG00000101493, P-value: 0.009701
  Gene: ENSG00000102445, P-value: 0.009701
  Gene: ENSG00000120742, P-value: 0.009701
  Gene: ENSG00000158966, P-value: 0.009706
  Gene: ENSG00000137693, P-value: 0.009706
  Gene: ENSG00000258655, P-value: 0.009706
  Gene: ENSG00000085719, P-value: 0.009811
  Gene: ENSG00000153208, P-value: 0.009987



(This can now be repeated with more articles, with conversion between gene identifier formats included, and with the logfold figures added.)