Use case test 1: demonstrating that genes named in an article's abstract are found among the highest-significance genes linked to that article in the KG.

1. Retrieving all named genes from the abstracts of the 30 articles originally retrieved from PubMed (read from the stored metadata file)

In [1]:
#import scispacy
import spacy
import csv
import re
spacy.load("en_core_web_sm")

#retrieve data from the metadata file
def get_csv_column(file_path, column_name):
    """Return all values of one column from a tab-delimited file with a header row.

    Args:
        file_path: Path of the tab-separated file to read.
        column_name: Header name of the column to extract.

    Returns:
        A list holding the column's value for every data row, in file order.
        An empty list if the file has no data rows.

    Raises:
        KeyError: If a data row is read and ``column_name`` is not one of the
            header fields.
    """
    # newline='' hands raw line endings to the csv module (as its docs
    # require), and the explicit encoding keeps reads independent of the
    # platform default, matching how process_csv opens its file.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter="\t")
        return [row[column_name] for row in csvreader]


def process_csv(file_path, search_terms, search_column, return_column, nlp=None):
    """Find which search terms occur as tokens in one text column of a CSV file.

    Each row's ``search_column`` text is tokenised with ``nlp``. A token counts
    as a match when its lower-cased text equals a lower-cased search term, it
    is not punctuation, and it is not tagged as a verb. Matching is therefore
    case-insensitive, so a term that coincides with an ordinary word still
    matches whenever the tagger does not label that word ``VERB``.

    Args:
        file_path: Path of the comma-delimited file (with header row) to scan.
        search_terms: Iterable of strings to look for.
        search_column: Header name of the column whose text is searched.
        return_column: Header name of the column reported for matching rows.
        nlp: Optional callable that turns a string into an iterable of tokens
            exposing ``text``, ``is_punct`` and ``pos_``. Defaults to loading
            spaCy's ``en_core_web_sm`` pipeline; pass an already loaded
            pipeline to avoid reloading it on every call.

    Returns:
        A list with one dict per row that had at least one match, in file
        order. Each dict has ``'matched_terms'`` (the distinct matched token
        texts as they appear in the row, sorted) and ``'return_value'`` (the
        row's ``return_column`` value).

    Raises:
        KeyError: If ``search_column`` or ``return_column`` is missing from a
            row that is read.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    # A set gives constant-time membership tests; the term list can be large
    # and is probed once per token.
    search_terms_lower = {term.lower() for term in search_terms}
    results = []
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            search_text = row[search_column]
            return_value = row[return_column]
            # DictReader fills cells missing from a short row with None,
            # which the tokeniser cannot process; treat them as empty text.
            doc = nlp(search_text or "")
            matched_terms = {
                token.text
                for token in doc
                if token.text.lower() in search_terms_lower
                and not token.is_punct
                and token.pos_ != "VERB"
            }
            if matched_terms:
                results.append({
                    # sorted() makes the order reproducible between runs
                    'matched_terms': sorted(matched_terms),
                    'return_value': return_value,
                })
    return results


# Inputs for the abstract scan
abstract_file = 'data/asd_article_metadata.csv'
gene_file = 'gene_ids.txt'          # list of gene names taken from Ensembl
column_name = 'Approved symbol'
search_column = 'abstract'          # the abstract text is what gets searched
return_column = 'pmid'              # each hit is reported under its article's PMID

# Load the gene names, then look for them in every abstract
gene_list = get_csv_column(gene_file, column_name)
results = process_csv(abstract_file, gene_list, search_column, return_column)

# One PMID line, one matched-terms line and a blank separator per article
for result in results:
    print(
        f"Article PMID: {result['return_value']}",
        f"Matched gene terms: {result['matched_terms']}",
        "",
        sep="\n",
    )

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


Article PMID: 28384108
Matched gene terms: ['CACNA1C', 'was', 'CACNB1']

Article PMID: 28485729
Matched gene terms: ['was', 'set', 'HDC', 'HRH4', 'HNMT', 'HRH1', 'HRH3', 'HRH2']

Article PMID: 29134693
Matched gene terms: ['MAA']

Article PMID: 30016992
Matched gene terms: ['set', 'PPP1R3F']

Article PMID: 30816183
Matched gene terms: ['Foxp2', 'was', 'Auts2']

Article PMID: 31719968
Matched gene terms: ['was']

Article PMID: 32015540
Matched gene terms: ['TCF7L2', 'Pten', 'TCF4']

Article PMID: 32365465
Matched gene terms: ['Mice', 'was', 'mice']

Article PMID: 33160303
Matched gene terms: ['was']

Article PMID: 33262327
Matched gene terms: ['ARID1B', 'ANK2', 'CHD8', 'SHANK3', 'NRXN2', 'ADNP2']

Article PMID: 34946850
Matched gene terms: ['set', 'TF']

Article PMID: 35710789
Matched gene terms: ['was']

Article PMID: 35962193
Matched gene terms: ['CIT']

Article PMID: 36213201
Matched gene terms: ['set']

Article PMID: 36688057
Matched gene terms: ['CD14']

Article PMID: 37381037
Matc

Visual check of the output above for correct matches. NB: out of the 30 articles, only these 6 name relevant key genes in the abstract; the others instead tend to mention groups such as "circRNAs" or "circQTLs", or broad categories such as "translational machinery":

Article PMID: 28384108 - Matched gene terms: ['CACNB1', 'CACNA1C']

Article PMID: 28485729 - Matched gene terms: ['HRH4', 'HRH1', 'HRH3', 'HDC', 'HRH2', 'HNMT']

Article PMID: 30016992 - Matched gene terms: ['PPP1R3F']

Article PMID: 30816183 - Matched gene terms: ['Foxp2', 'Auts2']

Article PMID: 32015540 - Matched gene terms: ['Pten', 'TCF4']
NB: TCF7L2 was matched in this abstract but is excluded here because the article states "mutations in the TCF4 gene, but not the TCF7L2 gene".

Article PMID: 33262327 - Matched gene terms: ['NRXN2', 'ANK2', 'CHD8', 'ADNP2', 'SHANK3', 'ARID1B']


2. Querying the KG to retrieve the most significant genes within the datasets for each article: 

In [1]:
import rdflib
import re

# Load the N-Triples file into an rdflib graph; every later cell queries `g`.
filename = "cleaned_maingraph.nt"
g = rdflib.Graph()
g.parse(filename, format="nt")

# Query returning all data rows with a gene, a p-value < 0.05 and a log value > 1.
#  - ?pvalue and ?log are only kept when typed xsd:double; values with any
#    other datatype fail the FILTERs and drop the row.
#  - The three gene predicates are OPTIONAL, but the final FILTER requires at
#    least one of them to be bound.
# NOTE(review): `?log > 1` keeps positive values only, so negative log values
# never reach the later sort on abs(log) - confirm this is intended.
query = """
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?subject ?biolink_gene ?ensembl_gene ?ncbi_gene ?pvalue ?log
    WHERE {
        ?subject EDAM:data_2082 ?pvalue .
        ?subject EDAM:data_3754 ?log .
                FILTER(datatype(?pvalue) = xsd:double && ?pvalue < 0.05)
        FILTER(datatype(?log) = xsd:double && ?log > 1)
        OPTIONAL { ?subject BIOLINK:Gene ?biolink_gene }
        OPTIONAL { ?subject ENSEMBL:id ?ensembl_gene }
        OPTIONAL { ?subject NCBIGENE:id ?ncbi_gene }
        FILTER(BOUND(?biolink_gene) || BOUND(?ensembl_gene) || BOUND(?ncbi_gene))
    }
"""

In [2]:
# Run the SPARQL query against the loaded graph; the rows are read below by
# SELECT variable name (row.subject, row.pvalue, row.log, ...).
results = g.query(query)

In [3]:
from collections import defaultdict

# PMIDs of the articles whose abstracts name specific genes (the six kept
# after the visual check of the abstract matches); only these are reported.
valid_roots = ['28384108', '28485729', '30016992', '30816183', '32015540', '33262327']

def find_root(g, node, visited=None):
    """Follow incoming edges from ``node`` until no unvisited parent remains.

    At each step the first not-yet-visited subject of any triple whose object
    is the current node becomes the new current node. The walk stops, and the
    current node is returned, when the node has no incoming triples or all of
    its parents were already visited (which also ends walks around cycles).
    Only the first unvisited parent is ever followed, so for a node with
    several parents the result depends on the order ``g.subjects`` yields them.

    The walk is iterative so that long parent chains cannot exhaust Python's
    recursion limit.

    Args:
        g: Graph-like object providing ``subjects(predicate=..., object=...)``.
        node: Node to start from.
        visited: Optional set of nodes to treat as already seen; it is updated
            in place with every node the walk passes through.

    Returns:
        The node at which the walk stopped.
    """
    if visited is None:
        visited = set()
    while True:
        visited.add(node)
        # predicate=None matches triples with any predicate
        for candidate in g.subjects(predicate=None, object=node):
            if candidate not in visited:
                node = candidate
                break
        else:
            # no parents at all, or every parent already visited
            return node

# Collect (genes, p-value, log) entries per root node (article)
root_data = defaultdict(list)

for row in results:
    root = find_root(g, row.subject)
    root_text = str(root)
    # Rows whose root does not contain one of the valid PMIDs are skipped
    if not any(valid_root in root_text for valid_root in valid_roots):
        continue
    labelled_ids = (
        ("BIOLINK:Gene", row.biolink_gene),
        ("ENSEMBL:id", row.ensembl_gene),
        ("NCBIGENE:id", row.ncbi_gene),
    )
    # Keep only the identifiers that are actually present on this row
    genes = [(label, str(value)) for label, value in labelled_ids if value]
    root_data[root].append((genes, row.pvalue, row.log))

for root, gene_data in root_data.items():
    print(f"Article: {root}")
    # Largest absolute log value first; only the top 10 entries are shown
    ranked = sorted(gene_data, key=lambda entry: abs(entry[2]), reverse=True)
    for genes, pvalue, log in ranked[:10]:
        gene_str = '; '.join(f"{predicate}: {gene}" for predicate, gene in genes)
        print(f"  Genes: {gene_str}")
        print(f"  P-value: {pvalue}, Log: {log}")
        print()
    print()

#need to repeat with  logfold figures
#        pvalue = Decimal(row.pvalue)
#        log_value = Decimal(row.log)  
#        if pvalue < Decimal('0.05'):

Article: https://pubmed.ncbi.nlm.nih.gov/33262327
  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16142
  P-value: 2.06e-149, Log: 3.154305941

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:24822
  P-value: 3.9876936469371696e-62, Log: 2.83301343344385

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16155
  P-value: 6.61e-48, Log: 2.785484554

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:25420
  P-value: 4.16089096317224e-65, Log: 2.6750822361374

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:20753
  P-value: 3.38268075696075e-58, Log: 2.60907437415787

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:16838
  P-value: 6.26389232809516e-97, Log: 2.58938552852585

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:6914
  P-value: 8.384530234413741e-45, Log: 2.54442640458582

  Genes: BIOLINK:Gene: https://monarchinitiative.org/HGNC:24615
  P-value: 1.1900000000000001e-57, Log: 2.489446544

  Genes: BIOLINK:Gene: ht

In [None]:
NRXN2', 'ANK2', 'CHD8', 'ADNP2', 'SHANK3', 'ARID1B