# Extract relevant data from dbSNP

The purpose of this notebook is to extract relevant data about single nucleotide polymorphisms (SNPs) from the dbSNP database.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"dbsnp"`
*  Note: Requires an internet connection to download information from [dbSNP](https://www.ncbi.nlm.nih.gov/snp/).
*  Utilizes the [Bio.Entrez package](https://biopython.org/docs/latest/api/Bio.Entrez.html).

### Citations
Phan L, Zhang H, Wang Q, Villamarin R, Hefferon T, Ramanathan A, Kattman B. The evolution of dbSNP: 25 years of impact in genomic research. Nucleic Acids Res. 2025 Jan 6;53(D1):D925-D931. doi: 10.1093/nar/gkae977. PMID: 39530225; PMCID: PMC11701571.

## Setup
### Import packages

In [None]:
from collections import defaultdict
from xml.etree import ElementTree

import numpy as np
import pandas as pd
from Bio import Entrez
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
)
from rbc_gem_utils.util import has_value_type

# Report the versions of the package and its dependencies used for this run.
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

## Set notebook options

In [2]:
# When True, the final SNP table is written to disk at the end of the notebook,
# replacing any existing file of the same name.
overwrite = True

# Make sure to use your own email!
# This address is assigned to `Entrez.email` when querying dbSNP.
email = "EMAILADDRESS"

## Load RBC-GEM model

In [3]:
# Locate the model directory and read the model file named after GEM_NAME.
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

Set parameter Username
Academic license - for non-commercial use only - expires 2025-11-21


0,1
Name,RBC_GEM
Memory address,14e1a8f10
Number of metabolites,2157
Number of reactions,3275
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


In [4]:
# Collect the "dbsnp" annotation of every model gene into one table, with the
# identifier column renamed from "id" to the annotation type ("genes").
annotation_type = "genes"
df_model_mappings = get_annotation_df(
    getattr(model, annotation_type), ["dbsnp"]
).rename({"id": annotation_type}, axis=1)

# Report the number of distinct values per column after splitting each cell
# into its individual entries.
for col in df_model_mappings.columns:
    # `Series.explode` takes no column argument (that belongs to
    # `DataFrame.explode`); `nunique` already ignores duplicates and missing
    # values, so no separate de-duplication step is needed.
    exploded_values = df_model_mappings[col].apply(split_string).explode()
    print(f"{exploded_values.name}: {exploded_values.nunique()}")

df_model_mappings

genes: 820
dbsnp: 5070


Unnamed: 0,genes,dbsnp
0,RPE,
1,RPIA,rs121918591
2,SORD,rs145813597;rs1042079;rs930337;rs149975952
3,AKR7A2,rs6670759;rs1043657;rs859208;rs859210;rs223120...
4,SRM,rs1049932
...,...,...
815,USP5,
816,VCPIP1,
817,VPS4B,rs17688948
818,WDR77,rs7416672


### Get IDs for query from model

In [5]:
# Build a long-format table with one row per (gene, SNP) pair: split each
# "dbsnp" cell into its entries, expand them into rows, then discard rows with
# missing values and repeated pairs.
df_model_snps = (
    df_model_mappings.loc[:, ["genes", "dbsnp"]]
    .assign(dbsnp=lambda frame: frame["dbsnp"].apply(split_string))
    .explode("dbsnp")
    .dropna()
    .drop_duplicates()
)
# Distinct SNP identifiers to query.
rs_ids = df_model_snps["dbsnp"].unique()
df_model_snps

Unnamed: 0,genes,dbsnp
1,RPIA,rs121918591
2,SORD,rs145813597
2,SORD,rs1042079
2,SORD,rs930337
2,SORD,rs149975952
...,...,...
814,UFD1,rs17744624
817,VPS4B,rs17688948
818,WDR77,rs7416672
819,YES1,rs34580680


In [6]:
def fetch_results_dbSNP(email, rs_ids):
    """Fetch records from the Entrez "snp" database for the given identifiers.

    Parameters
    ----------
    email : str
        Contact address assigned to ``Entrez.email`` before the request is made.
    rs_ids
        Identifiers to query; forwarded unchanged as the ``id`` argument of
        ``Entrez.efetch``.

    Returns
    -------
    list
        The lines of the response, as returned by ``readlines()`` on the
        handle obtained from ``Entrez.efetch``.
    """
    Entrez.email = email
    search = Entrez.efetch(db="snp", retmode="text", id=rs_ids)
    try:
        return search.readlines()
    finally:
        # Always release the handle, even if reading the response fails.
        search.close()

### Get results from dbSNP as a DataFrame

In [7]:
# Query dbSNP in batches and collect the fields of each returned record,
# keyed by the queried identifier.
data = defaultdict(dict)
batch_size = 500
for batch_idx, batch in enumerate(range(0, len(rs_ids), batch_size)):
    query_ids = rs_ids[batch : batch + batch_size]
    print(
        f"Fetching results for batch {batch_idx + 1}  ({batch + len(query_ids)}/{len(rs_ids)})"
    )
    all_lines = fetch_results_dbSNP(email=email, rs_ids=query_ids)
    # Records are matched to the queried IDs purely by position, which assumes
    # one response line per ID, in query order. Flag any batch where the counts
    # differ, since the mapping cannot be trusted in that case.
    if len(all_lines) != len(query_ids):
        print(
            f"  Warning: received {len(all_lines)} lines for {len(query_ids)} IDs; "
            "results may be misaligned with the queried IDs"
        )
    # `batch` is the offset of this batch's first ID within `rs_ids`.
    for entry_idx, xml_str in enumerate(all_lines, start=batch):
        entry = ElementTree.fromstring(xml_str)
        data[rs_ids[entry_idx]].update(
            {e.tag: e.text for e in entry if has_value_type(e)}
        )

# Make into DataFrame
df_snps_all = pd.DataFrame.from_dict(data, orient="index")
df_snps_all.index.name = "dbsnp"
# The "error" column exists only if at least one record reported an error, so
# do not assume it is present.
if "error" in df_snps_all.columns:
    has_error = df_snps_all["error"].notna()
else:
    has_error = pd.Series(False, index=df_snps_all.index, dtype=bool)
failed_ids = list(df_snps_all.index[has_error.to_numpy()])
print(f"\nNumber of failed IDs: {len(failed_ids)}")
# Keep only the records that did not report an error.
df_snps_all = df_snps_all[~has_error]
df_snps_all

Fetching results for batch 1  (500/5070)
Fetching results for batch 2  (1000/5070)
Fetching results for batch 3  (1500/5070)
Fetching results for batch 4  (2000/5070)
Fetching results for batch 5  (2500/5070)
Fetching results for batch 6  (3000/5070)
Fetching results for batch 7  (3500/5070)
Fetching results for batch 8  (4000/5070)
Fetching results for batch 9  (4500/5070)
Fetching results for batch 10  (5000/5070)
Fetching results for batch 11  (5070/5070)

Number of failed IDs: 3


Unnamed: 0_level_0,SNP_ID,GLOBAL_SAMPLESIZE,CLINICAL_SIGNIFICANCE,ACC,CHR,HANDLE,SPDI,FXN_CLASS,VALIDATED,DOCSUM,...,SS,ALLELE,SNP_CLASS,CHRPOS,CHRPOS_PREV_ASSM,SNP_ID_SORT,CLINICAL_SORT,CHRPOS_SORT,MERGED_SORT,error
dbsnp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rs121918591,121918591,0,pathogenic,NC_000002.12,2,"PAGE_CC,TOPMED,GNOMAD,OMIM-CURATED-RECORDS,ILL...",NC_000002.12:88729278:C:T,"coding_sequence_variant,missense_variant","by-frequency,by-alfa,by-cluster","HGVS=NC_000002.12:g.88729279C>T,NC_000002.11:g...",...,"275518424,1958438289,2732810607,3021993357,365...",Y,snv,2:88729279,2:89028797,0121918591,1,0088729279,0,
rs145813597,145813597,0,"pathogenic-likely-pathogenic,uncertain-signifi...",NC_000015.10,15,"TOPMED,EVA_EXAC,SWEGEN,HUGCELL_USP,ILLUMINA,GN...","NC_000015.10:45065302:C:A,NC_000015.10:4506530...","missense_variant,coding_sequence_variant,non_c...","by-frequency,by-alfa,by-cluster","HGVS=NC_000015.10:g.45065303C>A,NC_000015.10:g...",...,"342402496,464140601,491493966,491700043,991737...",H,snv,15:45065303,15:45357501,0145813597,1,0045065303,0,
rs1042079,1042079,0,benign,NC_000015.10,15,"ILLUMINA,BUSHMAN,GNOMAD,GMI,KHV_HUMAN_GENOMES,...","NC_000015.10:45068981:A:G,NC_000015.10:4506898...","missense_variant,non_coding_transcript_variant...","by-frequency,by-alfa,by-cluster","HGVS=NC_000015.10:g.45068982A>G,NC_000015.10:g...",...,"1509008,4403349,76874733,95213358,156480828,15...",D,snv,15:45068982,15:45361180,0001042079,1,0045068982,0,
rs930337,930337,0,benign,NC_000015.10,15,"ILLUMINA,SYSTEMSBIOZJU,GENOMICARE,BUSHMAN,EVA_...","NC_000015.10:45072335:A:C,NC_000015.10:4507233...","missense_variant,non_coding_transcript_variant...","by-frequency,by-alfa,by-cluster","HGVS=NC_000015.10:g.45072336A>C,NC_000015.10:g...",...,"1368774,2558685,3214527,5252928,21245444,24802...",H,snv,15:45072336,15:45364534,0000930337,1,0045072336,0,
rs149975952,149975952,0,benign,NC_000015.10,15,"WEILL_CORNELL_DGM,EVA_EXAC,ILLUMINA,SGDP_PRJ,G...","NC_000015.10:45073419:G:A,NC_000015.10:4507341...","missense_variant,coding_sequence_variant,non_c...","by-frequency,by-alfa,by-cluster","HGVS=NC_000015.10:g.45073420G>A,NC_000015.10:g...",...,"342402507,489048604,491493972,1353273008,16918...",D,snv,15:45073420,15:45365618,0149975952,1,0045073420,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rs17744624,17744624,0,,NC_000022.11,22,"USC_VALOUEV,JJLAB,EXOME_CHIP,KRGDB,EVA_EXAC,GO...","NC_000022.11:19467906:G:A,NC_000022.11:1946790...","missense_variant,coding_sequence_variant","by-frequency,by-alfa,by-cluster","HGVS=NC_000022.11:g.19467907G>A,NC_000022.11:g...",...,"24701832,48413642,69277235,69376500,74818968,8...",N,snv,22:19467907,22:19455430,0017744624,0,0019467907,0,
rs17688948,17688948,0,,NC_000018.10,18,"EVA_UK10K_ALSPAC,EVA-GONL,EVA_DECODE,EVA_EXAC,...",NC_000018.10:63410411:G:C,"missense_variant,coding_sequence_variant,genic...","by-frequency,by-alfa,by-cluster","HGVS=NC_000018.10:g.63410412G>C,NC_000018.9:g....",...,"24492533,69217429,74820330,160453165,237511803...",S,snv,18:63410412,18:61077645,0017688948,0,0063410412,0,
rs7416672,7416672,0,,NC_000001.11,1,"TOPMED,SC_JCM,GNOMAD,ILLUMINA","NC_000001.11:111448776:C:A,NC_000001.11:111448...","missense_variant,2KB_upstream_variant,coding_s...","by-frequency,by-alfa,by-cluster","HGVS=NC_000001.11:g.111448777C>A,NC_000001.11:...",...,"11204945,160878283,482563863,2746395432,275957...",H,snv,1:111448777,1:111991399,0007416672,0,0111448777,0,
rs34580680,34580680,0,,NC_000018.10,18,"EVA_EXAC,KOGIC,GRF,PERLEGEN,GNOMAD,1000G_HIGH_...",NC_000018.10:745839:T:C,"coding_sequence_variant,missense_variant","by-frequency,by-alfa,by-cluster","HGVS=NC_000018.10:g.745840T>C,NC_000018.9:g.74...",...,"48533661,69204557,160622497,339897863,48170461...",Y,snv,18:745840,18:745840,0034580680,0,0000745840,0,


### Parse and reformat results

In [8]:
# Rebuild the (gene, SNP) table from the model annotations so that this cell
# starts from the same input every time it is run.
df_model_snps = (
    df_model_mappings.loc[:, ["genes", "dbsnp"]]
    .assign(dbsnp=lambda frame: frame["dbsnp"].apply(split_string))
    .explode("dbsnp")
    .dropna()
    .drop_duplicates()
)
rs_ids = df_model_snps["dbsnp"].unique()

# Lowercase the dbSNP field names, leaving out the fields that are not kept.
excluded_fields = {"error", "GLOBAL_SAMPLESIZE"}
rename_mapping = {
    key: key.lower() for key in df_snps_all.columns if key not in excluded_fields
}

# Attach the dbSNP fields to each (gene, SNP) pair; pairs without a fetched
# record are dropped by the inner join.
df_snps_renamed = df_snps_all.loc[:, list(rename_mapping)].rename(
    columns=rename_mapping
)
df_model_snps = df_model_snps.merge(df_snps_renamed, on="dbsnp", how="inner")

for col in df_model_snps.columns:
    # Split on commas, then switch to the default separator (currently semicolon)
    df_model_snps[col] = df_model_snps[col].str.split(",").apply(build_string)

df_model_snps = df_model_snps.sort_values(["snp_id_sort"]).reset_index(drop=True)


def _spdi_sequence_changes(value):
    """Return, for each SPDI entry in ``value``, the text after its second colon."""
    return [spdi.split(":", 2)[-1] for spdi in split_string(value)]


# Rows without an SPDI value get no entry and are left missing in the new column.
df_model_snps["sequence_change"] = (
    df_model_snps["spdi"].dropna().apply(_spdi_sequence_changes)
)

if overwrite:
    output_filepath = get_dirpath("database") / f"snps_{GEM_NAME}.tsv"
    df_model_snps.to_csv(output_filepath, sep="\t")
df_model_snps

Unnamed: 0,genes,dbsnp,snp_id,clinical_significance,acc,chr,handle,spdi,fxn_class,validated,...,ss,allele,snp_class,chrpos,chrpos_prev_assm,snp_id_sort,clinical_sort,chrpos_sort,merged_sort,sequence_change
0,HBB,rs334,334,not-provided;protective;likely-benign;pathogen...,NC_000011.10,11,ILLUMINA;PERLEGEN;CLINSEQ_SNP;1000G_HIGH_COVER...,NC_000011.10:5227001:T:A;NC_000011.10:5227001:...,coding_sequence_variant;missense_variant,by-frequency;by-alfa;by-cluster,...,335;24811263;48419811;49850538;49850540;498505...,N,snv,11:5227002,11:5248232,0000000334,1,0005227002,0,"[T:A, T:C, T:G]"
1,AGT,rs699,699,risk-factor;benign,NC_000001.11,1,LEE;CGAP-GAI;EVA-GONL;WEILL_CORNELL_DGM;KRGDB;...,NC_000001.11:230710047:A:C;NC_000001.11:230710...,coding_sequence_variant;missense_variant,by-frequency;by-alfa;by-cluster,...,704;5428;6484;1510443;3240877;3474289;4404260;...,N,snv,1:230710048,1:230845794,0000000699,1,0230710048,0,"[A:C, A:G, A:T]"
2,GSTP1,rs1695,1695,benign;not-provided,NC_000011.10,11,SSMP;YEGNASUBRAMANIAN_LAB;PERLEGEN;CLINSEQ_SNP...,NC_000011.10:67585217:A:G;NC_000011.10:6758521...,coding_sequence_variant;missense_variant,by-frequency;by-alfa;by-cluster,...,1720;5330;9110;1390210;1545478;3178003;5586573...,D,snv,11:67585218,11:67352689,0000001695,1,0067585218,0,"[A:G, A:T]"
3,MPO,rs2759,2759,,NC_000017.11,17,1000GENOMES;UAEU_GENOMICS_LAB;EVA_MGP;EVA_DECO...,NC_000017.11:58270744:T:C,coding_sequence_variant;missense_variant,by-frequency;by-alfa;by-cluster,...,2789;16338615;23791841;38350281;69199282;74862...,Y,snv,17:58270745,17:56348106,0000002759,0,0058270745,0,[T:C]
4,FKBP2,rs4672,4672,,NC_000011.10,11,BIOINF_KMB_FNS_UNIBA;ILLUMINA;JJLAB;EVA_DECODE...,NC_000011.10:64242406:G:A;NC_000011.10:6424240...,intron_variant;coding_sequence_variant;3_prime...,by-frequency;by-alfa;by-cluster,...,1555019;4436250;20825221;24790172;39886556;752...,D,snv,11:64242407,11:64009879,0000004672,0,0064242407,0,"[G:A, G:T]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5062,CLCN3,rs2150238934,2150238934,,,,,,,,...,,,,,,2150238934,0,99999999999,0,
5063,CLCN3,rs2150254146,2150254146,,,,,,,,...,,,,,,2150254146,0,99999999999,0,
5064,CLCN3,rs2150267036,2150267036,,,,,,,,...,,,,,,2150267036,0,99999999999,0,
5065,CLCN3,rs2150274994,2150274994,,,,,,,,...,,,,,,2150274994,0,99999999999,0,
