Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: expose similarity function of hpo crate (#7) #9

Merged
merged 6 commits into from
Jun 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ serde_json = "1.0"
serde_with = "3.0"
sha2 = "0.10"
shellexpand = "3.0"
strum = "0.24"
strum = { version = "0.24", features = ["strum_macros", "derive"] }
strum_macros = "0.24"
temp_testdir = "0.2"
tempdir = "0.3"
Expand Down
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@ The number of simulations should be high for production (the default is 100k) bu
--num-simulations 10 \
--seed 42 \
--path-hpo-dir /tmp/data/hpo/hpo \
--path-out-rocksdb /tmp/data/hpo/hpo/resnik
--path-out-rocksdb /tmp/data/hpo/hpo/scores-fun-sim-avg-resnik-gene \
--combiner fun-sim-avg \
--similarity resnik \
--ic-base gene
```

## Running the Server
Expand All @@ -68,7 +71,7 @@ INFO try: http://127.0.0.1:8080/hpo/genes?gene_symbol=TGDS
INFO try: http://127.0.0.1:8080/hpo/genes?gene_id=23483&hpo_terms=true
INFO try: http://127.0.0.1:8080/hpo/omims?omim_id=616145&hpo_terms=true
INFO try: http://127.0.0.1:8080/hpo/terms?term_id=HP:0000023&genes=true
INFO try: http://127.0.0.1:8080/hpo/sim/term-term?lhs=HP:0001166,HP:0040069&rhs=HP:0005918,HP:0004188&sim=resnik::gene
INFO try: http://127.0.0.1:8080/hpo/sim/term-term?lhs=HP:0001166,HP:0040069&rhs=HP:0005918,HP:0004188
INFO try: http://127.0.0.1:8080/hpo/sim/term-gene?terms=HP:0001166,HP:0000098&gene_symbols=FBN1,TGDS,TTN
INFO starting 4 workers
INFO Actix runtime found; starting in Actix runtime
Expand All @@ -92,7 +95,7 @@ Note that we truncate the output JSON.
# curl 'http://127.0.0.1:8080/hpo/terms?term_id=HP:0000023&genes=true'
[{"term_id":"HP:0000023","name":"Inguinal hernia","genes":[{"gen...

# curl 'http://127.0.0.1:8080/hpo/sim/term-term?lhs=HP:0001166,HP:0040069&rhs=HP:0005918,HP:0004188&sim=resnik::gene'
# curl 'http://127.0.0.1:8080/hpo/sim/term-term?lhs=HP:0001166,HP:0040069&rhs=HP:0005918,HP:0004188'
[{"lhs":"HP:0001166","rhs":"HP:0005918","score":1.4280319,"sim":...
```

Expand Down
199 changes: 198 additions & 1 deletion src/common.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
//! Functionality shared between all commands.
//! Functionality shared across the crate.

use std::str::FromStr;

use clap::Parser;
use clap_verbosity_flag::{InfoLevel, Verbosity};
use hpo::{
similarity::{Builtins, StandardCombiner},
term::InformationContentKind,
};
use strum::{EnumIter, IntoEnumIterator};

/// Shared command line arguments.
#[derive(Parser, Debug)]
Expand Down Expand Up @@ -67,3 +74,193 @@ pub fn load_hpo<P: AsRef<std::path::Path>>(path: P) -> Result<hpo::Ontology, any
))?)
}
}

/// Enum for representing the information content kind.
///
/// We replicate what is in the `hpo` crate so we can put them on the command line and use
/// them in HTTP queries more easily.
///
/// The `Display` strings (`gene`, `omim`) coincide with the kebab-case names that serde
/// uses for the variants, so the textual and the serialized representation agree.
#[derive(
Default,
Debug,
Clone,
Copy,
EnumIter,
PartialEq,
Eq,
PartialOrd,
Ord,
derive_more::Display,
serde::Serialize,
serde::Deserialize,
)]
#[serde(rename_all = "kebab-case")]
pub enum IcBasedOn {
/// Compute information content based on gene (this is the default).
#[default]
#[display(fmt = "gene")]
Gene,
/// Compute information content based on OMIM disease.
#[display(fmt = "omim")]
Omim,
}

impl FromStr for IcBasedOn {
    type Err = anyhow::Error;

    /// Parse an [`IcBasedOn`] from its `Display` representation (`gene` or `omim`).
    ///
    /// The comparison is exact, i.e., case-sensitive and without trimming.
    ///
    /// # Errors
    ///
    /// Returns an error if `s` does not match the display string of any variant.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        IcBasedOn::iter()
            .find(|candidate| candidate.to_string() == s)
            // Build the error lazily so nothing is formatted or allocated on success.
            .ok_or_else(|| anyhow::anyhow!("unknown information content base: {}", s))
    }
}

/// Enum for representing similarity method to use.
///
/// We replicate what is in the `hpo` crate so we can put them on the command line and use
/// them in HTTP queries more easily.
///
/// NOTE(review): `DistanceGene` is displayed (and hence parsed via `FromStr`) as `distance`,
/// whereas the serde `kebab-case` renaming yields `distance-gene` for it. All other variants
/// have identical display and serde names. Confirm whether this difference is intended.
#[derive(
Default,
Debug,
Clone,
Copy,
EnumIter,
PartialEq,
Eq,
PartialOrd,
Ord,
derive_more::Display,
serde::Serialize,
serde::Deserialize,
)]
#[serde(rename_all = "kebab-case")]
pub enum SimilarityMethod {
/// "Distance" similarity.
#[display(fmt = "distance")]
DistanceGene,
/// Graph IC similarity.
#[display(fmt = "graph-ic")]
GraphIc,
/// Information coefficient similarity.
#[display(fmt = "information-coefficient")]
InformationCoefficient,
/// Jiang & Conrath similarity.
#[display(fmt = "jc")]
Jc,
/// Lin similarity.
#[display(fmt = "lin")]
Lin,
/// "Mutation" similarity.
#[display(fmt = "mutation")]
Mutation,
/// "Relevance" similarity.
#[display(fmt = "relevance")]
Relevance,
/// Resnik similarity (this is the default).
#[default]
#[display(fmt = "resnik")]
Resnik,
}

/// Convert to pairwise similarity.
pub fn to_pairwise_sim(sim: SimilarityMethod, ic_based_on: IcBasedOn) -> Builtins {
let kind = match ic_based_on {
IcBasedOn::Gene => InformationContentKind::Gene,
IcBasedOn::Omim => InformationContentKind::Omim,
};
match sim {
SimilarityMethod::DistanceGene => Builtins::Distance(kind),
SimilarityMethod::GraphIc => Builtins::GraphIc(kind),
SimilarityMethod::InformationCoefficient => Builtins::InformationCoefficient(kind),
SimilarityMethod::Jc => Builtins::Jc(kind),
SimilarityMethod::Lin => Builtins::Lin(kind),
SimilarityMethod::Mutation => Builtins::Mutation(kind),
SimilarityMethod::Relevance => Builtins::Relevance(kind),
SimilarityMethod::Resnik => Builtins::Resnik(kind),
}
}

impl FromStr for SimilarityMethod {
    type Err = anyhow::Error;

    /// Parse a [`SimilarityMethod`] from its `Display` representation, e.g., `resnik`.
    ///
    /// The comparison is exact, i.e., case-sensitive and without trimming.
    ///
    /// # Errors
    ///
    /// Returns an error if `s` does not match the display string of any variant.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        SimilarityMethod::iter()
            .find(|candidate| candidate.to_string() == s)
            // Build the error lazily so nothing is formatted or allocated on success.
            .ok_or_else(|| anyhow::anyhow!("unknown similarity method: {}", s))
    }
}

/// Representation of the standard combiners from HPO.
///
/// We replicate what is in the `hpo` crate so we can put them on the command line and use
/// them in HTTP queries more easily.
///
/// The `Display` strings coincide with the kebab-case names that serde uses for the variants.
#[derive(
Default,
Debug,
Clone,
Copy,
EnumIter,
PartialEq,
Eq,
PartialOrd,
Ord,
derive_more::Display,
serde::Serialize,
serde::Deserialize,
)]
#[serde(rename_all = "kebab-case")]
pub enum ScoreCombiner {
/// funSimAvg algorithm (this is the default).
#[default]
#[display(fmt = "fun-sim-avg")]
FunSimAvg,
/// funSimMax algorithm.
#[display(fmt = "fun-sim-max")]
FunSimMax,
/// BMA algorithm.
#[display(fmt = "bma")]
Bma,
}

impl From<ScoreCombiner> for StandardCombiner {
    /// Map our command line / HTTP facing combiner onto the `hpo` crate's combiner.
    fn from(val: ScoreCombiner) -> Self {
        match val {
            ScoreCombiner::FunSimMax => Self::FunSimMax,
            ScoreCombiner::FunSimAvg => Self::FunSimAvg,
            // Our `Bma` variant maps onto the variant that is named `Bwa` on the `hpo` side.
            ScoreCombiner::Bma => Self::Bwa,
        }
    }
}

impl FromStr for ScoreCombiner {
    type Err = anyhow::Error;

    /// Parse a [`ScoreCombiner`] from its `Display` representation, e.g., `fun-sim-avg`.
    ///
    /// The comparison is exact, i.e., case-sensitive and without trimming.
    ///
    /// # Errors
    ///
    /// Returns an error if `s` does not match the display string of any variant.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        ScoreCombiner::iter()
            .find(|candidate| candidate.to_string() == s)
            // Build the error lazily so nothing is formatted or allocated on success.
            .ok_or_else(|| anyhow::anyhow!("unknown score combiner: {}", s))
    }
}

/// The version of the `viguno` package.
///
/// The value is taken from the `CARGO_PKG_VERSION` environment variable at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// Version information that is returned by the HTTP server.
///
/// Bundles the version of the HPO data with the version of the `viguno` package.
#[derive(serde::Serialize, serde::Deserialize, Default, Debug, Clone)]
pub struct Version {
/// Version of the HPO.
pub hpo: String,
/// Version of the `viguno` package.
pub viguno: String,
}

impl Version {
    /// Create version information for the given HPO version string.
    ///
    /// The `viguno` version is filled in automatically from [`VERSION`].
    pub fn new(hpo: &str) -> Self {
        let viguno = String::from(VERSION);
        let hpo = hpo.to_owned();
        Self { hpo, viguno }
    }
}
62 changes: 44 additions & 18 deletions src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ use hpo::{annotations::AnnotationId, term::HpoGroup, HpoTermId, Ontology};
use crate::algos::phenomizer;
use crate::pbs::simulation::SimulationResults;
use crate::query::query_result::TermDetails;
use crate::server::actix_server::hpo_sim::term_gene::SimilarityMethod;
use crate::simulate::VERSION;

/// Command line arguments for `query` command.
#[derive(Parser, Debug)]
Expand Down Expand Up @@ -52,17 +50,33 @@ pub struct HpoTerm {
pub mod query_result {
use super::HpoTerm;

/// Struct for storing gene information in the result.
///
/// The derived ordering compares `entrez_id` first and `gene_symbol` second.
#[derive(
serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord, Debug, Clone,
)]
pub struct Gene {
/// The NCBI gene ID.
pub entrez_id: u32,
/// The gene symbol.
pub gene_symbol: String,
}

/// The performed query, i.e., the HPO terms together with the genes that were scored.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
pub struct Query {
/// The query HPO terms.
pub terms: Vec<HpoTerm>,
/// The gene list to score.
pub genes: Vec<Gene>,
}

/// Result container data structure.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
pub struct Container {
/// Version of the HPO.
pub hpo_version: String,
/// Version of the `varfish-server-worker` package.
pub varfish_version: String,
/// The scoring method used.
pub score_method: String,
/// Version information.
pub version: crate::common::Version,
/// The original query records.
pub query: Vec<HpoTerm>,
pub query: Query,
/// The resulting records for the scored genes.
pub result: Vec<Record>,
}
Expand Down Expand Up @@ -116,22 +130,20 @@ pub mod query_result {
/// In the case that a term or database lookup fails.
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::cast_precision_loss)]
#[allow(clippy::too_many_lines)]
pub fn run_query(
patient: &HpoGroup,
genes: &Vec<&hpo::annotations::Gene>,
hpo: &Ontology,
db: &DBWithThreadMode<MultiThreaded>,
) -> Result<query_result::Container, anyhow::Error> {
let cf_resnik = db
.cf_handle("resnik_pvalues")
.expect("database is missing resnik_pvalues column family");
.cf_handle("scores")
.expect("database is missing 'scores' column family");

let num_terms = std::cmp::min(10, patient.len());
let mut result = query_result::Container {
hpo_version: hpo.hpo_version(),
varfish_version: VERSION.to_string(),
score_method: SimilarityMethod::Phenomizer.to_string(),
query: patient
let query = query_result::Query {
terms: patient
.iter()
.map(|t| {
let term = hpo.hpo(t).expect("could not resolve HPO term");
Expand All @@ -141,6 +153,11 @@ pub fn run_query(
}
})
.collect(),
genes: Vec::new(),
};
let mut result = query_result::Container {
version: crate::common::Version::new(&hpo.hpo_version()),
query,
result: Vec::new(),
};
for gene in genes {
Expand Down Expand Up @@ -214,6 +231,11 @@ pub fn run_query(
.collect::<Vec<_>>();
terms.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());

result.query.genes.push(query_result::Gene {
entrez_id: ncbi_gene_id,
gene_symbol: gene.name().to_string(),
});

result.result.push(query_result::Record {
gene_symbol: gene.name().to_string(),
// NB: we accept value truncation here ...
Expand All @@ -224,6 +246,10 @@ pub fn run_query(
});
}

// Sort genes for reproducibility.
result.query.genes.sort();

// Sort output records by score for reproducibility.
result
.result
.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
Expand Down Expand Up @@ -261,11 +287,11 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:

tracing::info!("Opening RocksDB for reading...");
let before_rocksdb = Instant::now();
let path_rocksdb = format!("{}/resnik", args.path_hpo_dir);
let path_rocksdb = format!("{}/scores-fun-sim-avg-resnik-gene", args.path_hpo_dir);
let db = rocksdb::DB::open_cf_for_read_only(
&rocksdb::Options::default(),
&path_rocksdb,
["meta", "resnik_pvalues"],
["meta", "scores"],
true,
)?;
tracing::info!("...done opening RocksDB in {:?}", before_rocksdb.elapsed());
Expand Down
4 changes: 2 additions & 2 deletions src/server/actix_server/hpo_genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use super::{CustomError, Match, ResultHpoTerm};
///
/// - `match` -- how to match
#[derive(serde::Deserialize, Debug, Clone)]
struct Request {
struct Query {
/// The gene ID to search for.
pub gene_id: Option<String>,
/// The gene symbol to search for.
Expand Down Expand Up @@ -97,7 +97,7 @@ impl ResultEntry {
async fn handle(
data: Data<WebServerData>,
_path: Path<()>,
query: web::Query<Request>,
query: web::Query<Query>,
) -> actix_web::Result<impl Responder, CustomError> {
let ontology = &data.ontology;
let match_ = query.match_.unwrap_or_default();
Expand Down
Loading