In [1]:
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt

import msprime
import tskit

import sys
sys.path.append("../")
from tsimpute import beagle_numba

In [2]:
N = 1_000
L = 1e6
rho = 1e-8
mu = 1e-8
ts = msprime.sim_mutations(
    msprime.sim_ancestry(
        N,
        sequence_length=L,
        discrete_genome=True,
        recombination_rate=rho,
        population_size=1e4,
    ),
    rate=mu,
)
ts

Tree Sequence,Unnamed: 1
Trees,2963
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,2000
Total Size,983.4 KiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,15077,471.2 KiB,
Individuals,1000,27.4 KiB,
Migrations,0,8 Bytes,
Mutations,3243,117.2 KiB,
Nodes,6173,168.8 KiB,
Populations,1,224 Bytes,✅
Provenances,2,1.8 KiB,
Sites,3238,79.1 KiB,

Provenance Timestamp,Software Name,Version,Command,Full record
"23 March, 2025 at 11:13:47 AM",msprime,1.3.3,sim_mutations,Details  dict  schema_version: 1.0.0  software:  dict  name: msprime version: 1.3.3  parameters:  dict  command: sim_mutations  tree_sequence:  dict  __constant__: __current_ts__  rate: 1e-08 model: None start_time: None end_time: None discrete_genome: None keep: None random_seed: 4088769200  environment:  dict  os:  dict  system: Darwin node: NDPH10047.lan release: 23.4.0 version: Darwin Kernel Version 23.4.0:<br/>Wed Feb 21 21:44:54 PST 2024; <br/>root:xnu-<br/>10063.101.15~2/RELEASE_ARM64_T<br/>6... machine: arm64  python:  dict  implementation: CPython version: 3.10.14  libraries:  dict  kastore:  dict  version: 2.1.1  tskit:  dict  version: 0.6.0  gsl:  dict  version: 2.8
"23 March, 2025 at 11:13:47 AM",msprime,1.3.3,sim_ancestry,Details  dict  schema_version: 1.0.0  software:  dict  name: msprime version: 1.3.3  parameters:  dict  command: sim_ancestry samples: 1000 demography: None sequence_length: 1000000.0 discrete_genome: True recombination_rate: 1e-08 gene_conversion_rate: None gene_conversion_tract_length: None population_size: 10000.0 ploidy: None model: None initial_state: None start_time: None end_time: None record_migrations: None record_full_arg: None additional_nodes: None coalescing_segments_only: None num_labels: None random_seed: 3897843085 replicate_index: 0  environment:  dict  os:  dict  system: Darwin node: NDPH10047.lan release: 23.4.0 version: Darwin Kernel Version 23.4.0:<br/>Wed Feb 21 21:44:54 PST 2024; <br/>root:xnu-<br/>10063.101.15~2/RELEASE_ARM64_T<br/>6... machine: arm64  python:  dict  implementation: CPython version: 3.10.14  libraries:  dict  kastore:  dict  version: 2.1.1  tskit:  dict  version: 0.6.0  gsl:  dict  version: 2.8


In [3]:
ne = 1e6
err_rate = 1e-4
cm = beagle_numba.convert_to_genetic_map_positions(ts.sites_position)
trans_probs = beagle_numba.get_transition_probs(cm, h=ts.num_samples, ne=ne)
mismatch_probs = beagle_numba.get_mismatch_probs(ts.num_sites, error_rate=err_rate)

In [4]:
ref_h = ts.genotype_matrix(alleles=tskit.ALLELES_ACGT)
ref_h.shape

(3238, 2000)

In [5]:
query_h = np.random.choice(a=np.arange(4, dtype=np.int32), size=ts.num_sites)

In [6]:
fwd_mat = beagle_numba.compute_forward_matrix_beaglelike(
    ref_h=ref_h,
    query_h=query_h,
    trans_probs=trans_probs,
    mismatch_probs=mismatch_probs,
    num_alleles=4,
)
fwd_mat.shape

(3238, 2000)

In [7]:
fwd_mat_refactored = beagle_numba.compute_forward_matrix(
    ref_h=ref_h,
    query_h=query_h,
    trans_probs=trans_probs,
    mismatch_probs=mismatch_probs,
    num_alleles=4,
)
fwd_mat_refactored.shape

(3238, 2000)

In [8]:
np.testing.assert_array_almost_equal(fwd_mat, fwd_mat_refactored)

In [9]:
bwd_mat = beagle_numba.compute_backward_matrix_beaglelike(
    ref_h=ref_h,
    query_h=query_h,
    trans_probs=trans_probs,
    mismatch_probs=mismatch_probs,
    num_alleles=4,
)
bwd_mat.shape

(3238, 2000)

In [10]:
bwd_mat_refactored = beagle_numba.compute_backward_matrix(
    ref_h=ref_h,
    query_h=query_h,
    trans_probs=trans_probs,
    mismatch_probs=mismatch_probs,
    num_alleles=4,
)
bwd_mat_refactored.shape

(3238, 2000)

In [11]:
state_mat = np.multiply(fwd_mat, bwd_mat)
state_mat_refactored = np.multiply(fwd_mat_refactored, bwd_mat_refactored)
np.testing.assert_array_almost_equal(state_mat, state_mat_refactored)