In [35]:
import numpy.random
import random
from itertools import product
from dagsim.base import Graph, Node

In [36]:

# Path of the text file that _get_olga_seq reads, one candidate sequence per line.
BASE_SEQ_FN = "olga.txt"
# Total clone-size budget that create_airr hands out across the clones of one AIRR it builds.
AIRR_SIZE = 10


def _get_olga_seq(protocol):
    for line in open(BASE_SEQ_FN):
        seq = line.strip()
        if len(seq) < 10:
            continue
        if protocol == 1 and not seq.startswith("ASS"):
            continue
        yield seq


def assign_protocol(disease):
    """Draw a 0/1 protocol label whose probability depends on ``disease``.

    A single Bernoulli trial is drawn with success probability
    ``0.1 + 0.8 * disease``.
    """
    success_prob = 0.1 + 0.8 * disease
    return numpy.random.binomial(1, success_prob)


def create_airr(disease, age, protocol):
    """Build one AIRR as a list of ``(sequence, clone_size)`` tuples.

    Sequences are taken in order from ``_get_olga_seq(protocol)``. Each one is
    given a clone size from ``_get_clono_size(age, remaining)``, which never
    exceeds the remaining budget, until the budget of ``AIRR_SIZE`` is spent.
    When ``disease == 1`` the characters at positions 5-7 of every sequence
    are replaced by a motif from ``_get_signal()``.

    Raises:
        AssertionError: If the sequences run out before the budget reaches 0.
    """
    remaining = AIRR_SIZE
    clones = []
    for base_seq in _get_olga_seq(protocol):
        # Budget fully spent: stop pulling further sequences.
        if remaining == 0:
            break
        if disease == 1:
            # Splice the signal motif in place of the slice [5:8].
            clone_seq = base_seq[:5] + _get_signal() + base_seq[8:]
        else:
            clone_seq = base_seq
        size = _get_clono_size(age, remaining)
        remaining = remaining - size
        clones.append((clone_seq, size))
    assert remaining == 0
    return clones


def _get_clono_size(age, max_left):
    return min(numpy.random.lognormal((120 - age) / 20, 1.5), max_left)


def _get_signal():
    return random.choice(["CAT", "CAR", "CAS", "DOG"])


def encode_kmers(airr):
    """Encode the first sequence of an AIRR as a vector of 3-mer counts.

    The vector has one slot per 3-mer over the reduced alphabet "ACSTRDOG"
    (8**3 = 512 slots), ordered lexicographically. Overlapping 3-mers of the
    sequence are counted. A 3-mer containing any letter outside the alphabet
    has no slot and is skipped instead of raising ``KeyError``.

    Args:
        airr: Non-empty list of ``(sequence, clone_size)`` tuples.

    Returns:
        list[int]: Occurrence count for each 3-mer, in sorted k-mer order.

    Raises:
        IndexError: If ``airr`` is empty.
    """
    # NOTE(review): only the first clone's sequence is encoded; the other
    # sequences and all clone sizes are ignored. Confirm this is intended.
    seq = airr[0][0]
    # Reduced alphabet; a larger 26-letter alphabet
    # ("ARNDCQEGHILKMFPOSUTWYVBZXJ") was left commented out by the author.
    alphabet = "ACSTRDOG"
    k = 3
    kmers = sorted(''.join(letters) for letters in product(alphabet, repeat=k))
    counts = dict.fromkeys(kmers, 0)
    for i in range(len(seq) - k + 1):
        sub = seq[i:i + k]
        # Sequences may contain letters outside the reduced alphabet; such
        # k-mers have no slot in the vector, so they are ignored.
        if sub in counts:
            counts[sub] += 1
    return [counts[kmer] for kmer in kmers]

## Define the simulation using Python code

In [37]:
# Root node: disease status, one binomial draw with n=1 and p=0.5.
disease = Node(name="Disease", function=numpy.random.binomial, kwargs={"n": 1, "p": 0.5})
# Root node: age, drawn by numpy.random.randint with low=10 and high=80.
age = Node(name="Age", function=numpy.random.randint, kwargs={"low": 10, "high": 80})
# Protocol is computed by assign_protocol from the Disease node.
# NOTE(review): the recorded graph.simulate run failed with
# "unsupported operand type(s) for *: 'float' and 'Node'", which suggests the
# installed dagsim passed the Node object itself instead of its sampled value
# when parents are given through kwargs -- confirm against the dagsim version.
protocol = Node(name="Protocol", function=assign_protocol, kwargs={"disease": disease})
# AIRR is built by create_airr from Disease, Age and Protocol; flagged observed=True.
airr = Node(name="AIRR", function=create_airr, kwargs={"disease": disease, "age": age, "protocol": protocol},
                  observed=True)
# k-mer count vector computed by encode_kmers from the AIRR node.
kmer_vec = Node(name="kmerVec", function=encode_kmers, kwargs={"airr": airr})

In [41]:
# Assemble the five nodes defined above into one graph, then draw it.
graph = Graph(name="graph", list_nodes=[disease, age, protocol, airr, kmer_vec])
graph.draw()

KeyError: 'Node'

In [40]:
# Request 50 samples from the graph; csv_name presumably names the CSV file dagsim writes -- confirm.
data = graph.simulate(num_samples=50, csv_name="BioseqExample_v1")

Simulation started


TypeError: unsupported operand type(s) for *: 'float' and 'Node'

## Define the simulation using YAML

In [26]:
from dagsim.utils.parser import Parser

# Build the same simulation from a YAML description instead of Python code.
# NOTE(review): the recorded run failed with "No module named 'BioseqExample_v1.py'",
# which suggests the YAML names the Python module with a ".py" suffix -- confirm in the YAML file.
parser = Parser("BioseqExample_v1.yaml")

# Rebinds `data`, the same name used for the result of graph.simulate.
data = parser.parse(verbose=False, draw=False)

ModuleNotFoundError: No module named 'BioseqExample_v1.py'; 'BioseqExample_v1' is not a package