In [None]:
import numpy
import random
from itertools import product
import dagsim.base as ds

In [None]:

# Path of the text file that _get_olga_seq reads line by line (one sequence per line).
# NOTE(review): the name suggests OLGA-generated sequences -- confirm the file's origin.
BASE_SEQ_FN = "olga.txt"
# Starting value of the clone-size budget that create_airr spends down to zero.
AIRR_SIZE = 10


def _get_olga_seq(protocol):
    for line in open(BASE_SEQ_FN):
        seq = line.strip()
        if len(seq) < 10:
            continue
        if protocol == 1 and not seq.startswith("ASS"):
            continue
        yield seq


def assign_protocol(disease):
    """Draw the protocol flag for one individual.

    A single Bernoulli trial is drawn with success probability
    ``0.1 + 0.8 * disease`` (0.1 when ``disease`` is 0, 0.9 when it is 1).

    Args:
        disease: Disease indicator used to shift the success probability.

    Returns:
        The outcome of ``numpy.random.binomial`` with one trial (0 or 1).
    """
    prob_protocol_one = 0.1 + 0.8 * disease
    return numpy.random.binomial(n=1, p=prob_protocol_one)


def create_airr(disease, age, protocol):
    """Build one repertoire as a list of ``(sequence, clone_size)`` pairs.

    Sequences are taken in order from ``_get_olga_seq(protocol)``. Each one is
    given a clone size from ``_get_clono_size(age, left)``, where ``left`` is
    the part of the ``AIRR_SIZE`` budget not yet used; sequences are added
    until that budget is exactly zero.

    Args:
        disease: Disease indicator. When it equals 1, the characters at
            indices 5-7 of every sequence are replaced by the 3-letter string
            returned by ``_get_signal()``.
        age: Passed through to ``_get_clono_size``.
        protocol: Passed through to ``_get_olga_seq``.

    Returns:
        list[tuple]: ``(sequence, clone_size)`` tuples whose clone sizes add
        up to ``AIRR_SIZE``.

    Raises:
        AssertionError: If the sequence source runs out before the budget is
            used up.
    """
    airr = []
    left = AIRR_SIZE
    sequences = _get_olga_seq(protocol)
    try:
        for seq in sequences:
            if left == 0:
                break
            if disease == 1:
                # Overwrite positions 5..7 with the implanted signal.
                seq = seq[:5] + _get_signal() + seq[8:]
            clono_size = _get_clono_size(age, left)
            left -= clono_size
            airr.append((seq, clono_size))
    finally:
        # We normally stop before the source is exhausted; close the generator
        # explicitly so its resources are released right away rather than at
        # some later garbage-collection point.
        sequences.close()
    # Raised explicitly (instead of an `assert` statement) so the check is
    # still performed when Python runs with -O.
    if left != 0:
        raise AssertionError(
            "ran out of sequences with %r of the repertoire budget unassigned" % (left,)
        )
    return airr


def _get_clono_size(age, max_left):
    return min(numpy.random.lognormal((120 - age) / 20, 1.5), max_left)


def _get_signal():
    return random.choice(["CAT", "CAR", "CAS", "DOG"])


def encode_kmers(airr):
    """Encode the first sequence of a repertoire as a 3-mer count vector.

    Only ``airr[0][0]`` (the sequence of the first entry) is encoded; the
    remaining entries and all clone sizes are not used.

    Args:
        airr: Sequence of ``(sequence, clone_size)`` pairs, as returned by
            ``create_airr``.

    Returns:
        list[int]: One count per 3-mer over the alphabet ``"ACSTRDOG"``
        (8**3 = 512 entries), ordered by the sorted 3-mer strings. 3-mers
        containing any other letter are not counted. An empty ``airr`` gives
        the all-zero vector.
    """
    # Reduced alphabet: exactly the letters that occur in the signal strings
    # CAT / CAR / CAS / DOG. A 26-letter alternative,
    # "ARNDCQEGHILKMFPOSUTWYVBZXJ", was left commented out in the original.
    alphabet = "ACSTRDOG"
    k = 3
    kmers = sorted(''.join(letters) for letters in product(alphabet, repeat=k))
    counts = dict.fromkeys(kmers, 0)
    if len(airr) > 0:
        seq = airr[0][0]
        for i in range(len(seq) - k + 1):
            sub = seq[i:i + k]
            # Sequences may contain letters outside the reduced alphabet;
            # skip those windows instead of failing with a KeyError.
            if sub in counts:
                counts[sub] += 1
    return [counts[kmer] for kmer in kmers]

## Define the simulation using Python code

In [None]:
# Root node: one draw from numpy.random.binomial with n=1 and p=0.5.
disease = ds.Generic(name="disease", function=numpy.random.binomial, kwargs={"n": 1, "p": 0.5})
# Root node: a uniform draw between l=10 and h=80, truncated to an int.
age = ds.Generic(name="age", function=lambda l, h: int(numpy.random.uniform(l, h)),
                 kwargs={"l": 10, "h": 80})
# The remaining nodes receive other node objects as kwarg values.
# NOTE(review): presumably dagsim treats such nodes as parents and passes their
# sampled values to the function -- confirm against the dagsim docs.
protocol = ds.Generic(name="protocol", function=assign_protocol, kwargs={"disease": disease})
# Repertoire built by create_airr from the disease, age and protocol nodes.
airr = ds.Generic(name="airr", function=create_airr, kwargs={"disease": disease, "age": age, "protocol": protocol},
                  observed=True)
# k-mer count vector computed by encode_kmers from the airr node.
kmer_vec = ds.Generic(name="kmer_vec", function=encode_kmers, kwargs={"airr": airr})

In [None]:
# Collect the five nodes into one graph object, then draw it.
graph = ds.Graph(name="graph", list_nodes=[disease, age, protocol, airr, kmer_vec])
graph.draw()

In [None]:
# Run the simulation for 50 samples and keep the result in `data`.
# NOTE(review): csv_name is presumably the base name of a CSV file that dagsim
# writes to disk -- confirm against the dagsim docs.
data = graph.simulate(num_samples=50, csv_name="BioseqExample_v1")

## Define the simulation using YAML

In [None]:
from dagsim.utils.parser import Parser

# Load the simulation definition from a YAML file instead of building it in Python.
parser = Parser("BioseqExample_v1.yaml")

# NOTE(review): this rebinds `data`, replacing the result of the earlier
# graph.simulate(...) cell.
data = parser.parse(verbose=False, draw=False)