# A Wright-Fisher simulation implemented in C via Cython.

OMG!

We could use GSL via CythonGSL, but that would require releasing this notebook under the GPL, and it is licensed CC BY instead, so random numbers are drawn with NumPy.

In [1]:
# Enable the %%cython cell magic used by the simulation cell below.
%load_ext Cython

In [2]:
import msprime
import numpy as np

  from ._conv import register_converters as _register_converters


In [45]:
%%cython -a

import msprime
import numpy as np
cimport numpy as np
from cython.view cimport array as cvarray
from libc.stdlib cimport malloc, realloc, free
from libc.stdint cimport int32_t, uint32_t

cdef int32_t * malloc_int32_t(size_t n):
    """Allocate an uninitialised buffer with room for ``n`` int32_t values.

    Returns the pointer handed back by ``malloc`` (NULL when the allocation
    fails). The caller owns the buffer and must release it with ``free``.
    """
    cdef size_t nbytes = n * sizeof(int32_t)
    cdef void * raw = malloc(nbytes)
    return <int32_t *>raw

cdef int32_t * realloc_int32_t(void * x, size_t n):
    """Resize the buffer at ``x`` so that it can hold ``n`` int32_t values.

    Returns the pointer handed back by ``realloc``; NULL signals failure, in
    which case ``x`` itself is still the valid, unchanged allocation.
    """
    cdef size_t nbytes = n * sizeof(int32_t)
    return <int32_t *>realloc(x, nbytes)

cdef double * malloc_double(size_t n):
    """Allocate an uninitialised buffer with room for ``n`` doubles.

    Returns the pointer handed back by ``malloc`` (NULL when the allocation
    fails). The caller owns the buffer and must release it with ``free``.
    """
    cdef size_t nbytes = n * sizeof(double)
    cdef void * raw = malloc(nbytes)
    return <double *>raw

cdef double * realloc_double(double * x, size_t n):
    """Resize the buffer at ``x`` so that it can hold ``n`` doubles.

    Returns the pointer handed back by ``realloc``; NULL signals failure, in
    which case ``x`` itself is still the valid, unchanged allocation.
    """
    cdef size_t nbytes = n * sizeof(double)
    return <double *>realloc(x, nbytes)

cdef struct Mutations:
    # Growable parallel arrays: entry i describes the i-th recorded mutation.
    # Position value stored for each mutation.
    double * pos
    # Generation value stored for each mutation.
    int32_t * time
    # Index of the next free slot (equals the number of mutations stored).
    size_t next_mutation
    # Number of slots currently allocated in each array.
    size_t capacity
    
cdef int init_Mutations(Mutations * m):
    """Prepare ``m`` for use with an initial capacity of 10000 mutations.

    Returns 0 on success and -1 if either buffer could not be allocated.
    On failure the struct owns no memory and both pointers are NULL, so a
    later ``free_Mutations`` call on it is safe.
    """
    m.next_mutation = 0
    m.capacity = 10000
    m.pos = malloc_double(m.capacity)
    m.time = malloc_int32_t(m.capacity)
    if m.pos == NULL or m.time == NULL:
        # Release whichever half succeeded; free(NULL) is a no-op.
        free(m.pos)
        free(m.time)
        m.pos = NULL
        m.time = NULL
        return -1
    return 0

cdef int realloc_Mutations(Mutations * m):
    """Double the capacity of both mutation arrays.

    Returns 0 on success and -1 if a reallocation fails. On failure the
    struct is left usable: its pointers still reference valid buffers of at
    least ``m.capacity`` slots, so the stored data is intact and can still be
    released with ``free_Mutations``.
    """
    cdef size_t new_capacity = 2 * m.capacity
    cdef double * new_pos
    cdef int32_t * new_time

    # realloc leaves the old block untouched on failure, so only overwrite
    # the struct's pointer once the call is known to have succeeded.
    new_pos = realloc_double(m.pos, new_capacity)
    if new_pos == NULL:
        return -1
    m.pos = new_pos

    new_time = realloc_int32_t(m.time, new_capacity)
    if new_time == NULL:
        return -1
    m.time = new_time

    # Publish the larger capacity only after both arrays have grown.
    m.capacity = new_capacity
    return 0

cdef void free_Mutations(Mutations * m):
    """Release the buffers owned by ``m`` and reset its counters.

    The pointers are set to NULL afterwards so that calling this function
    again on the same struct is harmless. The struct must be re-initialised
    with ``init_Mutations`` before it is used for storage again: ``capacity``
    is reset to its initial value of 10000 even though no buffer is attached.
    """
    free(m.pos)
    free(m.time)
    m.pos = NULL
    m.time = NULL
    m.next_mutation = 0
    m.capacity = 10000
    
cdef int add_mutation(double pos,
                      int32_t generation,
                      Mutations * m):
    """Append one (pos, generation) record to ``m``, growing it if needed.

    Returns 0 on success, or the non-zero code from ``realloc_Mutations``
    when the arrays could not be enlarged (nothing is stored in that case).
    """
    cdef int status
    cdef size_t slot
    # Grow as soon as the slot after the next one would reach capacity.
    if m.next_mutation + 1 >= m.capacity:
        status = realloc_Mutations(m)
        if status != 0:
            return status
    slot = m.next_mutation
    m.pos[slot] = pos
    m.time[slot] = generation
    m.next_mutation = slot + 1
    return 0
    
cdef struct Nodes:
    # Growable array holding one time value per recorded node.
    double * time
    # Index of the next free slot (equals the number of nodes stored).
    size_t next_node
    # Number of slots currently allocated in ``time``.
    size_t capacity
    
cdef int init_Nodes(Nodes * n):
    """Prepare ``n`` for use with an initial capacity of 10000 nodes.

    Returns 0 on success and -1 if the time buffer could not be allocated
    (``n.time`` is NULL in that case).
    """
    n.capacity = 10000
    n.next_node = 0
    n.time = malloc_double(n.capacity)
    return -1 if n.time == NULL else 0

cdef int realloc_Nodes(Nodes * n):
    """Double the capacity of the node time array.

    Returns 0 on success and -1 if the reallocation fails. On failure the
    struct is unchanged: ``n.time`` still points at the original buffer and
    ``n.capacity`` keeps its old value, so nothing is leaked or lost.
    """
    cdef size_t new_capacity = 2 * n.capacity
    # realloc leaves the old block untouched on failure, so hold the result
    # in a temporary instead of clobbering the only pointer to the data.
    cdef double * grown = realloc_double(n.time, new_capacity)
    if grown == NULL:
        return -1
    n.time = grown
    n.capacity = new_capacity
    return 0
    
cdef void free_Nodes(Nodes * n):
    """Release the buffer owned by ``n`` and reset its counters.

    The pointer is set to NULL afterwards so that calling this function again
    on the same struct is harmless. The struct must be re-initialised with
    ``init_Nodes`` before it is used for storage again: ``capacity`` is reset
    to its initial value of 10000 even though no buffer is attached.
    """
    # free(NULL) is a no-op, so no explicit NULL guard is required.
    free(n.time)
    n.time = NULL
    n.next_node = 0
    n.capacity = 10000

cdef int add_node(double t, Nodes *n):
    """Append time ``t`` as a new node in ``n``, growing the array if full.

    Returns 0 on success, or the non-zero code from ``realloc_Nodes`` when
    the array could not be enlarged (nothing is stored in that case).
    """
    cdef int status
    cdef size_t slot = n.next_node
    if slot >= n.capacity:
        status = realloc_Nodes(n)
        if status != 0:
            return status
    n.time[slot] = t
    n.next_node = slot + 1
    return 0
    
cdef struct Edges:
    # Growable parallel arrays: entry i describes the i-th recorded edge.
    # Left and right coordinates stored for each edge.
    double *left
    double *right
    # Parent and child node identifiers stored for each edge.
    int32_t *parent
    int32_t *child
    # Index of the next free slot (equals the number of edges stored).
    size_t next_edge
    # Number of slots currently allocated in each array.
    size_t capacity
    
cdef int init_Edges(Edges * e):
    """Prepare ``e`` for use with an initial capacity of 10000 edges.

    Returns 0 on success and -1 if any of the four buffers could not be
    allocated. On failure the struct owns no memory and all four pointers
    are NULL, so a later ``free_Edges`` call on it is safe.
    """
    e.next_edge = 0
    e.capacity = 10000
    e.left = malloc_double(e.capacity)
    e.right = malloc_double(e.capacity)
    e.parent = malloc_int32_t(e.capacity)
    e.child = malloc_int32_t(e.capacity)
    if e.left == NULL or e.right == NULL or e.parent == NULL or e.child == NULL:
        # Release whichever allocations succeeded; free(NULL) is a no-op.
        free(e.left)
        free(e.right)
        free(e.parent)
        free(e.child)
        e.left = NULL
        e.right = NULL
        e.parent = NULL
        e.child = NULL
        return -1
    return 0
   
cdef int realloc_Edges(Edges * e):
    """Double the capacity of all four edge arrays.

    Returns 0 on success and -1 if a reallocation fails. On failure the
    struct is left usable: every pointer still references a valid buffer of
    at least ``e.capacity`` slots, so the stored data is intact and can still
    be released with ``free_Edges``.
    """
    cdef size_t new_capacity = 2 * e.capacity
    cdef double * new_left
    cdef double * new_right
    cdef int32_t * new_parent
    cdef int32_t * new_child

    # realloc leaves the old block untouched on failure, so each pointer in
    # the struct is only overwritten once its call is known to have succeeded.
    new_left = realloc_double(e.left, new_capacity)
    if new_left == NULL:
        return -1
    e.left = new_left

    new_right = realloc_double(e.right, new_capacity)
    if new_right == NULL:
        return -1
    e.right = new_right

    new_parent = realloc_int32_t(e.parent, new_capacity)
    if new_parent == NULL:
        return -1
    e.parent = new_parent

    new_child = realloc_int32_t(e.child, new_capacity)
    if new_child == NULL:
        return -1
    e.child = new_child

    # Publish the larger capacity only after all four arrays have grown.
    e.capacity = new_capacity
    return 0

cdef void free_Edges(Edges * e):
    """Release the buffers owned by ``e`` and reset its counters.

    The pointers are set to NULL afterwards so that calling this function
    again on the same struct is harmless. The struct must be re-initialised
    with ``init_Edges`` before it is used for storage again: ``capacity`` is
    reset to its initial value of 10000 even though no buffer is attached.
    """
    free(e.left)
    free(e.right)
    free(e.parent)
    free(e.child)
    e.left = NULL
    e.right = NULL
    e.parent = NULL
    e.child = NULL
    e.next_edge = 0
    e.capacity = 10000
    
cdef int add_edge(double left, double right,
                  int32_t parent, int32_t child,
                  Edges * edges):
    """Append one (left, right, parent, child) record to ``edges``.

    Returns 0 on success, or the non-zero code from ``realloc_Edges`` when
    the arrays could not be enlarged (nothing is stored in that case).
    """
    cdef int status
    cdef size_t slot
    # Grow as soon as the slot after the next one would reach capacity.
    if edges.next_edge + 1 >= edges.capacity:
        status = realloc_Edges(edges)
        if status != 0:
            return status

    slot = edges.next_edge
    edges.left[slot] = left
    edges.right[slot] = right
    edges.parent[slot] = parent
    edges.child[slot] = child
    edges.next_edge = slot + 1
    return 0

cdef void cleanup(Nodes * n, Edges * e, Mutations * m):
    """Release every temporary buffer held by the three recorder structs."""
    # The three containers own disjoint memory, so the order is immaterial.
    free_Mutations(m)
    free_Edges(e)
    free_Nodes(n)
    
cdef int infsites(double mu, int32_t generation,
                  Mutations * mutations,
                  dict lookup):
    """Record a Poisson(``mu``) number of new mutations at unused positions.

    Each position is drawn with ``np.random.random_sample`` and redrawn until
    it is not already a key of ``lookup``; accepted positions are stored in
    ``mutations`` together with ``generation`` and then added to ``lookup``.

    Returns 0 on success or the non-zero code from ``add_mutation``.
    """
    cdef unsigned n_new = np.random.poisson(mu)
    cdef unsigned k = 0
    cdef np.ndarray[double,ndim=1] draw
    cdef int status = 0
    for k in range(n_new):
        # Rejection-sample until the position has not been used before.
        while True:
            draw = np.random.random_sample(1)
            if draw[0] not in lookup:
                break
        status = add_mutation(draw[0], generation, mutations)
        if status != 0:
            return status
        lookup[draw[0]] = True
    return status

cdef int poisson_recombination(double r,
                                tuple parent_indexes,
                                int32_t next_offspring_id,
                                Edges * edges):
    """Record the edges that transmit a recombinant gamete to one offspring.

    A Poisson(``r``) number of distinct breakpoints is drawn with
    ``np.random.random_sample``. The interval [0, 1] is split at those
    breakpoints and consecutive segments are assigned alternately to the two
    ids in ``parent_indexes``, starting with ``parent_indexes[0]``. One edge
    per segment is appended to ``edges`` with ``next_offspring_id`` as child.

    Returns 0 on success or the non-zero code from ``add_edge``.
    """
    cdef unsigned nbreaks = np.random.poisson(r)
    cdef list breakpoints = []
    cdef unsigned drawn = 0
    cdef np.ndarray[double,ndim=1] x
    cdef list pgams
    cdef int rv = 0
    cdef double left,right
    cdef int32_t p

    if nbreaks == 0:
        # No crossover: the first parental genome is inherited whole.
        return add_edge(0.0,1.0,parent_indexes[0],
                        next_offspring_id,edges)

    # Draw distinct breakpoints, redrawing any value that was already seen.
    while drawn < nbreaks:
        x = np.random.random_sample(1)
        while x[0] in breakpoints:
            x = np.random.random_sample(1)
        breakpoints.append(x[0])
        drawn += 1
    breakpoints.sort()
    breakpoints.append(1.0)

    if breakpoints[0] == 0.0:
        # A breakpoint exactly at 0 means the first segment already belongs
        # to the other parental genome, so start the alternation from it.
        parent_indexes = (parent_indexes[1], parent_indexes[0])
    else:
        breakpoints.insert(0,0.0)

    # Alternating parent ids, at least one per segment; zip() below drops
    # any surplus entry.
    pgams = list(parent_indexes) * (len(breakpoints)//2)

    for left,right,p in zip(breakpoints[:-1],breakpoints[1:],pgams):
        rv = add_edge(left,right,p,
                      next_offspring_id,edges)
        if rv != 0:
            return rv
    return 0

def evolve(int N, int ngens, double theta, double rho, int gc):
    """Run a diploid Wright-Fisher simulation and return it as a tree sequence.

    Every generation, ``N`` offspring are produced. For each offspring two
    parents are drawn with ``np.random.randint``; each parent contributes one
    gamete built by ``poisson_recombination`` from its two genomes (taken in
    random order). Nodes and edges are accumulated in C buffers, copied into
    msprime tables at the end, sorted, and simplified with respect to the
    nodes whose time is 0.

    Parameters
    ----------
    N : int
        Number of diploid individuals (2*N genomes per generation); >= 1.
    ngens : int
        Number of generations to simulate; >= 1.
    theta : double
        Scaled mutation rate; ``theta/(4*N)`` is the Poisson mean number of
        mutations drawn per gamete.
    rho : double
        Scaled recombination rate; ``rho/(4*N)`` is the Poisson mean number
        of breakpoints drawn per gamete.
    gc : int
        Currently unused.

    Returns
    -------
    The object returned by ``msprime.load_tables(nodes=..., edges=...)``.

    Raises
    ------
    ZeroDivisionError
        If ``N`` is 0.
    ValueError
        If ``N`` is negative or ``ngens`` is less than 1.
    RuntimeError
        If a temporary buffer cannot be allocated or enlarged.

    Notes
    -----
    Mutations are simulated (they consume random numbers) but are not yet
    copied into the returned tables.
    """
    cdef double mu = theta/<double>(4*N)
    cdef double r = rho/<double>(4*N)

    # Cython does not allow cdef declarations inside try/for blocks, so all
    # C-level locals are declared up front.
    cdef int rv
    cdef size_t i, generation
    cdef Nodes temp_nodes
    cdef Edges temp_edges
    cdef Mutations temp_mutations
    cdef int32_t next_offspring_index, first_parental_index
    cdef np.ndarray[int32_t,ndim=1] parents
    cdef double mendel[2]
    cdef size_t parent1, parent2, pindex
    cdef int32_t[:] pview
    cdef int32_t p1g1, p1g2, p2g1, p2g2
    cdef tuple gametes
    cdef dict lookup = {}
    cdef double[:] timeview

    # Negative values would wrap around when cast to size_t below and turn
    # into enormous loop bounds; zero generations leaves nothing to export.
    if N < 1 or ngens < 1:
        raise ValueError("N and ngens must both be at least 1")

    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()
    # TODO: the simulated mutations are never written to this table.
    mutations = msprime.MutationTable()

    # Start from NULL pointers so that cleanup() is safe no matter how far
    # initialisation got before an error.
    temp_nodes.time = NULL
    temp_edges.left = NULL
    temp_edges.right = NULL
    temp_edges.parent = NULL
    temp_edges.child = NULL
    temp_mutations.pos = NULL
    temp_mutations.time = NULL

    try:
        if init_Nodes(&temp_nodes) != 0:
            raise RuntimeError("could not initialize temp_nodes")
        if init_Edges(&temp_edges) != 0:
            raise RuntimeError("could not initialize temp_edges")
        if init_Mutations(&temp_mutations) != 0:
            raise RuntimeError("could not initialize temp_mutations")

        # The founding generation: 2N genomes, created directly in the table.
        for i in range(2*<size_t>N):
            nodes.add_row(time=0.0,
                          flags=msprime.NODE_IS_SAMPLE)

        next_offspring_index = len(nodes)
        first_parental_index = 0

        for generation in range(1,<size_t>(ngens+1)):
            # Two parent draws per offspring, stored as consecutive pairs.
            parents = np.random.randint(0, N, 2*N, dtype=np.int32)
            pview = parents
            for pindex in range(0,2*N,2):
                parent1=pview[pindex]
                parent2=pview[pindex+1]
                # Node ids of the two genomes carried by each parent.
                p1g1 = first_parental_index + 2*parent1
                p1g2 = p1g1 + 1
                p2g1 = first_parental_index + 2*parent2
                p2g2 = p2g1 + 1

                # Mendelian segregation: randomise which genome comes first.
                mendel = np.random.random_sample(2)
                if mendel[0] < 0.5:
                    p1g1, p1g2 = p1g2, p1g1
                if mendel[1] < 0.5:
                    p2g1, p2g2 = p2g2, p2g1

                # One new offspring genome (node) from each parent.
                for gametes in ((p1g1,p1g2),(p2g1,p2g2)):
                    rv = poisson_recombination(r,gametes,
                                               next_offspring_index,
                                               &temp_edges)
                    if rv != 0:
                        raise RuntimeError("error during recombination")

                    rv = infsites(mu,generation,
                                  &temp_mutations,lookup)
                    if rv != 0:
                        raise RuntimeError("error during mutation")

                    rv = add_node(<double>generation, &temp_nodes)
                    if rv != 0:
                        raise RuntimeError("error during adding nodes")

                    next_offspring_index += 1
            first_parental_index += 2*N

        # Push first nodes times further back
        # NOTE(review): generation g is mapped to time ngens - g below, which
        # would put the founders at ngens rather than ngens + 1 - confirm
        # whether the extra generation is intended.
        nodes.set_columns(time=nodes.time + ngens +1,
                          flags=nodes.flags)

        # Add our data to the tables. Forward-time generation numbers are
        # converted to time-ago so that the final generation sits at time 0.
        timeview = <double[:temp_nodes.next_node]>temp_nodes.time
        # np.float was removed from NumPy; np.float64 is the same dtype.
        # np.array copies, so ``time`` does not alias the C buffer.
        time=np.array(timeview,dtype=np.float64)
        time-=time.max()
        time*=-1.0
        nodes.append_columns(time=time,
                             flags=np.ones(temp_nodes.next_node,np.uint32))
        edges.append_columns(left=np.asarray(<double[:temp_edges.next_edge]>temp_edges.left),
                            right=np.asarray(<double[:temp_edges.next_edge]>temp_edges.right),
                            parent=np.asarray(<int32_t[:temp_edges.next_edge]>temp_edges.parent),
                            child=np.asarray(<int32_t[:temp_edges.next_edge]>temp_edges.child))

        print(nodes.time.min(),nodes.time.max())

        msprime.sort_tables(nodes=nodes,edges=edges)

        samples = np.where(nodes.time==0)[0]

        print(samples)

        msprime.simplify_tables(samples=samples.tolist(),
                               nodes=nodes,
                               edges=edges)

        return msprime.load_tables(nodes=nodes,edges=edges)
    finally:
        # Runs on success and on every error path, so the C buffers are
        # released exactly once.
        cleanup(&temp_nodes,&temp_edges,&temp_mutations)
    
    
def test_infsites():
    """Smoke-test ``infsites`` with a large mean so the buffers must grow.

    Prints the number of stored mutations and the capacity before and after
    the buffers are freed, followed by ``done!``.

    Raises RuntimeError if allocation or mutation recording fails.
    """
    cdef Mutations m
    cdef dict lookup = {}
    cdef int rv
    # NULL first so the free below is safe even if initialisation fails.
    m.pos = NULL
    m.time = NULL
    if init_Mutations(&m) != 0:
        free_Mutations(&m)
        raise RuntimeError("could not initialize mutations")
    rv = infsites(100000,1,&m,lookup)
    if rv != 0:
        free_Mutations(&m)
        raise RuntimeError("error during mutation")
    print(m.next_mutation,m.capacity)
    free_Mutations(&m)
    print(m.next_mutation,m.capacity)
    print("done!")
    
def test_add_edges():
    """Smoke-test ``add_edge`` by appending 20000 edges to a fresh container.

    Prints the initial counters, whether each buffer pointer is NULL, the
    initialisation return code, and the counters after all insertions.

    Raises RuntimeError if allocation or an insertion fails. The buffers are
    released before returning, on success and on error.
    """
    cdef Edges e
    cdef int rv
    # NULL first so free_Edges in the finally clause is always safe.
    e.left = NULL
    e.right = NULL
    e.parent = NULL
    e.child = NULL
    rv = init_Edges(&e)
    try:
        print(e.next_edge,e.capacity)
        print(e.left == NULL)
        print(e.right == NULL)
        print(e.parent == NULL)
        print(e.child == NULL)
        print(rv)
        if rv != 0:
            raise RuntimeError("could not initialize edges")
        for i in range(20000):
            rv=add_edge(0,1,0,1,&e)
            if rv != 0:
                raise RuntimeError("error adding edges")
        print(e.next_edge,e.capacity)
    finally:
        free_Edges(&e)

In [41]:
# Seed NumPy's global generator first: evolve() draws every random number
# from np.random, so this makes the run reproducible.
np.random.seed(42)
evolve(100, 1000, 10.0, 10.0, 10)

-0.0 1001.0
[200000 200001 200002 200003 200004 200005 200006 200007 200008 200009
 200010 200011 200012 200013 200014 200015 200016 200017 200018 200019
 200020 200021 200022 200023 200024 200025 200026 200027 200028 200029
 200030 200031 200032 200033 200034 200035 200036 200037 200038 200039
 200040 200041 200042 200043 200044 200045 200046 200047 200048 200049
 200050 200051 200052 200053 200054 200055 200056 200057 200058 200059
 200060 200061 200062 200063 200064 200065 200066 200067 200068 200069
 200070 200071 200072 200073 200074 200075 200076 200077 200078 200079
 200080 200081 200082 200083 200084 200085 200086 200087 200088 200089
 200090 200091 200092 200093 200094 200095 200096 200097 200098 200099
 200100 200101 200102 200103 200104 200105 200106 200107 200108 200109
 200110 200111 200112 200113 200114 200115 200116 200117 200118 200119
 200120 200121 200122 200123 200124 200125 200126 200127 200128 200129
 200130 200131 200132 200133 200134 200135 200136 200137 200138 2

<msprime.trees.TreeSequence at 0x1129a3ac8>

In [5]:
# Smoke test: records Poisson(100000) mutations, so the buffers must grow several times.
test_infsites()

99865 160000
0 10000
done!


In [6]:
# Smoke test: appends 20000 edges, pushing the buffers past their initial capacity.
test_add_edges()

0 10000
False
False
False
False
0
20000 40000
