In [1]:
# Run on CPU not GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import pandas as pd
import primo.tools.sequences as seqtools
from primo.models.simulator import Simulator
import numpy as np
from tqdm import tqdm

def random_mutants(anchor, n):
    """Build n randomly mutated variants of ``anchor``.

    One mutation rate per variant is sampled uniformly from [0, 1) and
    handed to ``seqtools.mutate`` together with the anchor. Each variant
    is then scored against the anchor with ``seqtools.seq_hdist``.

    Returns a ``(mutants, distances)`` tuple of numpy arrays, both in the
    order the mutation rates were drawn.
    """
    rates = np.random.uniform(0, 1, size=n)

    # Generate every mutant before measuring any distance, so the order of
    # calls into seqtools is: all mutate() calls, then all seq_hdist() calls.
    mutated = []
    for rate in rates:
        mutated.append(seqtools.mutate(anchor, rate))
    mutants = np.array(mutated)

    distances = np.array(
        [seqtools.seq_hdist(anchor, mutant) for mutant in mutants]
    )
    return mutants, distances
    
def collect_cas9_table(anchor, n=1000):
    """Simulate ``anchor`` against n random mutants and tabulate the result.

    The returned DataFrame has one row per mutant with four columns:
    ``target_features`` (the anchor, repeated), ``query_features`` (the
    mutant), ``activations`` (whatever ``Simulator.simulate`` returns for
    the two feature columns) and ``hdists`` (the value reported by
    ``random_mutants`` for that mutant).
    """
    mutants, distances = random_mutants(anchor, n)

    table = pd.DataFrame({
        "target_features": [anchor] * n,
        "query_features": mutants
    })

    # The simulator is only given the two feature columns; the result
    # columns are attached afterwards.
    table['activations'] = Simulator().simulate(table)
    table['hdists'] = distances

    return table
    
# Number of random anchor sequences to generate, and number of mutant
# pairings simulated per anchor.
N_ANCHORS = 10
N_PAIRS = 10000

# Use HDFStore as a context manager so the file is closed on any exit path.
# The previous try/finally opened the store *inside* the try block, so a
# failure to open it left `result_store` unbound and the finally clause
# raised NameError, hiding the real error.
# mode='w' starts a fresh file each run; every anchor's table is appended
# under the single key 'df'.
with pd.HDFStore('/tf/primo/data/stats/lo_random_pairings_cas9.h5', complevel=9, mode='w') as result_store:
    for _ in tqdm(range(N_ANCHORS)):
        test_seq = seqtools.randseq(20)
        df = collect_cas9_table(test_seq, N_PAIRS)
        result_store.append('df', df)

100%|██████████| 10/10 [00:40<00:00,  4.03s/it]
