# Compile in vivo off-target data for transfer learning

We will use the compiled data from CRISPR-Net: A Recurrent Convolutional Network Quantifies CRISPR Off-Target Activities with Mismatches and Indels, by Lin et al., *Advanced Science*, 2020.

The data and code files can be downloaded from the Code Ocean capsule here: https://codeocean.com/capsule/9553651/tree/v1 . Great job on making the work reproducible!

Please note that loading these pre-compiled .pkl files with `pd.read_pickle` requires a pandas version compatible with 1.0.3 (i.e. `pandas~=1.0.3`).

Below is the README from `CRISPR_Net/data/`

| Name | Location in data/ | Technique | with Indel | Literature |
| ----:| :---- |----: |----: |----: |
| Dataset I-1| Dataset I (indel&mismatch) |CIRCLE-Seq|Yes| Tsai et al., Nat Method, 2017|
| Dataset I-2| Dataset I (indel&mismatch) |GUIDE-Seq|Yes| Listgarten et al., Nat BME, 2018 |
| Dataset II-1| Dataset II (mismatch-only) |protein knockout detection|No| Doench et al., Nat biotech, 2016 |
| Dataset II-2| Dataset II (mismatch-only) |PCR, Digenome-Seq, etc.|No| Haeussler et al., Genome bio, 2016|
| Dataset II-3| Dataset II (mismatch-only) |SITE-Seq|No|Cameron et al., Nature Methods, 2017 |
| Dataset II-4| Dataset II (mismatch-only) |GUIDE-Seq|No| Tsai et al., Nat biotech, 2015|
| Dataset II-5| Dataset II (mismatch-only) |GUIDE-Seq|No| Kleinstiver et al., Nature, 2015|
| Dataset II-6| Dataset II (mismatch-only) |GUIDE-Seq|No| Listgarten et al., Nat BME, 2018 |

--------------------------------------------------
The /code/aggregate_models/CRISPR_Net_weights.h5 was trained on dataset I-1, II-1, II-2, and II-4.

The /code/scoring_models/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5 was trained on dataset I-1, II-1, II-2, II-3, and II-4.




In [1]:
# Step up one directory (presumably to the repository root) so that the
# relative paths used below (./baselines/..., ./data/...) and the `src`
# package imports resolve.
%cd ..

/mnt/ceph/users/zzhang/CRISPR_pred/crispr_kinn


In [2]:
import pandas as pd
import numpy as np
import pickle as pkl
import h5py
import os
from collections import defaultdict
# we override the sequence encoder for our KINN use
from src.encode_seq import Encoder
from src.data import load_finkelstein_data

Using TensorFlow backend.


In [3]:
# Root of the CRISPR-Net data folder, relative to the current working
# directory. It already ends with "/", and the loaders below join path parts
# with another "/", so the resulting paths contain a doubled separator.
DATA_DIR = "./baselines/CRISPR_Net/data/"
# Sub-folder names of the two dataset families read by the loaders below.
# NOTE(review): the upstream README names these "Dataset I (indel&mismatch)"
# and "Dataset II (mismatch-only)"; the names here presumably reflect a local
# rename of those folders — confirm.
DAT_I = "Dataset_I_indel_mismatch"
DAT_II = "Dataset_II_mismatch"

In [4]:
np.random.seed(42)
def split_data_by_grnas(on_seq_to_idx, codes, labels, split_ratio=0.2):
    """Split examples into train/validation sets, holding out whole gRNAs.

    A random subset of the keys of ``on_seq_to_idx`` is drawn from NumPy's
    global random state (seeded above), and every row belonging to those
    keys goes to the validation set, so no key appears in both splits.

    Args:
        on_seq_to_idx: mapping from an on-target sequence key to the list of
            row positions (along axis 0 of ``codes``/``labels``) for that key.
        codes: NumPy array of examples, indexed along axis 0.
        labels: NumPy array of labels aligned with ``codes`` along axis 0.
        split_ratio: fraction of keys to hold out; the number of held-out
            keys is ``ceil(n_keys * split_ratio)``. Must lie in [0, 1].

    Returns:
        ``((train_x, train_y), (valid_x, valid_y))``.

    Raises:
        ValueError: if ``split_ratio`` is outside [0, 1].
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio}")
    n_grna = len(on_seq_to_idx)
    n_leave_out = int(np.ceil(n_grna * split_ratio))
    if n_leave_out > 0:
        leave_out_gr = np.random.choice(list(on_seq_to_idx), n_leave_out, replace=False)
        leave_out_idx = np.concatenate([on_seq_to_idx[x] for x in leave_out_gr])
    else:
        # Nothing is held out; np.concatenate([]) would raise, so use an
        # explicit empty integer index instead.
        leave_out_idx = np.array([], dtype=int)
    train_x, valid_x = np.delete(codes, leave_out_idx, axis=0), codes[leave_out_idx]
    train_y, valid_y = np.delete(labels, leave_out_idx, axis=0), labels[leave_out_idx]
    print(f"Split total n_grna {n_grna}, n_pos {np.sum(labels, dtype=int)}, train datapoints {len(train_x)} / {np.sum(train_y, dtype=int)}, valid datapoints {len(valid_x)} / {np.sum(valid_y, dtype=int)}")
    return (train_x, train_y), (valid_x, valid_y)

In [5]:
# for training
# I-1
def load_CIRCLE_data():
    """Load, encode and split the CIRCLE-seq dataset (dataset I-1).

    Reads ``CIRCLE_seq_10gRNA_wholeDataset.csv`` under
    ``DATA_DIR/DAT_I/dataset_I-1``, encodes each (``sgRNA_seq``, ``off_seq``)
    pair with ``Encoder`` and splits the result with ``split_data_by_grnas``,
    grouping rows by the on-target sequence with ``-`` and ``_`` removed.

    NOTE(review): the recorded run reports 40 distinct grouping keys for a
    file named "10gRNA", so keys may not map one-to-one to guides — confirm.

    Returns:
        ``(train, valid)``, each an ``(x, y)`` pair, as returned by
        ``split_data_by_grnas``.
    """
    print("Encoding CIRCLE-seq dataset (dataset I/1)...")
    circle_data = pd.read_csv(f"{DATA_DIR}/{DAT_I}/dataset_I-1/CIRCLE_seq_10gRNA_wholeDataset.csv")
    circle_codes = []
    circle_labels = []
    on_seq_to_idx = defaultdict(list)
    # Use the positional row number (not the DataFrame index) so the stored
    # positions line up with the arrays built from the lists below.
    for pos, (_, row) in enumerate(circle_data.iterrows()):
        on_seq = row['sgRNA_seq']
        off_seq = row['off_seq']
        label = row['label']
        # Group rows by the on-target sequence stripped of '-' and '_'.
        on_seq_key = on_seq.replace('-', '').replace('_', '')
        on_seq_to_idx[on_seq_key].append(pos)
        en = Encoder(on_seq=on_seq, off_seq=off_seq, with_category=True, label=label)
        en.encode_on_off()
        circle_codes.append(en.on_off_code)
        circle_labels.append(label)
    circle_codes = np.array(circle_codes)
    circle_labels = np.array(circle_labels)
    print("Finished!", "Dataset size:", circle_codes.shape, len(circle_labels[circle_labels>0]))
    train, valid = split_data_by_grnas(on_seq_to_idx=on_seq_to_idx, codes=circle_codes, labels=circle_labels)
    return train, valid


# II-1
def load_elevation_CD33_dataset():
    """Load, encode and split the CD33 dataset (dataset II-1).

    The pickle is read with ``pd.read_pickle`` and its first element is used
    as the table of mutated targets. Each (``30mer``, ``30mer_mut``) pair is
    encoded with ``Encoder`` (passing ``Day21-ETP`` as the regression value),
    while ``Day21-ETP-binarized`` is kept as the label. Rows are grouped by
    the ``30mer`` with ``-`` and ``_`` removed.

    Returns:
        ``(train, valid)`` as returned by ``split_data_by_grnas``.
    """
    print("Loading dataset II/1...")
    pkl_path = (f"{DATA_DIR}/{DAT_II}/"
                + "/Listgarten_ElevationDataset-dataset_II-1_II-2_II-4/cd33_dataset_II-1.pkl")
    mutants = pd.read_pickle(pkl_path)[0]
    encoded = []
    binarized = []
    # on-target key -> positional row numbers
    rows_by_guide = defaultdict(list)
    for position, (_, record) in enumerate(mutants.iterrows()):
        guide = record['30mer']
        target = record['30mer_mut']
        guide_key = guide.replace('-', '').replace('_', '')
        rows_by_guide[guide_key].append(position)
        encoder = Encoder(on_seq=guide, off_seq=target, with_reg_val=True, value=record['Day21-ETP'])
        encoder.encode_on_off()
        encoded.append(encoder.on_off_code)
        binarized.append(record['Day21-ETP-binarized'])
    binarized = np.array(binarized)
    encoded = np.array(encoded)
    print("Finished!", encoded.shape, int((binarized > 0).sum()))
    train, valid = split_data_by_grnas(on_seq_to_idx=rows_by_guide, codes=encoded, labels=binarized)
    return train, valid


# II-2
def load_elevation_hmg_dataset():
    """Load, encode and split the hmg dataset (dataset II-2).

    Each (``30mer``, ``30mer_mut``) pair is encoded with ``Encoder`` using
    ``readFraction`` as the regression value. The value read back from the
    encoder is binarized: strictly positive values become label 1, all
    others 0. Rows are grouped by the ``30mer`` with ``-`` and ``_`` removed.

    Returns:
        ``(train, valid)`` as returned by ``split_data_by_grnas``.
    """
    print("Loading dataset II/2...")
    source = f"{DATA_DIR}/{DAT_II}/Listgarten_ElevationDataset-dataset_II-1_II-2_II-4/hmg_data_dataset_II-2.pkl"
    frame = pd.read_pickle(source)
    encodings, fractions = [], []
    # on-target key -> positional row numbers
    rows_by_guide = defaultdict(list)
    for position, (_, entry) in enumerate(frame.iterrows()):
        guide, target = entry['30mer'], entry['30mer_mut']
        rows_by_guide[guide.replace('-', '').replace('_', '')].append(position)
        encoder = Encoder(on_seq=guide, off_seq=target, with_reg_val=True, value=entry['readFraction'])
        encoder.encode_on_off()
        encodings.append(encoder.on_off_code)
        fractions.append(encoder.value)

    fractions = np.array(fractions)
    encodings = np.array(encodings)
    # 1.0 where the value is positive, 0.0 elsewhere
    positives = (fractions > 0).astype(float)
    print("Finished!", "dataset size: ", encodings.shape, int((positives > 0).sum()))
    train, valid = split_data_by_grnas(on_seq_to_idx=rows_by_guide, codes=encodings, labels=positives)
    return train, valid
    

# II-3
def load_siteseq_data():
    """Load, encode and split the SITE-Seq dataset (dataset II-3).

    Sequences are upper-cased and prefixed with ``-`` before being passed to
    ``Encoder`` together with the ``reads`` value. The value read back from
    the encoder is binarized: strictly positive values become label 1, all
    others 0. Rows are grouped by the on-target sequence with ``-`` and ``_``
    removed.

    Returns:
        ``(train, valid)`` as returned by ``split_data_by_grnas``.
    """
    print("Loading SITE-Seq dataset (dataset II/3) .....")
    table = pd.read_csv(f"{DATA_DIR}/{DAT_II}/dataset_II-3/SITE-Seq_offTarget_wholeDataset.csv", index_col=0)
    encodings = []
    read_counts = []
    # on-target key -> positional row numbers (the CSV index is not used)
    rows_by_guide = defaultdict(list)
    for position, (_, entry) in enumerate(table.iterrows()):
        guide = '-' + entry['on_seq'].upper()
        target = '-' + entry['off_seq'].upper()
        rows_by_guide[guide.replace('-', '').replace('_', '')].append(position)
        encoder = Encoder(on_seq=guide, off_seq=target, with_reg_val=True, value=entry['reads'])
        encoder.encode_on_off()
        encodings.append(encoder.on_off_code)
        read_counts.append(encoder.value)
    encodings = np.array(encodings)
    read_counts = np.array(read_counts)
    # 1.0 where reads are positive, 0.0 elsewhere
    positives = (read_counts > 0).astype(float)
    print(len(rows_by_guide), encodings.shape, int((positives > 0).sum()))
    train, valid = split_data_by_grnas(on_seq_to_idx=rows_by_guide, codes=encodings, labels=positives)
    return train, valid


# II-4
def load_elevation_guideseq_data():
    """Load, encode and split the GUIDE-seq dataset (dataset II-4).

    Each (``30mer``, ``30mer_mut``) pair is encoded with ``Encoder`` using
    ``GUIDE-SEQ Reads`` as the regression value. The value read back from the
    encoder is binarized: strictly positive values become label 1, all
    others 0. Rows are grouped by the ``30mer`` with ``-`` and ``_`` removed.

    NOTE(review): the pickle's file name ends in ``II-3`` although this
    loader is labelled dataset II/4; the path is kept exactly as it was —
    confirm it is the intended file.

    Returns:
        ``(train, valid)`` as returned by ``split_data_by_grnas``.
    """
    print("Loading dataset II/4...")
    source = f"{DATA_DIR}/{DAT_II}/Listgarten_ElevationDataset-dataset_II-1_II-2_II-4/guideseq_data_dataset_II-3.pkl"
    frame = pd.read_pickle(source)
    encodings, read_counts = [], []
    # on-target key -> positional row numbers
    rows_by_guide = defaultdict(list)
    for position, (_, entry) in enumerate(frame.iterrows()):
        guide, target = entry['30mer'], entry['30mer_mut']
        reads = entry['GUIDE-SEQ Reads']
        rows_by_guide[guide.replace('-', '').replace('_', '')].append(position)
        encoder = Encoder(on_seq=guide, off_seq=target, with_reg_val=True, value=reads)
        encoder.encode_on_off()
        encodings.append(encoder.on_off_code)
        read_counts.append(encoder.value)

    encodings = np.array(encodings)
    read_counts = np.array(read_counts)
    # 1.0 where reads are positive, 0.0 elsewhere
    positives = (read_counts > 0).astype(float)
    print("Dataset size:", encodings.shape, "positive num:", int((positives > 0).sum()))
    train, valid = split_data_by_grnas(on_seq_to_idx=rows_by_guide, codes=encodings, labels=positives)
    return train, valid


In [6]:
# Load, encode and split every training dataset. Each loader returns
# ((train_x, train_y), (valid_x, valid_y)), with whole on-target sequences
# held out for validation by split_data_by_grnas.
d1 = load_CIRCLE_data()  # dataset I-1 (CIRCLE-seq)
d2 = load_elevation_CD33_dataset()  # dataset II-1
d3 = load_elevation_hmg_dataset()  # dataset II-2
d4 = load_siteseq_data()  # dataset II-3 (SITE-Seq)
d5 = load_elevation_guideseq_data()  # dataset II-4

Encoding CIRCLE-seq dataset (dataset I/1)...
Finished! Dataset size: (584949, 25, 13) 7371
Split total n_grna 40, n_pos 7371, train datapoints 417555 / 5593, valid datapoints 167394 / 1778
Loading dataset II/1...
Finished! (4853, 25, 13) 2273
Split total n_grna 1027, n_pos 2273, train datapoints 3819 / 1853, valid datapoints 1034 / 420
Loading dataset II/2...
Finished! dataset size:  (10129, 25, 13) 52
Split total n_grna 19, n_pos 52, train datapoints 7968 / 45, valid datapoints 2161 / 7
Loading SITE-Seq dataset (dataset II/3) .....
9 (217733, 25, 13) 3767
Split total n_grna 9, n_pos 3767, train datapoints 180000 / 2799, valid datapoints 37733 / 968
Loading dataset II/4...
Dataset size: (294534, 25, 13) positive num: 354
Split total n_grna 36, n_pos 354, train datapoints 232956 / 276, valid datapoints 61578 / 78


In [7]:
# Stack the training halves of all five datasets into one pair:
# element 0 holds the encoded inputs, element 1 the labels.
train_data = tuple(
    np.concatenate([dataset[0][part] for dataset in (d1, d2, d3, d4, d5)])
    for part in (0, 1)
)
train_data[0].shape, train_data[1].shape


((842298, 25, 13), (842298,))

In [8]:
# Stack the validation halves of all five datasets into one pair:
# element 0 holds the encoded inputs, element 1 the labels.
valid_data = tuple(
    np.concatenate([dataset[1][part] for dataset in (d1, d2, d3, d4, d5)])
    for part in (0, 1)
)
valid_data[0].shape, valid_data[1].shape


((269900, 25, 13), (269900,))

In [9]:
# The kinetic Finkelstein data serves as the validation set used to compute
# NAS rewards. It is loaded twice, once with make_switch=False and once with
# make_switch=True, and the two results are stacked.
# NOTE: t1/t2 are rebound to test datasets further down in this notebook.
t1, _ = load_finkelstein_data(
    target='wtCas9_cleave_rate_log', make_switch=False, logbase=10, include_ref=True
)
t2, _ = load_finkelstein_data(
    target='wtCas9_cleave_rate_log', make_switch=True, logbase=10, include_ref=True
)

x_valid = np.concatenate((t1[0], t2[0]))
k_valid = np.concatenate((t1[1], t2[1]))
# Binary label: 1.0 where the value is above -5, 0.0 elsewhere.
kinetic_label = (k_valid > -5).astype(float)

kinetic_data = (x_valid, k_valid, kinetic_label)
print("valid positive size", int((kinetic_label == 1).sum()))
kinetic_data[0].shape, kinetic_data[1].shape

valid positive size 6976


((13993, 25, 13), (13993,))

In [10]:
# for testing
# I-2 is missing in the original code pod, but present in dataset
def load_listgarten_indel_dataset():
    """Load and encode the Listgarten indel test dataset (dataset I-2).

    Each upper-cased (``crRNA``, ``DNA``) pair is encoded with ``Encoder``
    together with its ``label``; the label is read back from the encoder.

    Returns:
        ``(code, labels)`` as NumPy arrays.
    """
    print("Loading Listgarten indel dataset (dataset I/2)...")
    table = pd.read_csv(f"{DATA_DIR}/{DAT_I}/dataset_I-2/elevation_6gRNA_wholeDataset.csv")
    encodings, targets = [], []
    for _, entry in table.iterrows():
        encoder = Encoder(
            on_seq=entry['crRNA'].upper(),
            off_seq=entry['DNA'].upper(),
            with_category=True,
            label=entry['label'],
        )
        encoder.encode_on_off()
        encodings.append(encoder.on_off_code)
        targets.append(encoder.label)
    targets = np.array(targets)
    encodings = np.array(encodings)
    print("Finished!")
    print(encodings.shape, int((targets > 0).sum()))
    return encodings, targets

    
# II-5
def load_Kleinstiver_data():
    """Load and encode the Kleinstiver test dataset (dataset II-5).

    Each upper-cased (``sgRNA_seq``, ``off_seq``) pair is encoded with
    ``Encoder`` together with its ``label``; the label is read back from the
    encoder.

    Returns:
        ``(code, labels)`` as NumPy arrays.
    """
    print("Loading Kleinsitver dataset (dataset II/5)...")
    table = pd.read_csv(f"{DATA_DIR}/{DAT_II}/dataset_II-5/Kleinstiver_5gRNA_wholeDataset.csv")
    encodings, targets = [], []
    for _, entry in table.iterrows():
        encoder = Encoder(
            on_seq=entry['sgRNA_seq'].upper(),
            off_seq=entry['off_seq'].upper(),
            with_category=True,
            label=entry['label'],
        )
        encoder.encode_on_off()
        encodings.append(encoder.on_off_code)
        targets.append(encoder.label)
    targets = np.array(targets)
    encodings = np.array(encodings)
    print("Finished!")
    print(encodings.shape, int((targets > 0).sum()))
    return encodings, targets


# II-6
def load_22sgRNA_data():
    """Load and encode the Listgarten 22-gRNA test dataset (dataset II-6).

    Each upper-cased (``sgRNA_seq``, ``off_seq``) pair is encoded with
    ``Encoder`` together with its ``label``; the label is read back from the
    encoder.

    Returns:
        ``(code, labels)`` as NumPy arrays.
    """
    print("Loading Listgarten dataset II/6...")
    table = pd.read_csv(f"{DATA_DIR}/{DAT_II}/dataset_II-6/Listgarten_22gRNA_wholeDataset.csv")
    encodings, targets = [], []
    for _, entry in table.iterrows():
        encoder = Encoder(
            on_seq=entry['sgRNA_seq'].upper(),
            off_seq=entry['off_seq'].upper(),
            with_category=True,
            label=entry['label'],
        )
        encoder.encode_on_off()
        encodings.append(encoder.on_off_code)
        targets.append(encoder.label)
    targets = np.array(targets)
    encodings = np.array(encodings)
    print("Finished!", "Dataset size: ", encodings.shape, int((targets > 0).sum()))
    return encodings, targets

In [11]:
# Load the three held-out test datasets; each loader returns (codes, labels).
# NOTE: t1 and t2 rebind the names used earlier for the kinetic Finkelstein
# data, so the cells must be run in order.
t1 = load_listgarten_indel_dataset()  # dataset I-2
t2 = load_Kleinstiver_data()  # dataset II-5
t3 = load_22sgRNA_data()  # dataset II-6

Loading Listgarten indel dataset (dataset I/2)...
Finished!
(213943, 25, 13) 60
Loading Kleinsitver dataset (dataset II/5)...
Finished!
(95829, 25, 13) 54
Loading Listgarten dataset II/6...
Finished! Dataset size:  (383463, 25, 13) 56


In [12]:
# The test data is actually evaluated separately within each dataset; this
# stacked pair (element 0: encoded inputs, element 1: labels) is built here
# for a quick look at the combined shapes.
test_data = tuple(
    np.concatenate([dataset[part] for dataset in (t1, t2, t3)])
    for part in (0, 1)
)
test_data[0].shape, test_data[1].shape

((693235, 25, 13), (693235,))

## Store Data in h5py

In [13]:
# Write every split into a single HDF5 file. Layout:
#   train/{x,y}, valid/{x,y}, kinetic/{x,k,y}, test/<dataset name>/{x,y}
with h5py.File("./data/inVivoData.newValidSplit.h5", "w") as h5_out:
    for split_name, (features, targets) in (("train", train_data), ("valid", valid_data)):
        split_group = h5_out.create_group(split_name)
        split_group.create_dataset("x", data=features)
        split_group.create_dataset("y", data=targets)

    # kinetic_data is (inputs, kinetic values, binary labels)
    kinetic_group = h5_out.create_group("kinetic")
    for field, values in zip(("x", "k", "y"), kinetic_data):
        kinetic_group.create_dataset(field, data=values)

    # Each test set is stored under its own name so it can be evaluated
    # separately.
    test_sets = (
        ("Listgarten_indel", t1),
        ("Kleinsitver_mut", t2),
        ("Listgarten_mut", t3),
    )
    for test_name, (features, targets) in test_sets:
        h5_out.create_dataset(f"test/{test_name}/x", data=features)
        h5_out.create_dataset(f"test/{test_name}/y", data=targets)
