In [1]:
# Change the kernel's working directory so that the relative "./data/..." and
# "data/..." paths used further down resolve against this directory.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook on another system.
%cd /mnt/home/zzhang/ceph/CRISPR_pred/crispr_kinn

/mnt/ceph/users/zzhang/CRISPR_pred/crispr_kinn


In [2]:
import numpy as np
import pandas as pd
from Bio import pairwise2
from Bio.Seq import Seq
from sklearn.preprocessing import scale



In [3]:
# Lookup table: aligned character pair (x, y) -> feature column(s) to set.
# A value is either one column index or a tuple of column indices; a tuple
# means every listed column is switched on for that pair.
#
# columns 0-3 : match x == y, one column per letter A, C, G, T
ltidx = dict(zip(zip('ACGT', 'ACGT'), range(4)))
# columns 4-7 : substitution x -> y; recorded as (match column of x, 4 + index of y)
ltidx.update({
    (src, dst): (ltidx[(src, src)], col)
    for src in 'ACGT'
    for col, dst in enumerate('ACGT', start=4)
    if dst != src
})
# columns 8-11 : insertion '-' -> y, one column per inserted letter
ltidx.update({('-', dst): col for col, dst in enumerate('ACGT', start=8)})
# column 12 : deletion x -> '-'; recorded as (match column of x, 12)
ltidx.update({(src, '-'): (ltidx[(src, src)], 12) for src in 'ACGT'})
ltidx

{('A', 'A'): 0,
 ('C', 'C'): 1,
 ('G', 'G'): 2,
 ('T', 'T'): 3,
 ('A', 'C'): (0, 5),
 ('A', 'G'): (0, 6),
 ('A', 'T'): (0, 7),
 ('C', 'A'): (1, 4),
 ('C', 'G'): (1, 6),
 ('C', 'T'): (1, 7),
 ('G', 'A'): (2, 4),
 ('G', 'C'): (2, 5),
 ('G', 'T'): (2, 7),
 ('T', 'A'): (3, 4),
 ('T', 'C'): (3, 5),
 ('T', 'G'): (3, 6),
 ('-', 'A'): 8,
 ('-', 'C'): 9,
 ('-', 'G'): 10,
 ('-', 'T'): 11,
 ('A', '-'): (0, 12),
 ('C', '-'): (1, 12),
 ('G', '-'): (2, 12),
 ('T', '-'): (3, 12)}

In [4]:
def make_alignment(df, maxlen=25):
    """Align every sequence in ``df`` against the sequence in its first row.

    The ``'sequence'`` entry of the first row is used as the reference. The
    reference and each compared sequence are reversed before being aligned
    (presumably so that index 0 corresponds to the 3' end -- TODO confirm).

    Alignment is done with ``pairwise2.align.localxd``. In pairwise2's naming
    scheme ``x`` means identical characters score 1 and anything else 0, and
    ``d`` means each sequence has its own gap penalties, given here as
    (open, extend) for the reference followed by (open, extend) for the
    compared sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'sequence'`` column; needs at least one row.
    maxlen : int
        Longest gapped alignment that is accepted.

    Returns
    -------
    list
        One entry per row of ``df``: the first alignment returned by
        pairwise2, or the ungapped ``(reference, sequence)`` pair of reversed
        sequences when no attempted gap penalty gives an alignment of at most
        ``maxlen`` characters.
    """
    reference = Seq(df.iloc[0]['sequence'])[::-1]
    collected = []
    for raw in df['sequence']:
        query = Seq(raw)[::-1]
        # Try the mild gap-open penalty first, then a harsher one that
        # discourages alignments stretched out by many gaps.
        for gap_open in (-1, -5):
            hits = pairwise2.align.localxd(reference, query, gap_open, -0.1, gap_open, 0)
            best = hits[0]
            if len(best[0]) <= maxlen:
                break
        else:
            # Both attempts were too long: pair the sequences position by
            # position without any gaps.
            best = (reference, query)
        collected.append(best)
    return collected


def featurize_alignment(alignments, maxlen=25):
    """Encode aligned sequence pairs as fixed-size 0/1 feature matrices.

    Each alignment is indexed as ``aln[0]`` (first aligned sequence) and
    ``aln[1]`` (second aligned sequence). For every position, the character
    pair ``(aln[0][i], aln[1][i])`` is looked up in the module-level ``ltidx``
    table and the returned column, or every column of a returned tuple, is
    set to 1 in row ``i``.

    Parameters
    ----------
    alignments : iterable
        Aligned pairs, e.g. the output of ``make_alignment``.
    maxlen : int
        Number of rows in each matrix; rows past the end of an alignment
        stay all-zero.

    Returns
    -------
    list of numpy.ndarray
        One ``(maxlen, 13)`` array per alignment, in input order.

    Raises
    ------
    AssertionError
        If the first sequence of an alignment is longer than ``maxlen``.
    KeyError
        If a character pair is not present in ``ltidx``.
    """
    encoded = []
    for idx, pair in enumerate(alignments):
        first = pair[0]
        assert len(first) <= maxlen, "alignment {} larger than maxlen: {}".format(idx, pair)
        onehot = np.zeros((maxlen, 13))
        for pos, letter in enumerate(first):
            columns = ltidx[(letter, pair[1][pos])]
            # `columns` is an int or a tuple of ints; a tuple sets several
            # columns of this row at once.
            onehot[pos, columns] = 1
        encoded.append(onehot)
    return encoded


In [6]:
# Read the two sgRNA tables and drop every row that contains a missing value.
df1 = pd.read_csv("./data/sgRNA_1.csv")
df2 = pd.read_csv("./data/sgRNA_2.csv")
df1 = df1.dropna()
df2 = df2.dropna()

print("df1", df1.shape)
# Fixed: this line used to print df1.shape under the "df2" label.
print("df2", df2.shape)

# Columns used as regression targets, in the order they are stored in Y.
label_cols = ['wtCas9_ndABA', 'wtCas9_cleave_rate_log',
        'Cas9_enh_ndABA', 'Cas9_enh_cleave_rate_log',
        'Cas9_hypa_ndABA', 'Cas9_hypa_cleave_rate_log',
        'Cas9_HF1_ndABA', 'Cas9_HF1_cleave_rate_log',
        ]

# Convert each table separately: make_alignment uses the first row of the
# table it is given as the reference sequence, so the two tables are aligned
# against their own references before being stacked.
mats1 = featurize_alignment(make_alignment(df1))
mats2 = featurize_alignment(make_alignment(df2))
label1 = df1[label_cols].to_numpy()
label2 = df2[label_cols].to_numpy()

# Stack df1 rows first, then df2 rows, for both features and labels so that
# X[i] and Y[i] stay paired.
mats = np.concatenate([np.array(mats1), np.array(mats2)])
labels = np.concatenate([label1, label2])
assert len(mats) == len(labels), "feature/label row counts differ"

# Save the compiled arrays and their annotations.
np.save("data/compiled_X.npy", mats)
np.save("data/compiled_Y.npy", labels)
# NOTE(review): this writes one line per ltidx key (24 character pairs), while
# the last axis of X has 13 feature columns -- confirm that downstream readers
# expect the pair list rather than per-column names.
with open("data/x_col_annot.txt", 'w') as f:
    f.write("\n".join([str(x) for x in ltidx]))

with open("data/y_col_annot.txt", 'w') as f:
    f.write("\n".join(label_cols))
                                    

df1 (6988, 18)
df2 (6988, 18)
