In [1]:
!pwd

/home/nayeem/Desktop/MTZ/deepOffTarget/RNN Based Experiments


In [5]:
# Project root on the local machine; every data location below is derived from it.
root_path = '/home/nayeem/Desktop/MTZ/deepOffTarget/RNN Based Experiments/'
# Sub-folders are built by appending to the parent path (each keeps its trailing slash).
data_dir = f"{root_path}Data/"
seqeunce_dir = f"{data_dir}Sequence/"
processed_dir = f"{data_dir}Processed/"

In [6]:
# Full path of the CSV holding the target / off-target pairs.
data_path = f"{seqeunce_dir}all_off_target.csv"

In [7]:
import pandas as pd
# Read the pair table from the CSV located at data_path.
df = pd.read_csv(data_path)
# Sanity check: print the table dimensions, then the first record.
print(df.shape)
print(df.iloc[0])

(153233, 3)
Target sgRNA        AAATGAGAAGAAGAGGCACAGGG
Off Target sgRNA    GCATGAGAAGAAGAGACATAGCC
label                                     0
Name: 0, dtype: object


In [8]:
import numpy as np

def encoder(RNAseq, order=('A','T','C','G')):
    """One-hot encode a nucleotide sequence.

    Parameters
    ----------
    RNAseq : str
        Sequence to encode, one symbol per position.
    order : sequence of str, optional
        Alphabet; the symbol at ``order[k]`` is encoded as a 1 in column
        ``k``. Any alphabet length is accepted (e.g. adding ``'N'``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(RNAseq), len(order))``. A symbol that is
        not in ``order`` is reported on stdout and its row is left all-zero.
    """
    # One identity-matrix row per symbol, so the table follows the alphabet
    # size instead of being hard-wired to exactly four entries.
    one_hot_rows = np.eye(len(order))
    lookup_table = {symbol: one_hot_rows[k] for k, symbol in enumerate(order)}
    encoded = np.zeros((len(RNAseq), len(order)))

    for i, nu in enumerate(RNAseq):
        if nu in lookup_table:
            encoded[i] = lookup_table[nu]
        else:
            # Best-effort: keep going and leave this position as an all-zero row.
            print("Exception: Unindentified Nucleotide")

    return encoded

def decoder(encoded, order=['A','T','C','G']):
    """Turn a one-hot style matrix back into a sequence string.

    For each row of ``encoded`` the first column equal to 1 selects the
    symbol from ``order``; the selected symbols are concatenated in row order.
    A row with no column equal to 1 raises ``IndexError``.
    """
    n_positions = encoded.shape[0]
    # np.where(...)[0][0] picks the first column holding a 1 in that row.
    symbols = [order[np.where(encoded[pos] == 1)[0][0]]
               for pos in range(n_positions)]
    return ''.join(symbols)

def superpose(encoded1, encoded2):
    """Overlay two encoded sequences of identical shape.

    Where the two inputs hold the same value, that value is kept; where they
    differ, the two values are added. For one-hot inputs this marks both
    nucleotides at every mismatching position.

    Parameters
    ----------
    encoded1, encoded2 : array-like
        Encodings to combine; both must have the same shape.

    Returns
    -------
    numpy.ndarray
        New float array with the shape of the inputs. If the shapes differ,
        "Size Mismatch" is printed and ``encoded1`` is returned unchanged.
    """
    # Compare the full shape, not only the row count, so a width mismatch is
    # reported the same way instead of failing part-way through.
    if np.shape(encoded1) != np.shape(encoded2):
        print("Size Mismatch")
        return encoded1

    first = np.asarray(encoded1)
    second = np.asarray(encoded2)

    # Writing into a zeros array keeps the float result dtype.
    superposed = np.zeros(first.shape)
    superposed[...] = np.where(first == second, first, first + second)
    return superposed

def superposeWithDirection(encoded1, encoded2):
    """Overlay two encoded sequences and append a direction column.

    The first ``k`` output columns follow the same rule as ``superpose``:
    equal entries are kept, differing entries are added. The extra last
    column is 0 for rows without any difference; otherwise it holds the value
    of ``encoded1`` at the last (highest-index) differing column of that row,
    which distinguishes e.g. G->A from A->G.

    Parameters
    ----------
    encoded1, encoded2 : array-like
        2-D encodings of identical shape ``(n, k)``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, k + 1)``. If the shapes differ,
        "Size Mismatch" is printed and ``encoded1`` is returned unchanged
        (note: without the extra column).
    """
    # Compare the full shape, not only the row count, so a width mismatch is
    # reported the same way instead of failing part-way through.
    if np.shape(encoded1) != np.shape(encoded2):
        print("Size Mismatch")
        return encoded1

    first = np.asarray(encoded1)
    second = np.asarray(encoded2)
    n_rows, n_cols = first.shape

    superposed = np.zeros((n_rows, n_cols + 1))
    differs = first != second
    superposed[:, :n_cols] = np.where(differs, first + second, first)

    if n_rows and n_cols:
        # argmax on the column-reversed mask finds the first True from the
        # right, i.e. the last differing column of each row.
        last_diff = n_cols - 1 - np.argmax(differs[:, ::-1], axis=1)
        direction = first[np.arange(n_rows), last_diff]
        # Rows with no difference keep a 0 in the direction column.
        superposed[:, -1] = np.where(differs.any(axis=1), direction, 0)

    return superposed

def testEncDec():
    """Print a short sequence, its encoding, and the decoded round trip."""
    guide = 'ACTGGG'
    print("Original: ", guide)
    print("Encoded:")
    one_hot = encoder(guide)
    print(one_hot)
    # Decoding the encoding should print the original string again.
    print("Decoded: ", decoder(one_hot))
    

def testSuperpose():
    """Print two sample sequences and the superposition of their encodings."""
    guide, site = "ACTGGG", "GCTGGC"
    for tag, seq in (('sgRNA: ', guide), ('DNA  : ', site)):
        print(tag, seq)

    # The two samples differ at the first and last positions.
    print(superpose(encoder(guide), encoder(site)))

def testSuperposeWithDirection():
    """Print two sample sequences and their direction-aware superposition."""
    guide, site = "GACTGGGC", "AGCTGGCG"
    for tag, seq in (('sgRNA: ', guide), ('DNA  : ', site)):
        print(tag, seq)

    # The samples contain each mismatch in both orientations (G/A, A/G, G/C, C/G).
    print(superposeWithDirection(encoder(guide), encoder(site)))

# Run the three demo functions in order; the bare print() calls put a blank
# line between their outputs.
testEncDec()
print()
testSuperpose()
print()
testSuperposeWithDirection()

Original:  ACTGGG
Encoded:
[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]
Decoded:  ACTGGG

sgRNA:  ACTGGG
DNA  :  GCTGGC
[[1. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 1.]]

sgRNA:  GACTGGGC
DNA  :  AGCTGGCG
[[1. 0. 0. 1. 1.]
 [1. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 1. 1.]
 [0. 0. 1. 1. 0.]]


In [9]:
# Encode every row of df four ways (target, off-target, superposed,
# superposed with direction column) and collect the labels alongside.
enc_targets = []
enc_off_targets = []
enc_superposed = []
enc_superposed_with_dir = []
labels = []

n_pairs = df.shape[0]
# Iterating the three columns together avoids a per-row df.iloc[i] lookup.
pair_columns = zip(df['Target sgRNA'], df['Off Target sgRNA'], df['label'])

for i, (target_seq, off_target_seq, label) in enumerate(pair_columns):
    target = encoder(target_seq)
    off_target = encoder(off_target_seq)
    superposed = superpose(target, off_target)
    superposed_with_dir = superposeWithDirection(target, off_target)

    enc_targets.append(target)
    enc_off_targets.append(off_target)
    enc_superposed.append(superposed)
    enc_superposed_with_dir.append(superposed_with_dir)
    labels.append(label)

    # Progress report every 1000 rows.
    if i % 1000 == 0:
        print(i+1, "/", n_pairs, "done")

# All five lists should have one entry per row of df.
print(len(enc_targets))
print(len(enc_off_targets))
print(len(enc_superposed))
# Report the collected list; the loop variable superposed_with_dir is only the
# last row's matrix, so its len() is that matrix's row count.
print(len(enc_superposed_with_dir))
print(len(labels))


1 / 153233 done
1001 / 153233 done
2001 / 153233 done
3001 / 153233 done
4001 / 153233 done
5001 / 153233 done
6001 / 153233 done
7001 / 153233 done
8001 / 153233 done
9001 / 153233 done
10001 / 153233 done
11001 / 153233 done
12001 / 153233 done
13001 / 153233 done
14001 / 153233 done
15001 / 153233 done
16001 / 153233 done
17001 / 153233 done
18001 / 153233 done
19001 / 153233 done
20001 / 153233 done
21001 / 153233 done
22001 / 153233 done
23001 / 153233 done
24001 / 153233 done
25001 / 153233 done
26001 / 153233 done
27001 / 153233 done
28001 / 153233 done
29001 / 153233 done
30001 / 153233 done
31001 / 153233 done
32001 / 153233 done
33001 / 153233 done
34001 / 153233 done
35001 / 153233 done
36001 / 153233 done
37001 / 153233 done
38001 / 153233 done
39001 / 153233 done
40001 / 153233 done
41001 / 153233 done
42001 / 153233 done
43001 / 153233 done
44001 / 153233 done
45001 / 153233 done
46001 / 153233 done
47001 / 153233 done
48001 / 153233 done
49001 / 153233 done
50001 / 15323

In [10]:
# Spot-check: print the first raw record, then the first entry of each
# encoded list and of the label list.
print(df.iloc[0])
print(enc_targets[0])
print(enc_off_targets[0])
print(enc_superposed[0])
print(enc_superposed_with_dir[0])
print(labels[0])  

Target sgRNA        AAATGAGAAGAAGAGGCACAGGG
Off Target sgRNA    GCATGAGAAGAAGAGACATAGCC
label                                     0
Name: 0, dtype: object
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]
[[0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
[[1. 0. 0. 1.]
 [1. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1

In [12]:
# !pip install pickle5

Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m735.4 kB/s[0m eta [36m0:00:00[0m1m816.6 kB/s[0m eta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25ldone
[?25h  Created wheel for pickle5: filename=pickle5-0.0.11-cp310-cp310-linux_x86_64.whl size=125858 sha256=61be376e59991c71dbd612b59a135cc99ded3bf02380cd89dc66af9460dea508
  Stored in directory: /home/nayeem/.cache/pip/wheels/7d/14/ef/4aab19d27fa8e58772be5c71c16add0426acf9e1f64353235c
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [

In [13]:
import os
# The standard-library pickle is used under the same alias; the pickle5
# package is a backport for older interpreters and is not needed on the
# CPython 3.10 kernel this notebook was run on.
import pickle as pkl

# Bundle every encoding and the labels under descriptive keys.
enc_dict ={"enc_targets":enc_targets,
           "enc_off_targets":enc_off_targets,
           "enc_superposed":enc_superposed,
           "enc_superposed_with_dir":enc_superposed_with_dir,
           "labels":labels}

out_file = root_path + "Encoded Data/all_encoded_data_new.pkl"

# Create the output folder if it is missing so open() cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(out_file), exist_ok=True)

with open(out_file, "wb") as f:
    pkl.dump(enc_dict, f)