# Multi Layer Perceptron (MNIST) Pytorch

In [53]:
import sys
!{sys.executable} -m pip install xlrd

Collecting xlrd
  Using cached xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
Installing collected packages: xlrd
  Attempting uninstall: xlrd
    Found existing installation: xlrd 1.2.0
    Uninstalling xlrd-1.2.0:
      Successfully uninstalled xlrd-1.2.0
Successfully installed xlrd-2.0.1


In [1]:
import torch
import numpy as np
import pandas as pd
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AlbertConfig,
    AlbertForSequenceClassification,
    AlbertTokenizer,
    BertConfig,
    BertForSequenceClassification,
    BertForLongSequenceClassification,
    BertForLongSequenceClassificationCat,
    BertTokenizer,
    DNATokenizer,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    FlaubertConfig,
    FlaubertForSequenceClassification,
    FlaubertTokenizer,
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    XLMConfig,
    XLMForSequenceClassification,
    XLMRobertaConfig,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
    XLMTokenizer,
    XLNetConfig,
    XLNetForSequenceClassification,
    XLNetTokenizer,
    get_linear_schedule_with_warmup,
)

In [2]:
config_class, model_class, tokenizer_class=(BertConfig, BertForSequenceClassification, DNATokenizer)

In [3]:
 model = model_class.from_pretrained("/mnt/d/M3/Projects/BCB/DNABERT/examples/OUTPUT/pre/3mer/checkpoint-17000")

In [9]:
model(**{"input_ids": "ACATCAGGTTACCTCTACCAAGG" , "attention_mask":3})

AttributeError: 'str' object has no attribute 'size'

In [10]:
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(69, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [5]:
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 32
# percentage of training set to use as validation
valid_size = 0.2
# convert data to torch.FloatTensor
transform = transforms.ToTensor()
# choose the training and testing datasets
train_data_path = "/mnt/d/M3/Projects/BCB/DNABERT/examples/sample_data/ft/prom-core/3/train.tsv"
test_data_path = "/mnt/d/M3/Projects/BCB/DNABERT/examples/sample_data/ft/prom-core/3/dev.tsv"
train_data = pd.read_csv(train_data_path,sep="\t", header=None)
test_data = pd.read_csv(test_data_path,sep="\t", header=None)
train_data

Unnamed: 0,0,1
0,ACATCA GGTTAC CTCTAC CAAGG,1
1,CTGATG CCAGCT AGTGGG CGAGG,0
2,CTGTTT CCCATC CTTCCG GGTGG,1
3,AATGTA TGCACA GGGAAC AGAGG,0
4,CCAGAC TCACCC GCTTGC CCAGG,1
...,...,...
16744,CAACGC CCTGCT GCGGCG GCTGG,1
16745,CTAAGA AATCCT CTATCT TCAGG,0
16746,TGATCC GCCAGC GCCATA TCAGG,0
16747,ATCCGA GGTGGT ACCTGA TATGG,0


In [12]:
# obtain training indices that will be used for validation
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_index, valid_index = indices[split:], indices[:split]
# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_index)
valid_sampler = SubsetRandomSampler(valid_index)
# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size, 
                                           sampler = train_sampler, num_workers = num_workers)
valid_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size,
                                          sampler = valid_sampler, num_workers = num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = batch_size,
                                         num_workers = num_workers)

In [11]:
import torch.nn as nn
import torch.nn.functional as F
# define NN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        # number of hidden nodes in each layer (512)
        hidden_1 = 512
        hidden_2 = 512
        # linear layer (784 -> hidden_1)
        self.fc1 = nn.Linear(28*28, 512)
        # linear layer (n_hidden -> hidden_2)
        self.fc2 = nn.Linear(512,512)
        # linear layer (n_hidden -> 10)
        self.fc3 = nn.Linear(512,10)
        # dropout layer (p=0.2)
        # dropout prevents overfitting of data
        self.droput = nn.Dropout(0.2)
        
    def forward(self,x):
        # flatten image input
        x = x.view(-1,28*28)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.droput(x)
         # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer
        x = self.droput(x)
        # add output layer
        x = self.fc3(x)
        return x
# initialize the NN
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=10, bias=True)
  (droput): Dropout(p=0.2, inplace=False)
)


# Labeled SGRNA maker

## df1

In [1]:
import pandas as pd
import numpy as np

In [41]:
df = pd.read_csv('examples/sample_data/ft/prom-core/3/train.tsv', sep='\t', header=None)
df

Unnamed: 0,0,1
0,ACA TCA GGT TAC CTC TAC CAA GG,1
1,CTG ATG CCA GCT AGT GGG CGA GG,0
2,CTG TTT CCC ATC CTT CCG GGT GG,1
3,AAT GTA TGC ACA GGG AAC AGA GG,0
4,CCA GAC TCA CCC GCT TGC CCA GG,1
...,...,...
16744,CAA CGC CCT GCT GCG GCG GCT GG,1
16745,CTA AGA AAT CCT CTA TCT TCA GG,0
16746,TGA TCC GCC AGC GCC ATA TCA GG,0
16747,ATC CGA GGT GGT ACC TGA TAT GG,0


In [31]:
def space(string, mer=3):
    string = string.replace(" ", "")
    new_string=''
    k=23//mer if 23%mer==0 else 23//mer +1
    for i in range(k):
        if type(string)!=type(''): print(string)
        else: new_string+=string[i*mer:(i+1)*mer]+' ' if i!=k-1 else string[i*mer:(i+1)*mer]
    return new_string

In [43]:
string = 'GAGAACGGAAAGGAGAAGGGCGG'
space(string)

'GAG AAC GGA AAG GAG AAG GGC GG'

In [44]:
df[0]= df[0].apply(space)
df.to_csv('examples/sample_data/ft/prom-core/3/train.tsv', sep='\t',header=None,index=False)

In [50]:
import sys
!{sys.executable} -m pip install openpyxl

Collecting xlrd==1.2.0
  Downloading xlrd-1.2.0-py2.py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 508 kB/s eta 0:00:01
[?25hInstalling collected packages: xlrd
  Attempting uninstall: xlrd
    Found existing installation: xlrd 2.0.1
    Uninstalling xlrd-2.0.1:
      Successfully uninstalled xlrd-2.0.1
Successfully installed xlrd-1.2.0


In [135]:
df = pd.read_excel("/mnt/d/M3/Projects/BCB/Crispr/Dataset/unlabeled_sgrna.xlsx", engine='openpyxl')
df

Unnamed: 0,SGRNA,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,GAGTCGGGGTTTCGTCATGTTGG,,,,,
1,CGCCGCCGCTTTCGGTGATGAGG,,,,,
2,GGCAGCGTCGTGCACGGGTCGGG,,,,,
3,TGGGCGGATCACTTGACGTCAGG,,,,,
4,TTACCATAGTGTACGGGTGCAGG,,,,,
...,...,...,...,...,...,...
634846,,,,,,
634847,,,,,,
634848,,,,,,
634849,,,,,,


In [173]:
def fix_df1(df):
    df1=pd.DataFrame(df['SGRNA'].unique())
    # fix sgrna with length 21
    for i in range(371409):
        if len(df1[0][i])==21:
            df1[0][i]+="GG"
            
    #fix sgrna with lenght 20
    temp=df1[0].unique()
    temp=temp[0:-1] # last one is NaN
    missed = [ i for i in range(len(temp)) if len(temp[i])==20]
    x=temp[missed]
    for i in missed: # fix sequenc inplace with adding A
        temp[i]+="AGG"
        
    temp2 = np.append(temp, [[x+"CGG" for i in missed],[x+"TGG" for i in missed], [x+"GGG" for i in missed]]) # appending the rest
    
    #check improvements
    counter=0 
    for i in range(len(temp2)):
        if len(temp2[i])!=23:
            counter+=1
    print(counter)
    
    # find uniques
    df1=pd.DataFrame(temp2)
    df1=pd.DataFrame(df1[0].unique())
    
    #save
    df1.to_csv("/mnt/d/M3/Projects/BCB/Crispr/Dataset/unlabeled_sgrna0.csv", index = False, header = False)
    


### Test

In [136]:
df1=pd.DataFrame(df['SGRNA'].unique())
df1[0]

0         GAGTCGGGGTTTCGTCATGTTGG
1         CGCCGCCGCTTTCGGTGATGAGG
2         GGCAGCGTCGTGCACGGGTCGGG
3         TGGGCGGATCACTTGACGTCAGG
4         TTACCATAGTGTACGGGTGCAGG
                   ...           
371405    GTCGAGGAGTCAACTTCTTGTGG
371406    GCACCTGTTCGATGTACTTTTGG
371407    GCAGAATATTCGGACAAACACGG
371408    AGCAACAGAGCCTATCGGCATGG
371409                        NaN
Name: 0, Length: 371410, dtype: object

In [137]:
df1[0][371409]

nan

In [138]:
counter=0
for i in range(371409):
    if len(df1[0][i])==21:
        df1[0][i]+="GG"
        counter+=1
counter

59852

In [139]:
temp=df1[0].unique()
temp=temp[0:-1]
len(temp)

334005

In [143]:
df1=pd.DataFrame(temp)
df1.to_csv("/mnt/d/M3/Projects/BCB/Crispr/Dataset/unlabeled_sgrna0.csv", index = False, header = False)

In [140]:
counter=0
for i in range(len(temp)):
    if len(temp[i])!=23:
        counter+=1
counter

1040

In [141]:
missed = [ i for i in range(len(temp)) if len(temp[i])==20]

In [142]:
x=temp[missed]
x

array(['GGAATTGCGAATTATAAAAC', 'GGGAACTCCTCAAACACAGC',
       'GGAGACACTGCTGTTCTTTT', ..., 'GGCAACTTTTTATGCAAGTT',
       'GGTAGCTGGGACTTGCCCTT', 'GGACTTCCAGGACTGCCTTG'], dtype=object)

In [146]:
for i in missed:
    temp[i]+="AGG"

In [147]:
temp2 = np.append(temp, [[x+"CGG" for i in missed],[x+"TGG" for i in missed], [x+"GGG" for i in missed]])
len(temp2)

3578805

In [148]:
temp2

array(['GAGTCGGGGTTTCGTCATGTTGG', 'CGCCGCCGCTTTCGGTGATGAGG',
       'GGCAGCGTCGTGCACGGGTCGGG', ..., 'GGCAACTTTTTATGCAAGTTGGG',
       'GGTAGCTGGGACTTGCCCTTGGG', 'GGACTTCCAGGACTGCCTTGGGG'], dtype=object)

In [149]:
counter=0
for i in range(len(temp2)):
    if len(temp2[i])!=23:
        counter+=1
counter

0

In [150]:
df1=pd.DataFrame(temp2)
df1[0]

0          GAGTCGGGGTTTCGTCATGTTGG
1          CGCCGCCGCTTTCGGTGATGAGG
2          GGCAGCGTCGTGCACGGGTCGGG
3          TGGGCGGATCACTTGACGTCAGG
4          TTACCATAGTGTACGGGTGCAGG
                    ...           
3578800    GGCAGAAATGTGCTCTTGCAGGG
3578801    GGGAAGATTTTGGACTATTTGGG
3578802    GGCAACTTTTTATGCAAGTTGGG
3578803    GGTAGCTGGGACTTGCCCTTGGG
3578804    GGACTTCCAGGACTGCCTTGGGG
Name: 0, Length: 3578805, dtype: object

In [151]:
df1=pd.DataFrame(df1[0].unique())
df1[0]

0         GAGTCGGGGTTTCGTCATGTTGG
1         CGCCGCCGCTTTCGGTGATGAGG
2         GGCAGCGTCGTGCACGGGTCGGG
3         TGGGCGGATCACTTGACGTCAGG
4         TTACCATAGTGTACGGGTGCAGG
                   ...           
337120    GGCAGAAATGTGCTCTTGCAGGG
337121    GGGAAGATTTTGGACTATTTGGG
337122    GGCAACTTTTTATGCAAGTTGGG
337123    GGTAGCTGGGACTTGCCCTTGGG
337124    GGACTTCCAGGACTGCCTTGGGG
Name: 0, Length: 337125, dtype: object

In [152]:
df1.to_csv("/mnt/d/M3/Projects/BCB/Crispr/Dataset/unlabeled_sgrna0.csv", index = False, header = False)

# df2

In [2]:
col_list  = ["sequence", "cas"]
df = pd.read_csv("/mnt/d/M3/Projects/BCB/Crispr/Dataset/GenomeCRISPR_full05112017.csv/GenomeCRISPR_full05112017.csv", usecols=col_list)
df

Unnamed: 0,sequence,cas
0,GCAGCATCCCAACCAGGTGGAGG,hSpCas9
1,GCGGGAGTGAGAGGACTGGGCGG,hSpCas9
2,ATGACTCTCATACTCCACGAAGG,hSpCas9
3,GAGTCATCGAGCAGCTGCCATGG,hSpCas9
4,AGTCACCCTAGCAAAACCAGTGG,hSpCas9
...,...,...
38473695,CTGAAACATTTAACCAGTTGTGG,hSpCas9
38473696,GACCAATATACCCATAGCCTTGG,hSpCas9
38473697,GTTCTTCAACAGTCCACAACTGG,hSpCas9
38473698,TATTGGTCCCATACGATCTCAGG,hSpCas9


In [21]:
def combination(prefix=[''], allowed=['A','C','T','G'],length=3):
    newprefix=[]
    if length == 0:
        return prefix
    for i in allowed:
        for j in prefix:
            newprefix.append(j+i)
    return combination(prefix=newprefix, allowed=allowed, length=length-1)

def add(missed_data, missing=1):
    allowed = combination(prefix=[''], allowed=['A','C','T','G'],length=missing)
    need = []
    for i in allowed:
        need.append([i+x for x in missed_data])
    return need

def fix_length(array, length=20):
    
    if length < 23:
        # find missing data
        missed_index = [ i for i in range(len(array)) if len(array[i])==length]
        missed_data = array[missed_index]

        # remove missing data
        result = np.delete(array, missed_index)

        # fix missing data
        need = add(missed_data, missing = 23 - length)

#         print(need)
        result = np.append(result, need)

        # check fix
        counter=0
        for i in range(len(result)):
            if len(result[i])==length:
                counter+=1
#         print(counter)

        return np.unique(result)
    
    else:
        counter=0
        for i in range(len(array)):
            if len(array[i]) > 23:
                array[i] = array[i][-23:]
                counter+=1
#         print(counter)
        
        return np.unique(array)
        

def fix_df2(df):
    df2=pd.DataFrame(df['sequence'].unique())
    NaN=[]
    counter=0
    for i in range(747171):
        try:
            if len(df2[0][i])!=23:
                counter+=1
        except:
            NaN.append(i)
            
    df2=df2.drop(NaN)
    df2.index = range(len(df2))
    
    temp=df2[0].unique()
#     print(len(temp))
    temp = fix_length(temp,length=20)
    # print(len(temp))
    temp = fix_length(temp,length=21)
    # print(len(temp))
    temp = fix_length(temp,length=22)
#     print(len(temp))
    temp = fix_length(temp,length=24)
#     print(len(temp))
    df2=pd.DataFrame(temp)
    df2.columns= ["sgRNA"]
    return df2

    

In [3]:
df1=pd.DataFrame(pd.unique(df[['sequence', "cas"]].values.ravel('K')))
df1

Unnamed: 0,0
0,GCAGCATCCCAACCAGGTGGAGG
1,GCGGGAGTGAGAGGACTGGGCGG
2,ATGACTCTCATACTCCACGAAGG
3,GAGTCATCGAGCAGCTGCCATGG
4,AGTCACCCTAGCAAAACCAGTGG
...,...
747169,GCCAAGCCAGAGAAGACCCGTGG
747170,GCTCCTGAGCGAGATGGCGGCGG
747171,hSpCas9
747172,dCas9-VP64


In [3]:
df2=pd.DataFrame(df['sequence'].unique())
df2

Unnamed: 0,0
0,GCAGCATCCCAACCAGGTGGAGG
1,GCGGGAGTGAGAGGACTGGGCGG
2,ATGACTCTCATACTCCACGAAGG
3,GAGTCATCGAGCAGCTGCCATGG
4,AGTCACCCTAGCAAAACCAGTGG
...,...
747166,GCAGCCGCCCGGCCTCACCGCGG
747167,TGTGAAAGATGCCTCCGCCGCGG
747168,GCATCTTTCACAGAATCCGGCGG
747169,GCCAAGCCAGAGAAGACCCGTGG


In [4]:
counter=0
for i in range(747171):
    try:
        if len(df2[0][i])!=23:
            counter+=1
    except:
        print(i, end=", ")
counter

695153, 

83998

In [6]:
df2[0][695153]

nan

In [5]:
df2=df2.drop([695153])
df2.index = range(len(df2))
df2

Unnamed: 0,0
0,GCAGCATCCCAACCAGGTGGAGG
1,GCGGGAGTGAGAGGACTGGGCGG
2,ATGACTCTCATACTCCACGAAGG
3,GAGTCATCGAGCAGCTGCCATGG
4,AGTCACCCTAGCAAAACCAGTGG
...,...
747165,GCAGCCGCCCGGCCTCACCGCGG
747166,TGTGAAAGATGCCTCCGCCGCGG
747167,GCATCTTTCACAGAATCCGGCGG
747168,GCCAAGCCAGAGAAGACCCGTGG


In [51]:
myfix=[]
counter=0
for i in range(747170):
    if len(df2[0][i])==25:
#         df2[0][i]+="GG"
        myfix.append(df2[0][i])
        counter+=1
# df2=df2.drop(myfix)
# df2.index = range(len(df2))
counter

269

In [52]:
myfix

['CGGCCGCTGCTGCCGCTCCTTGTGG',
 'CCGCTGCTGCCGCTCCTTGTGGTGG',
 'TGGTGGCCGCGTGCGTCCTGCCCGG',
 'CCACGGCGGAGCGGAGAGCCCTCGG',
 'CGGAGCGGAGAGCCCTCGGCATCGG',
 'GAGCAAGCGGTGGAGACAGAGCCGG',
 'GCTGGGGATGCTGCGCTATCGGTGG',
 'AATATCACCTACAGGTCACCGTAGG',
 'TGCCGAAGCCCAGGACCGTGTTTGG',
 'CTTGCCCCGAGGCCCCGGACCTCGG',
 'CTGCTGCTGTTGTTGGCGCCGCCGG',
 'TAGAGCGAACACGAACCATCCAAGG',
 'GAGCCCGGGCGCTGGAGGCTCCTGG',
 'CTGCGGGCCCCGGCCGCTCGCCCGG',
 'CGGCTTCGGGACCGTACGCAAAAGG',
 'ACCACTGCACAGTGCGCGTGTCCGG',
 'GAAGTGGCTGAGTACCTGACCCCGG',
 'ACCTCAGAGCCCGTTTGGATACTGG',
 'CCAAGAGTAAGTTATTTGACGTTGG',
 'AAGAAGCTGAACGAGTATCGGCTGG',
 'TTTGCCCCTTTTAGTAGTGCCTTGG',
 'GAGCAGCTGCTAGCCTCGCCCAAGG',
 'GCCTGGGCACGCTGGCGCTGTGCGG',
 'AGGAAAGCGGCGCAGCTGCCCTGGG',
 'GTTGCTCTCCGGCGGCCTCCCTCGG',
 'TGTTCCGTCTTCCACCTGTTCGTGG',
 'AGCAGGACACGCAGGGCCTGGACGG',
 'GCTGGTGGCTCTGCTCGCTGCATGG',
 'CCTGGGAACCGCCTCAAGATCTTGG',
 'ATCCTGGACCGTGTCACCATCCAGG',
 'CCATTTATTGAAACTCCTCGCCAGG',
 'ACGTCCAACAACAGCACCATGCAGG',
 'CTCTGTGTGCTGGCCATATCGCTGG',
 'CTGGCCAT

In [16]:
temp=df2[0].unique()
len(temp)

747170

In [57]:
a="CGGCCGCTGCTGCCGCTCCTTGTGG"
len(a)
len(a[-23:])

23

In [18]:
myfix=[]
counter=0
for i in range(len(df2)):
    if len(df2[0][i])>23:
#         df2[0][i]+="GG"
        myfix.append(df2[0][i])
        counter+=1

counter

470

In [30]:
myfix

['ATCAGCCAGAGCCCACACGGGG',
 'AGCGGAGAGCCCTCGGCATCGG',
 'CTGGCTGTTCCCCGGGCTGTGG',
 'TTCCCCGGGCTGTGGCTGTTGG',
 'GCCAAGGTGGAGCAAGCGGTGG',
 'AAGAATGATGACCCTTCAAAGG',
 'TTTCTGGTGATCCAGCCCCTGG',
 'CCCCTGGTGATCTGGATGCCGG',
 'ATAGGTGATAGTTGGGAATGG',
 'GGTCTGTGATGTCACATCTAGG',
 'GACAGACATGTCTTCCCATGG',
 'TGATTGTCCAGGCTCGGCTTGG',
 'AATAACCAAATGCAGCGGAAGG',
 'CCACATCGTGCGGCAGCTGCGG',
 'ATTAATACTGTGAAGGGAAAGG',
 'GGGAAGAATTGAAAGTGAAGG',
 'TGCTTCGAGATGTGTGGTTTGG',
 'TGAAAAAGCACTTTCAGAAGG',
 'TGATTCACCCCCAGGGGAGG',
 'TCCAACAACAGCACCATGCAGG',
 'CTGGCCTCTATGGTGACCGAGG',
 'ACCTCTGACAGCACGTTCCTGG',
 'GATGAATTATTACAGACAAGGG',
 'GGGCTCCGCAGGGGGCACCCGG',
 'GGCTGTATTGAAGAGCAGGGGG',
 'CGATCCATCAAGACCACCCGGG',
 'GCGGCTCCTGCGGCGGTGCCGG',
 'GGCTGCTACGGGGCCATCCAGG',
 'CAGTTGGAGCAGTTTATTTTGG',
 'ACGGGCCCGCTCAGACCAATGG',
 'AAGCTGGAGCAGCTGCTCCTGG',
 'TACACCGAGTGCAGCCACTCGG',
 'GAGCGGCGGCTGCGCGCGCTGG',
 'GAGCAGCTGGCGCGGGGCGAGG',
 'GCGGCTGCCCGGGGCTCGACGG',
 'CTCAGCGTCACCTGGGTGGAGG',
 'GCGAGTGTCCACTGTGCACGGG',
 'TGCAA

In [202]:
temp=df2[0].unique()
temp=temp[0:-1]
len(temp)

747169

In [22]:
temp=df2[0].unique()
print(len(temp))
temp = fix_length(temp,length=20)
# print(len(temp))
temp = fix_length(temp,length=21)
# print(len(temp))
temp = fix_length(temp,length=22)
print(len(temp))
temp = fix_length(temp,length=24)
print(len(temp))

747170
965615
965459


In [24]:
myfix=[]
counter=0
for i in range(len(temp)):
    if len(temp[i])!=23:
#         df2[0][i]+="GG"
        myfix.append(temp[i])
        counter+=1

counter

0

In [25]:
df2=pd.DataFrame(temp)
df2.columns= ["sgRNA"]
df2

Unnamed: 0,sgRNA
0,AAAAAAAAAAAGGGGGGTTCTGG
1,AAAAAAAAAAAGTCGTACTTGGG
2,AAAAAAAAAAGAGCACGATGAGG
3,AAAAAAAAAATTCCACCCCCTGG
4,AAAAAAAAAATTGCCGGGCGAGG
...,...
965454,TTTTTTTTTGAGACGGAGTTAAG
965455,TTTTTTTTTGAGACGGAGTTCAG
965456,TTTTTTTTTGAGACGGAGTTCGG
965457,TTTTTTTTTGAGACGGAGTTTAG


In [26]:
df1 = pd.read_csv("/mnt/d/M3/Projects/BCB/Crispr/Dataset/unlabeled_sgrna0.csv")
df1.columns= ["sgRNA"]
df1

Unnamed: 0,sgRNA
0,CGCCGCCGCTTTCGGTGATGAGG
1,GGCAGCGTCGTGCACGGGTCGGG
2,TGGGCGGATCACTTGACGTCAGG
3,TTACCATAGTGTACGGGTGCAGG
4,TCTACTGAAGTGGTAGCAACAGG
...,...
337119,GGCAGAAATGTGCTCTTGCAGGG
337120,GGGAAGATTTTGGACTATTTGGG
337121,GGCAACTTTTTATGCAAGTTGGG
337122,GGTAGCTGGGACTTGCCCTTGGG


In [27]:
df3 = pd.concat([df1,df2], ignore_index=True)
df3

Unnamed: 0,sgRNA
0,CGCCGCCGCTTTCGGTGATGAGG
1,GGCAGCGTCGTGCACGGGTCGGG
2,TGGGCGGATCACTTGACGTCAGG
3,TTACCATAGTGTACGGGTGCAGG
4,TCTACTGAAGTGGTAGCAACAGG
...,...
1302578,TTTTTTTTTGAGACGGAGTTAAG
1302579,TTTTTTTTTGAGACGGAGTTCAG
1302580,TTTTTTTTTGAGACGGAGTTCGG
1302581,TTTTTTTTTGAGACGGAGTTTAG


In [28]:
df4=pd.DataFrame(df3['sgRNA'].unique())
df4

Unnamed: 0,0
0,CGCCGCCGCTTTCGGTGATGAGG
1,GGCAGCGTCGTGCACGGGTCGGG
2,TGGGCGGATCACTTGACGTCAGG
3,TTACCATAGTGTACGGGTGCAGG
4,TCTACTGAAGTGGTAGCAACAGG
...,...
1066619,TTTTTTTTTGAGACGGAGTTAAG
1066620,TTTTTTTTTGAGACGGAGTTCAG
1066621,TTTTTTTTTGAGACGGAGTTCGG
1066622,TTTTTTTTTGAGACGGAGTTTAG


In [29]:
df4.to_csv("/mnt/d/M3/Projects/BCB/Crispr/Dataset/unlabeled_sgrna.csv", index = False, header = True)

In [1]:
with open("/mnt/d/M3/Projects/BCB/Crispr/Dataset/unlabeled_sgrna.txt","w") as f:
    for i in df4[0]:
        f.write(space(i, mer=3)+"\n")

NameError: name 'df4' is not defined