In [1]:
import pandas as pd
from motif.motif_utils import seq2kmer
# importing os module  
import os 

# Health Check

In [4]:
def check_length(df, name, length=23):
    """Return the index labels of rows whose sequence is not `length` characters long.

    df     --> pandas dataframe holding the sequences
    name   --> str name of the column that holds the sequences
    length --> int expected sequence length (defaults to 23)

    Labels and values are paired positionally instead of looking each label
    up again, so frames with repeated index labels are checked row by row.
    Entries that are not strings (for example missing values) are reported
    as problematic instead of raising.
    """
    return [
        label
        for label, seq in zip(df.index, df[name])
        if not isinstance(seq, str) or len(seq) != length
    ]
def check_end_GG(df, name, suffix="GG"):
    """Return the index labels of rows whose sequence does not end with `suffix`.

    df     --> pandas dataframe holding the sequences
    name   --> str name of the column that holds the sequences
    suffix --> str ending every sequence is expected to have (defaults to 'GG')

    Labels and values are paired positionally instead of looking each label
    up again, so frames with repeated index labels are checked row by row.
    Entries that are not strings (for example missing values) are reported
    as problematic instead of raising.
    """
    return [
        label
        for label, seq in zip(df.index, df[name])
        if not isinstance(seq, str) or not seq.endswith(suffix)
    ]

def health_check(df,name):
    """Run both sequence checks on column `name` and print a short report.

    Prints the number of rows failing the GG-ending check, the number failing
    the length check, and the number of distinct rows failing at least one.

    Returns (not_gg, not_23): the two lists of offending index labels, in
    that order.
    """
    bad_length = check_length(df, name)
    bad_ending = check_end_GG(df, name)
    print("number of sequences not ending in GG:", len(bad_ending))
    print("number of sequences not 23 nucleotide:", len(bad_length))
    # A row can fail both checks; the union counts it once.
    flagged = set(bad_length).union(bad_ending)
    print("Problematic data:", len(flagged))
    return bad_ending, bad_length

# DataFrame to DNABERT k-mers

In [2]:
def mkdir(path):
    """Create the folder `path`, including any missing parent folders.

    An already existing folder is not an error, so re-running a cell stays
    quiet. Any other OSError (e.g. permissions) is printed rather than
    raised, keeping the best-effort behaviour callers rely on.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        print(error)
        
def df2kmer(df,sgrna_name,k):
    """Apply seq2kmer(seq, k) to every value of column `sgrna_name`.

    Returns a list with one seq2kmer result per row, in row order.
    """
    return [seq2kmer(sequence, k) for sequence in df[sgrna_name].to_list()]

def add_allkmers_to_df(df, sgrna_name, ks=(3, 4, 5, 6)):
    """Add one '<k>mer' column per k, filled with df2kmer(df, sgrna_name, k).

    df         --> pandas dataframe holding the sequences
    sgrna_name --> str name of the column that holds the sequences
    ks         --> iterable of k values to generate (defaults to 3, 4, 5, 6)

    The columns are written onto `df` itself; the same frame is returned so
    the call can be used in an assignment.
    """
    for k in ks:
        df[str(k) + 'mer'] = df2kmer(df, sgrna_name, k)

    return df

def save_tsv(df, split: float, name: str, ks=(3, 4, 5, 6)):
    '''
    For making tsv files for finetuning
    df --> pandas dataframe after adding kmers (needs the '<k>mer' columns and 'label')
    split --> fraction between 0 and 1 of the rows, counted from the top, that
              goes to train.tsv; the remaining rows go to dev.tsv
    name --> str path prefix; the files for each k are written to '<name><k>/'
    ks --> iterable of k values to export (defaults to 3, 4, 5, 6)

    Raises ValueError when split is not within [0, 1]; a value such as 85
    would otherwise silently put every row in train.tsv.
    '''
    if not 0 <= split <= 1:
        raise ValueError("split must be a fraction between 0 and 1, got %r" % (split,))
    split_index = int(len(df.index) * split)
    for k in ks:
        kmer_name = str(k) + 'mer'
        folder = name + str(k)
        # makedirs creates missing parent folders and tolerates re-runs.
        os.makedirs(folder, exist_ok=True)
        pairs = df[[kmer_name, "label"]].rename(columns={kmer_name: "sequence"})
        # iloc keeps the split positional whatever the index labels are.
        pairs.iloc[:split_index].to_csv(os.path.join(folder, "train.tsv"), sep='\t', index=False)
        pairs.iloc[split_index:].to_csv(os.path.join(folder, "dev.tsv"), sep='\t', index=False)

def save_txt(df, name: str, ks=(3, 4, 5, 6)):
    '''
    For making txt files for pretraining (one row of the kmer column per line)
    df --> pandas dataframe after adding kmers (needs the '<k>mer' columns)
    name --> str path prefix; each file is written to '<name><k>_mer.txt'
    ks --> iterable of k values to export (defaults to 3, 4, 5, 6)
    '''
    for k in ks:
        path = name + str(k) + '_mer.txt'
        kmer_name = str(k) + 'mer'
        # open() does not create folders, so make sure the target one exists.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w') as f:
            f.writelines(str(item) + "\n" for item in df[kmer_name])
        
    

# Make train.tsv and dev.tsv

In [86]:
df = pd.read_csv('examples/unprocessed_data/labeled_sgrna.csv')
# drop_duplicates returns a new frame; assign it so duplicates are really removed
df = df.drop_duplicates(subset='sgRNA', keep="last")
# Report rows that are not 23 characters long or do not end in GG
not_gg, not_23=health_check(df,'sgRNA')
df

number of sequences not ending in GG: 0
number of sequences not 23 nucleotide: 0
Problematic data: 0


Unnamed: 0,sgRNA,label
0,CTTGCTCGCGCAGGACGAGGCGG,1
1,ACATCAGGTTACCTCTACCAAGG,1
2,CTGATGCCAGCTAGTGGGCGAGG,0
3,CTGTTTCCCATCCTTCCGGGTGG,1
4,AATGTATGCACAGGGAACAGAGG,1
...,...,...
16744,CAACGCCCTGCTGCGGCGGCTGG,1
16745,CTAAGAAATCCTCTATCTTCAGG,0
16746,TGATCCGCCAGCGCCATATCAGG,0
16747,ATCCGAGGTGGTACCTGATATGG,0


In [43]:
# Add the kmer columns, then write train.tsv / dev.tsv with 85% of the rows in train
df = add_allkmers_to_df(df, sgrna_name="sgRNA")
save_tsv(df, name='examples/unprocessed_data/ft/', split=0.85)

# Make k_mer.txt for pretraining

### Fix unlabeled sgRNA

In [5]:
df = pd.read_csv('examples/unprocessed_data/unlabeled_sgrna.csv')
# drop_duplicates returns a new frame; assign it so duplicates are really removed
df = df.drop_duplicates(subset='0', keep="last")
# Report rows that are not 23 characters long or do not end in GG
not_gg, not_23=health_check(df,name='0')
df

number of sequences not ending in GG: 42779
number of sequences not 23 nucleotide: 0
Problematic data: 42779


Unnamed: 0,0
0,CGCCGCCGCTTTCGGTGATGAGG
1,GGCAGCGTCGTGCACGGGTCGGG
2,TGGGCGGATCACTTGACGTCAGG
3,TTACCATAGTGTACGGGTGCAGG
4,TCTACTGAAGTGGTAGCAACAGG
...,...
1066619,TTTTTTTTTGAGACGGAGTTAAG
1066620,TTTTTTTTTGAGACGGAGTTCAG
1066621,TTTTTTTTTGAGACGGAGTTCGG
1066622,TTTTTTTTTGAGACGGAGTTTAG


In [6]:
# Remove the rows flagged as not ending in GG, then re-run the checks on the rest
fixed_df = df.drop(index=not_gg)
not_gg, not_23 = health_check(fixed_df, name='0')
fixed_df

number of sequences not ending in GG: 0
number of sequences not 23 nucleotide: 0
Problematic data: 0


Unnamed: 0,0
0,CGCCGCCGCTTTCGGTGATGAGG
1,GGCAGCGTCGTGCACGGGTCGGG
2,TGGGCGGATCACTTGACGTCAGG
3,TTACCATAGTGTACGGGTGCAGG
4,TCTACTGAAGTGGTAGCAACAGG
...,...
1066614,TTTTTTTTTGAGACGGAGTCCGG
1066616,TTTTTTTTTGAGACGGAGTCGGG
1066618,TTTTTTTTTGAGACGGAGTCTGG
1066621,TTTTTTTTTGAGACGGAGTTCGG


In [7]:
# Give the sequence column its proper name and save the cleaned frame without the index
fixed_df = fixed_df.rename(columns={'0': 'sgRNA'})
fixed_df.to_csv('examples/unprocessed_data/unlabeled_sgrna_fixed.csv', index=False)
fixed_df

Unnamed: 0,sgRNA
0,CGCCGCCGCTTTCGGTGATGAGG
1,GGCAGCGTCGTGCACGGGTCGGG
2,TGGGCGGATCACTTGACGTCAGG
3,TTACCATAGTGTACGGGTGCAGG
4,TCTACTGAAGTGGTAGCAACAGG
...,...
1066614,TTTTTTTTTGAGACGGAGTCCGG
1066616,TTTTTTTTTGAGACGGAGTCGGG
1066618,TTTTTTTTTGAGACGGAGTCTGG
1066621,TTTTTTTTTGAGACGGAGTTCGG


### Making .txt files

In [13]:
df = pd.read_csv('examples/unprocessed_data/unlabeled_sgrna_fixed.csv')
# drop_duplicates returns a new frame; assign it so duplicates are really removed
df = df.drop_duplicates(subset='sgRNA', keep="last")
# Report rows that are not 23 characters long or do not end in GG
not_gg, not_23=health_check(df,name='sgRNA')
df

number of sequences not ending in GG: 0
number of sequences not 23 nucleotide: 0
Problematic data: 0


Unnamed: 0,sgRNA
0,CGCCGCCGCTTTCGGTGATGAGG
1,GGCAGCGTCGTGCACGGGTCGGG
2,TGGGCGGATCACTTGACGTCAGG
3,TTACCATAGTGTACGGGTGCAGG
4,TCTACTGAAGTGGTAGCAACAGG
...,...
1023840,TTTTTTTTTGAGACGGAGTCCGG
1023841,TTTTTTTTTGAGACGGAGTCGGG
1023842,TTTTTTTTTGAGACGGAGTCTGG
1023843,TTTTTTTTTGAGACGGAGTTCGG


In [9]:
# Add the kmer columns, then write one <k>_mer.txt file per k
df = add_allkmers_to_df(df, sgrna_name="sgRNA")
save_txt(df, name='examples/unprocessed_data/pre/')

# Test

In [10]:
# Take the row labelled 0 from each kmer column and compare the string lengths
seq1, seq2, seq3, seq4 = (df[str(k) + 'mer'][0] for k in (3, 4, 5, 6))
print("3mer","4mer","5mer","6mer")
print(len(seq1),"  ",len(seq2)," ",len(seq3),"",len(seq4))

3mer 4mer 5mer 6mer
83    99   113  125
