In [1]:
import os, re
import pandas as pd
import numpy as np

In [2]:
## read blastp tables
# Column layout of the tabular BLAST output files (14 fields, no header row).
col_names = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']
blast_files = ['less30.APD.blastp', 'more30.APD.blastp']
df = pd.concat(
    [pd.read_csv(path, sep='\t', header=None) for path in blast_files],
    ignore_index=True,
)
df.columns = col_names

## calculate query_coverage
# Fraction of the query covered by the aligned span (qstart..qend, inclusive).
aligned_span = df['qend'] - df['qstart'] + 1
df['qcoverage'] = aligned_span / df['qlen']

In [3]:
## keep only ungapped alignments (drop every hit with gapopen > 0)
ungapped = df['gapopen'] == 0
df = df.loc[ungapped]

In [4]:
## keep hits with few mismatches (drop the rest)
# df is a filtered slice at this point; take an explicit copy so that adding
# columns below does not raise pandas' SettingWithCopyWarning.
df = df.copy()

# Effective mismatch count = mismatches reported inside the alignment
# + query residues that are not covered by the alignment at all.
df['mis_count'] = df['mismatch'] + (df['qlen'] - (df['qend'] - df['qstart'] + 1))

# Tolerance depends on query length: at most 1 for qlen <= 10, at most 2 otherwise.
short_ok = (df['qlen'] <= 10) & (df['mis_count'] <= 1)
long_ok = (df['qlen'] > 10) & (df['mis_count'] <= 2)
df['flag'] = (short_ok | long_ok).astype(int)  # 1 = keep, 0 = drop

df = df[df['flag'] == 1]

In [5]:
## remove Repeats (multi-hits on same query-target pairs)
pair_cols = ['qseqid', 'sseqid']

# keep=False flags every row of a (query, subject) pair that occurs more than once.
is_repeat = df.duplicated(subset=pair_cols, keep=False)

# Distinct repeated pairs, sorted so the report order is stable.
repeat_pairs = sorted(set(zip(df.loc[is_repeat, 'qseqid'], df.loc[is_repeat, 'sseqid'])))
for (qid, sid) in repeat_pairs:
    print('Multi-hits on %s-%s pair' % (qid, sid))

# Drop all rows of every repeated pair. The mask is applied unconditionally, so
# a single repeated pair is removed too (the former `len(repeat_pairs) > 1`
# guard left it in the table).
df = df[~is_repeat]

In [6]:
## output table
# Save the filtered hits as a tab-separated file, without the row index.
df.to_csv(path_or_buf='APD.blastp.rmrep.tsv', index=False, sep='\t')

## CD-HIT & dataset preparation

In [7]:
# Two-column, header-less table: sequence ID and sequence.
uniprot = pd.read_csv(
    'UniProt/cdhit-2d/all_300.APD.rmdup90.tab',
    sep='\t',
    header=None,
)
uniprot.columns = ['ID', 'Seq']

# ID -> sequence lookup; if an ID is repeated, the last row wins.
uniprot_dict = {
    seq_id: seq
    for seq_id, seq in zip(uniprot['ID'].tolist(), uniprot['Seq'].tolist())
}

In [8]:
df = pd.read_csv('APD.blastp.rmrep.tsv', sep='\t')

# Replace each subject ID by its second '|'-separated field.
df['sseqid'] = df['sseqid'].map(lambda raw_id: raw_id.split('|')[1])

# Keep only hits whose subject is present in the UniProt lookup.
in_uniprot = df['sseqid'].isin(list(uniprot_dict))
df = df[in_uniprot]

In [10]:
# The helper column 'flag' is no longer needed once the filtering is done.
df = df.drop(columns=['flag'])

In [11]:
# Order hits by subject, then by start position; for equal starts the hit
# with the larger end coordinate comes first.
sort_keys = ['sseqid', 'sstart', 'send']
df = df.sort_values(by=sort_keys, ascending=[True, True, False])
df.to_csv('APD.blastp.rmrep90.tsv', index=False, sep='\t')

In [14]:
def get_token_labels(id, mtx):
    """Select non-overlapping hits on one subject and build its per-residue labels.

    Hits are ordered by ``sstart`` then ``send`` (both ascending) and scanned
    greedily: the first hit is always kept, and a later hit is kept only when
    it starts strictly after the end of the most recently kept hit. Hits that
    overlap the last kept one are dropped and do not contribute to the labels.

    Parameters
    ----------
    id : object
        Unused. Kept so that existing calls keep working (note that the name
        shadows the ``id`` builtin inside this function).
    mtx : pandas.DataFrame
        Hits for a single subject. Must be non-empty and provide the columns
        ``sstart`` and ``send`` (used as 1-based, inclusive coordinates) and
        ``slen``; ``slen`` is read from the first row after sorting.

    Returns
    -------
    rmdup_mtx : pandas.DataFrame
        The kept rows in scan order, re-indexed 0..k-1, with the same columns
        and column dtypes as ``mtx``.
    a : numpy.ndarray
        Float array of length ``slen`` holding 1.0 at every position covered
        by a kept hit and 0.0 elsewhere.

    Raises
    ------
    IndexError
        If ``mtx`` is empty, or if a kept hit extends past ``slen``.
    """
    mtx = mtx.sort_values(by=['sstart', 'send'], ascending=[True, True])
    slen = mtx.slen.tolist()[0]
    a = np.zeros(slen)

    sstarts = mtx.sstart.tolist()
    sends = mtx.send.tolist()

    # Greedy scan. Only row positions are recorded here; the rows themselves
    # are selected in one go below instead of being appended one at a time.
    kept = [0]
    last_end = sends[0]
    for i in range(1, len(sstarts)):
        if sstarts[i] > last_end:
            kept.append(i)
            last_end = sends[i]

    for i in kept:
        # Index array rather than a slice: a hit running past ``slen`` raises
        # IndexError instead of being silently clipped.
        a[np.arange(sstarts[i] - 1, sends[i])] = 1.0

    rmdup_mtx = mtx.iloc[kept].reset_index(drop=True)
    return rmdup_mtx, a
                      

# Make sure the output directories exist before anything is written.
os.makedirs('labels', exist_ok=True)
os.makedirs('dataset', exist_ok=True)

# Collect per-subject results in lists and assemble the frames once at the end;
# growing a DataFrame inside the loop copies it on every iteration.
rmdup_frames = []
dataset_rows = []
for sid, mtx in df.groupby(by='sseqid'):
    # Pass the subject ID itself (previously the builtin `id` was passed).
    mtx_rmdup, a = get_token_labels(sid, mtx)
    rmdup_frames.append(mtx_rmdup)
    # One label array per subject, stored as labels/<sid>.npy.
    np.save(os.path.join('labels', sid+'.npy'), a)
    dataset_rows.append([1, sid, uniprot_dict[sid]])

if rmdup_frames:
    df_rmdup = pd.concat(rmdup_frames, ignore_index=True)
else:
    # No subjects at all: still write a header-only table.
    df_rmdup = pd.DataFrame(columns=df.columns)
dataset = pd.DataFrame(dataset_rows, columns=['Class','ProId','Sequence'])

df_rmdup.to_csv('APD.blastp.rmrep90.rmdup.tsv',sep='\t',index=False)
dataset.to_csv('dataset/test.csv', sep=',',index=False)
    