In [1]:
import datetime
import itertools
import pandas as pd

%matplotlib inline

In [2]:
# Lookup table that maps each of the eight single-letter state codes onto one
# of three classes (H, E or C). Letters are grouped by their target class;
# insertion order is G, H, I, B, E, T, S, C.
STATE_DD_8T3 = {
    state: reduced
    for reduced, states in (('H', 'GHI'), ('E', 'BE'), ('C', 'TSC'))
    for state in states
}

In [3]:
# One-letter codes regarded as standard amino acids (20 letters).
STANDARD_AAS = list('ACDEFGHIKLMNPQRSTVWY')
# Remaining capital letters that are treated as non-standard and get masked.
NON_STANDARD_AAS = list('BJOUXZ')

# Translation table sending every non-standard letter to '*'; built once so
# that masking a sequence is a single pass over the string.
_NON_STANDARD_AA_MASK = str.maketrans(dict.fromkeys(NON_STANDARD_AAS, '*'))


def mask_unk_aa(seq):
    """Return ``seq`` with every letter in NON_STANDARD_AAS replaced by '*'.

    Parameters
    ----------
    seq : str
        Sequence of one-letter codes.

    Returns
    -------
    str
        A string of the same length; all other characters are left untouched.
    """
    return seq.translate(_NON_STANDARD_AA_MASK)

In [4]:
def convert_8t3_state(sst):
    """Rewrite a state string using the STATE_DD_8T3 lookup table.

    Every character that is a key of STATE_DD_8T3 is replaced by its mapped
    value; characters that are not keys are kept as they are.

    The substitution is done in one pass with ``str.translate`` so that each
    input character is mapped exactly once. Chained ``str.replace`` calls would
    depend on dict order and could re-map a letter that an earlier replacement
    had just produced.

    Parameters
    ----------
    sst : str
        String of single-letter state codes.

    Returns
    -------
    str
        The converted string, same length as ``sst``.
    """
    # The table is built at call time so edits to STATE_DD_8T3 are picked up.
    return sst.translate(str.maketrans(STATE_DD_8T3))

In [5]:
# ISO-formatted (YYYY-MM-DD) date of the day this cell is executed. Later cells
# format it into the input and output CSV file names.
datestamp = datetime.date.today().isoformat()
datestamp

'2018-06-06'

In [6]:
%%time
adf = pd.read_csv('./raw_data/{0}-ss.csv'.format(datestamp))

CPU times: user 2.07 s, sys: 102 ms, total: 2.17 s
Wall time: 2.17 s


In [7]:
# Peek at the first row to see which columns were loaded.
adf.head(n=1)

Unnamed: 0,pdb_id,chain_code,seq,sst
0,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...


# Check amino acid characters

In [8]:
%time all_chars = set(itertools.chain(*adf.seq.values))

CPU times: user 1.12 s, sys: 0 ns, total: 1.12 s
Wall time: 1.11 s


In [9]:
# Characters found in the data that are not in STANDARD_AAS.
all_chars.difference(STANDARD_AAS)

{'B', 'O', 'U', 'X', 'Z'}

In [10]:
# Entries of STANDARD_AAS that never occur in the data.
set(STANDARD_AAS).difference(all_chars)

set()

Mask these non-standard AAs with *

In [11]:
%time adf['seq'] = adf.seq.apply(mask_unk_aa).to_frame()

CPU times: user 416 ms, sys: 4.03 ms, total: 420 ms
Wall time: 419 ms


In [12]:
# Sanity check: after masking, '*' is the only character outside STANDARD_AAS.
assert set(itertools.chain.from_iterable(adf['seq'].values)).difference(STANDARD_AAS) == {'*'}

In [13]:
# The original 'sst' column becomes 'sst8'; a separate 'sst3' column is added below.
adf = adf.rename(columns={'sst': 'sst8'})

# Create Q3 (3-state labels derived from the 8-state `sst8` column)

In [14]:
%time adf['sst3'] = adf['sst8'].apply(convert_8t3_state)

CPU times: user 979 ms, sys: 32.2 ms, total: 1.01 s
Wall time: 1.01 s


In [15]:
%time adf['len'] = adf.seq.apply(lambda s: len(s))

CPU times: user 160 ms, sys: 7.89 ms, total: 168 ms
Wall time: 155 ms


In [16]:
# Show the first row transposed so the long string columns are easier to read.
adf.head(n=1).transpose()

Unnamed: 0,0
pdb_id,101M
chain_code,A
seq,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
sst8,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...
sst3,CCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCHHHHHHCCC...
len,154


In [17]:
# Order rows by sequence length, then by pdb_id and chain_code.
adf = adf.sort_values(by=['len', 'pdb_id', 'chain_code'])

In [18]:
%time adf['has_nonstd_aa'] = adf.seq.apply(lambda s: '*' in s)

CPU times: user 84.5 ms, sys: 5 Âµs, total: 84.5 ms
Wall time: 82.7 ms


In [19]:
# First five rows after sorting (the shortest sequences come first).
adf.head(n=5)

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa
377,1A30,C,EDL,CBC,CEC,3,False
2336,1B05,B,KCK,CBC,CEC,3,False
2369,1B0H,B,KAK,CBC,CEC,3,False
2419,1B1H,B,KFK,CBC,CEC,3,False
2473,1B2H,B,KAK,CBC,CEC,3,False


In [21]:
%%time
adf.to_csv('./{0}-ss.cleaned.csv'.format(datestamp), index=False)

CPU times: user 5.59 s, sys: 195 ms, total: 5.79 s
Wall time: 9.9 s
