In [None]:
# default_exp preprocessing.data

# Data

> Classes and functions to load the datasets, clean them, and save them for further processing & vocab creation.

In [None]:
#export
from peptide.basics import *
import pandas as pd
import os

## Analyze Data
- Load all 3 datasets
- Merge into single df?
- Look for class imbalance
- Clean
- Split

In [None]:
DATA_STORE

'/Users/Vinod/.peptide/datasets'

In [None]:
os.listdir(f'{DATA_STORE}')

['.DS_Store', 'dna_binding', 'amp', 'acp']

### Anti Cancer Peptide Dataset (ACP)

In [None]:
os.listdir(f'{DATA_STORE}/acp')

['train_data.csv', 'test_data.csv']

In [None]:
# Read the untouched ACP train / test CSVs for exploration.
raw_acp_train_df, raw_acp_test_df = map(
    pd.read_csv,
    (f'{DATA_STORE}/acp/train_data.csv', f'{DATA_STORE}/acp/test_data.csv'),
)


In [None]:
# Preview the first five rows of each ACP split (head() defaults to 5 rows).
for df in (raw_acp_train_df, raw_acp_test_df):
    display(df.head())

Unnamed: 0,sequences,label
0,RRWWRRWRRW,0
1,GWKSVFRKAKKVGKTVGGLALDHYLG,0
2,ALWKTMLKKLGTMALHAGKAALGAAADTISQGTQ,1
3,GLFDVIKKVAAVIGGL,1
4,VAKLLAKLAKKVL,1


Unnamed: 0,sequences,label
0,FLPLLLSALPSFLCLVFKKC,0
1,DKLIGSCVWLAVNYTSNCNAECKRRGYKGGHCGSFLNVNCWCET,0
2,AVKDTYSCFIMRGKCRHECHDFEKPIGFCTKLNANCYM,0
3,GLPTCGETCFGGTCNTPGCTCDPWPVCTHN,1
4,ENCGRQAG,0


In [None]:
# Summary statistics per ACP split, transposed so each described column is a row.
for df in (raw_acp_train_df, raw_acp_test_df):
    display(df.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1378.0,0.5,0.500182,0.0,0.0,0.5,1.0,1.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,344.0,0.5,0.500728,0.0,0.0,0.5,1.0,1.0


In [None]:
# Sum of the label column divided by the row count for each ACP split.
# Assuming labels are strictly 0/1, this is the share of label == 1 rows.
print(f'Train: {raw_acp_train_df.label.sum() / len(raw_acp_train_df) : .2%}')
print(f'Test: {raw_acp_test_df.label.sum() / len(raw_acp_test_df) : .2%}')

Train:  50.00%
Test:  50.00%


Class split is 50 - 50

In [None]:
len(raw_acp_test_df) / (len(raw_acp_train_df) + len(raw_acp_test_df))

0.1997677119628339

Train / Test split in the total dataset 
- Test ~ 20%
- Train ~ 80%

In [None]:
# export

def get_acp_data():
    '''Read the ACP train / test CSVs and return them with standardised column names.

    Both files are read from `{DATA_STORE}/acp/`. In each frame the `sequences`
    column is renamed to `sequence` and `label` is renamed to `label_acp`.

    Returns a `(train_df, test_df)` tuple of dataframes.
    '''

    column_map = {'sequences': 'sequence', 'label': 'label_acp'}

    acp_train_df = pd.read_csv(f'{DATA_STORE}/acp/train_data.csv').rename(columns=column_map)
    acp_test_df = pd.read_csv(f'{DATA_STORE}/acp/test_data.csv').rename(columns=column_map)

    return acp_train_df, acp_test_df


### Antimicrobial Peptide Dataset (AMP)

In [None]:
os.listdir(f'{DATA_STORE}/amp')

['all_data.csv']

In [None]:
# Read the untouched AMP CSV; the folder listing above shows a single file, all_data.csv.
raw_amp_df = pd.read_csv(f'{DATA_STORE}/amp/all_data.csv')


In [None]:
raw_amp_df.head(5)

Unnamed: 0,PDBs_code,SequenceID,label
0,AP02484,GMASKAGSVLGKITKIALGAL,1
1,AP02630,NIGLFTSTCFSSQCFSSKCFTDTCFSSNCFTGRHQCGYTHGSC,1
2,AP01427,GAIKDALKGAAKTVAVELLKKAQCKLEKTC,1
3,AP02983,FFGRLKAVFRGARQGWKEHRY,1
4,AP01815,DFGCARGMIFVCMRRCARMYPGSTGYCQGFRCMCDTMIPIRRPPFIMG,1


In [None]:
raw_amp_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,4042.0,0.5,0.500062,0.0,0.0,0.5,1.0,1.0


In [None]:
raw_amp_df.label.sum() / len(raw_amp_df)

0.5

In [None]:
# export 

def get_amp_data(test_pct=0.2, seed=1234):
    '''Read the AMP CSV, tidy its columns and split it into train and test frames.

    The file `{DATA_STORE}/amp/all_data.csv` is loaded, the `PDBs_code` column
    is dropped, `SequenceID` is renamed to `sequence` and `label` to `label_amp`.

    test_pct: fraction of rows sampled into the test frame.
    seed: `random_state` passed to the sampling, so the split is repeatable.

    Returns a `(train_df, test_df)` tuple; train holds every row not sampled
    into test, and both keep the index labels of the loaded frame.
    '''

    amp_df = (
        pd.read_csv(f'{DATA_STORE}/amp/all_data.csv')
        .drop(columns=['PDBs_code'])
        .rename(columns={'SequenceID': 'sequence', 'label': 'label_amp'})
    )

    # Sample the test rows first; whatever is left over becomes the train set.
    held_out_df = amp_df.sample(frac=test_pct, random_state=seed)
    remaining_df = amp_df.drop(index=held_out_df.index)

    return remaining_df, held_out_df


### DNA-Binding Protein Dataset

In [None]:
os.listdir(f'{DATA_STORE}/dna_binding')

['test.csv', 'train.csv']

In [None]:
# Read the untouched DNA-binding train / test CSVs for exploration.
raw_dnab_train_df, raw_dnab_test_df = map(
    pd.read_csv,
    (f'{DATA_STORE}/dna_binding/train.csv', f'{DATA_STORE}/dna_binding/test.csv'),
)


In [None]:
# Preview the first five rows of each DNA-binding split (head() defaults to 5 rows).
for df in (raw_dnab_train_df, raw_dnab_test_df):
    display(df.head())

Unnamed: 0,code,sequence,label,origin
0,Q6A8L0,MSGHSKWATTKHKKAAIDAKRGKLFARLIKNIEVAARLGGGDPSGN...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
1,Q7V7T9,MIGWLQGQKVEAWQQGTRQGVVLACAGVGYEVQIAPRHLSEMEHGQ...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2,Q9ZUP2,MARILRNVYSLRSSLFSSELLRRSVVGTSFQLRGFAAKAKKKSKSD...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
3,Q2JVG1,MKCPRCGKQEIRVLESRSAEGGQSVRRRRECMSCGYRFTTYERIEF...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
4,Q9K4Q3,MTKADIIEGVYEKVGFSKKESAEIVELVFDTLKETLERGDKIKISG...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...


Unnamed: 0,code,sequence,label,origin
0,P27204|1,AKKRSRSRKRSASRKRSRSRKRSASKKSSKKHVRKALAAGMKNHLL...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
1,P53528|1,MVMVVNPLTAGLDDEQREAVLAPRGPVCVLAGAGTGKTRTITHRIA...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2,P52684|1,MKDDINQEITFRKLSVFMMFMAKGNIARTAEAMKLSSVSVHRALHT...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
3,P10961|1,MNNAANTGTTNESNVSDAPRIEPLPSLNDDDIEKILQPNDIFTTDR...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
4,P06023|1,MAKPAKRIKSAAAAYVPQNRDAVITDIKRIGDLQREASRLETEMND...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...


In [None]:
# Summary statistics per DNA-binding split, transposed so each described column is a row.
for df in (raw_dnab_train_df, raw_dnab_test_df):
    display(df.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,14189.0,0.502431,0.500012,0.0,0.0,1.0,1.0,1.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,2272.0,0.507482,0.500054,0.0,0.0,1.0,1.0,1.0


In [None]:
# Sum of the label column divided by the row count for each DNA-binding split.
# Assuming labels are strictly 0/1, this is the share of label == 1 rows.
print(f'Train: {raw_dnab_train_df.label.sum() / len(raw_dnab_train_df) : .2%}')
print(f'Test: {raw_dnab_test_df.label.sum() / len(raw_dnab_test_df) : .2%}')

Train:  50.24%
Test:  50.75%


Class split is approximately 50 - 50 (50.24% in train, 50.75% in test)

In [None]:
len(raw_dnab_test_df) / (len(raw_dnab_train_df) + len(raw_dnab_test_df))

0.1380232063665634

Train / Test split in the total dataset 
- Test ~ 14%
- Train ~ 86%

In [None]:
# export

def get_dna_bind_data():
    '''Load, clean and return DNA-binding train and test dataframes.

    Both files are read from `{DATA_STORE}/dna_binding/`. In each frame the
    `code` and `origin` columns are dropped and `label` is renamed to
    `label_dna_bind`; every other column is returned unchanged.

    Returns a `(train_df, test_df)` tuple of dataframes.
    '''

    unused_columns = ['code', 'origin']
    label_map = {'label': 'label_dna_bind'}

    dna_bind_train_df = (
        pd.read_csv(f'{DATA_STORE}/dna_binding/train.csv')
        .drop(columns=unused_columns)
        .rename(columns=label_map)
    )
    dna_bind_test_df = (
        pd.read_csv(f'{DATA_STORE}/dna_binding/test.csv')
        .drop(columns=unused_columns)
        .rename(columns=label_map)
    )

    return dna_bind_train_df, dna_bind_test_df


### Get All Data

- Load, clean, split all 3 datasets
    - Clean = retain only 2 columns in all 3 dfs - `sequence` and a dataset-specific label column (`label_acp`, `label_amp` or `label_dna_bind`)
    - Split AMP data set into train (80%) and test (20%)
- Optionally merge all datasets into a single train / test pair

In [None]:
# export

def get_all_data(test_pct=0.2, seed=1234, merge=False):
    '''Load every dataset and return the frames either separately or merged.

    test_pct, seed: forwarded to `get_amp_data`.
    merge: when False (default) the six individual frames are returned; when
        True the train frames and the test frames are each concatenated.

    Returns a list:
    - merge=False: `[acp_train, acp_test, amp_train, amp_test, dna_bind_train, dna_bind_test]`
    - merge=True: `[merged_train, merged_test]`, each built with a fresh index
      and with every missing value replaced by 0.
    '''

    acp_train_df, acp_test_df = get_acp_data()
    amp_train_df, amp_test_df = get_amp_data(test_pct=test_pct, seed=seed)
    dna_bind_train_df, dna_bind_test_df = get_dna_bind_data()

    if not merge:
        return [
            acp_train_df, acp_test_df,
            amp_train_df, amp_test_df,
            dna_bind_train_df, dna_bind_test_df,
        ]

    train_parts = [acp_train_df, amp_train_df, dna_bind_train_df]
    test_parts = [acp_test_df, amp_test_df, dna_bind_test_df]

    # Columns absent from a given part come out of concat as NaN; fillna(0) zeroes them.
    merged_train_df = pd.concat(train_parts, ignore_index=True).fillna(0)
    merged_test_df = pd.concat(test_parts, ignore_index=True).fillna(0)
    return [merged_train_df, merged_test_df]
    


Test everything above

In [None]:
# func

# Unmerged call: every returned frame should hold exactly two columns.
all_dfs = get_all_data()

for df in all_dfs:
    assert len(df.columns) == 2

acp_train_df, acp_test_df, amp_train_df, amp_test_df, dna_bind_train_df, dna_bind_test_df = all_dfs

# Merged call: row counts must equal the sum of the individual frames.
# len() has to close before `==`; with the comparison inside len() the assert
# only measured an element-wise comparison frame and could never fail on
# non-empty data.
merged_train_df, merged_test_df = get_all_data(merge=True)
assert len(merged_train_df) == len(acp_train_df) + len(amp_train_df) + len(dna_bind_train_df)
assert len(merged_test_df) == len(acp_test_df) + len(amp_test_df) + len(dna_bind_test_df)

In [None]:
merged_test_df.head()

Unnamed: 0,sequence,label_acp,label_amp,label_dna_bind
0,FLPLLLSALPSFLCLVFKKC,0.0,0.0,0.0
1,DKLIGSCVWLAVNYTSNCNAECKRRGYKGGHCGSFLNVNCWCET,0.0,0.0,0.0
2,AVKDTYSCFIMRGKCRHECHDFEKPIGFCTKLNANCYM,0.0,0.0,0.0
3,GLPTCGETCFGGTCNTPGCTCDPWPVCTHN,1.0,0.0,0.0
4,ENCGRQAG,0.0,0.0,0.0


## Export -

In [None]:
#hide
# Calls nbdev's notebook2script(); the recorded output below lists the notebooks it converted.
from nbdev.export import *
notebook2script()

Converted 00_basics.ipynb.
Converted 01_preprocessing_data.ipynb.
Converted 02_preprocessing_embedding.ipynb.
Converted 03_metrics.ipynb.
Converted index.ipynb.
