## Download datasets from TorchDrug

In [2]:
import os
import requests
import tarfile
import pandas as pd
import lmdb 
import pickle

[TorchProtein benchmark results](https://torchprotein.ai/benchmark)

[TorchDrug datasets API reference](https://torchdrug.ai/docs/api/datasets.html#betalactamase)

In [12]:
def download_extract_file(url, output_dir='./'):
    """Download a ``.tar.gz`` archive and unpack it into ``output_dir``.

    The archive is streamed to ``output_dir`` under the last path component of
    ``url``, extracted there, and the downloaded archive is then deleted.

    Parameters
    ----------
    url : str
        Address of the gzip-compressed tar archive.
    output_dir : str, default './'
        Directory that receives both the temporary archive and the extracted
        files. It is created if it does not exist. A trailing slash is optional.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here avoids
        continuing with an archive that was never written.
    """
    file_name = url.split("/")[-1]
    archive_path = os.path.join(output_dir, file_name)
    if output_dir:
        # os.makedirs('') raises, so only create a directory when one is named.
        os.makedirs(output_dir, exist_ok=True)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(archive_path, "wb") as file:
            # Copy the raw (undecoded) body in 1 MiB chunks so the archive is
            # never held in memory as a whole.
            for chunk in iter(lambda: response.raw.read(1024 * 1024), b""):
                file.write(chunk)

    # Extract the tar.gz file
    with tarfile.open(archive_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, refuse members that would escape
            # output_dir or create special files.
            tar.extractall(path=output_dir, filter="data")
        else:
            tar.extractall(path=output_dir)

    # Clean up the tar.gz file once its contents are on disk
    os.remove(archive_path)

In [6]:
# Benchmark archives (.tar.gz) to download, one variable per dataset.
# The download cell refers to these variables by name (url1 ... url6), so keep
# the numbering contiguous when adding or removing an entry.
url1 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/beta_lactamase.tar.gz"

# NOTE(review): url2 and url3 are fetched over plain http — consider https if the host supports it.
url2 = "http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/stability.tar.gz"

url3 = "http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/fluorescence.tar.gz"

url4 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/solubility.tar.gz"

# Two separate subcellular-localization archives: the "_2" variant and the plain one.
url5 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization_2.tar.gz"

url6 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization.tar.gz"


In [17]:
# Fetch and unpack every benchmark archive into the shared benchmarks folder.
# The URL variables are listed explicitly rather than looked up by name with
# eval(), so a missing or renamed variable fails immediately and visibly.
for url in (url1, url2, url3, url4, url5, url6):
    print('Downloading: ', url)
    download_extract_file(url, output_dir='data/benchmarks/')

Downloading:  http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/stability.tar.gz
Downloading:  http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/fluorescence.tar.gz
Downloading:  https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/solubility.tar.gz
Downloading:  https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization_2.tar.gz
Downloading:  https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization.tar.gz


## Extract

In [4]:
def load_data_from_lmdb(data_path):
    """Read every dict-valued record of an LMDB database into a DataFrame.

    Each stored value is unpickled. Values that do not unpickle to a ``dict``
    are skipped. Byte-string fields of a record are decoded as UTF-8 and the
    record becomes one row.

    Parameters
    ----------
    data_path : str
        Path handed to ``lmdb.open`` (opened read-only).

    Returns
    -------
    pandas.DataFrame
        One row per record, in cursor order. The LMDB key is exposed as the
        ``ID`` column with a ``mut_`` prefix, and a ``primary`` column, when
        present, is renamed to ``sequence``.
    """
    frames = []
    # Using the environment as a context manager closes it on exit; the
    # transaction is released the same way.
    with lmdb.open(data_path, readonly=True) as env, env.begin(write=False) as txn:
        for key, value in txn.cursor():
            # NOTE(security): pickle.loads can execute arbitrary code; only
            # open LMDB files that come from a trusted source.
            data = pickle.loads(value)
            if not isinstance(data, dict):
                continue

            # Decode the binary fields without mutating the unpickled dict.
            record = {
                k: v.decode('utf-8') if isinstance(v, bytes) else v
                for k, v in data.items()
            }
            # One single-row frame per record keeps the original handling of
            # fields (e.g. length-1 sequences are broadcast onto the one row).
            frames.append(pd.DataFrame(record, index=[key.decode('utf-8')]))

    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies all accumulated rows on every iteration.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    df = df.reset_index(drop=False).rename(columns={'index': 'ID', 'primary': 'sequence'})
    df['ID'] = ['mut_' + str(i) for i in df['ID']]
    return df

In [5]:
def save_fasta(df, output_file):
    """Write the rows of ``df`` to ``output_file`` in FASTA format.

    For every row a header line ``>`` followed by the ``ID`` value is written,
    then a line holding the ``sequence`` value. An existing file is overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ID`` and ``sequence``.
    output_file : str
        Destination path of the FASTA file.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(
            f'>{record["ID"]}\n{record["sequence"]}\n'
            for _, record in df.iterrows()
        )

## Beta lactamase benchmark

In [4]:
# Load the validation split of the beta-lactamase benchmark and display the
# resulting frame (ID, sequence and label columns) to inspect its layout.
lmdb_dir = f'data/benchmarks/beta_lactamase/beta_lactamase_valid.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,scaled_effect1
0,mut_0,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.884452
1,mut_1,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,1.020624
2,mut_10,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.740940
3,mut_100,MSIQHFRVALIPFFAAFCLPVFAHPETLNKVKDAEDQLGARVGYIE...,1.055183
4,mut_101,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.999580
...,...,...,...
515,mut_95,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,-0.008031
516,mut_96,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.927595
517,mut_97,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,1.033065
518,mut_98,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,1.003204


In [5]:
# Export every split of the beta-lactamase benchmark: the LMDB records become
# a metadata CSV and a FASTA file written next to the source database.
for split in ('test', 'train', 'valid'):
    lmdb_dir = f'data/benchmarks/beta_lactamase/beta_lactamase_{split}.lmdb'
    csv_path = f'data/benchmarks/beta_lactamase/beta_lactamase_{split}_metadata.csv'
    fasta_path = f'data/benchmarks/beta_lactamase/beta_lactamase_{split}_data.fasta'

    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(csv_path, index=False)
    save_fasta(meta_data, fasta_path)

## Fluorescence

In [6]:
# Load the validation split of the fluorescence benchmark and display the
# resulting frame to inspect its columns.
lmdb_dir = f'data/benchmarks/fluorescence/fluorescence_valid.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,protein_length,log_fluorescence,num_mutations
0,mut_0,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.552324,3
1,mut_1,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.689570,3
2,mut_10,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,1.299777,3
3,mut_100,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,1.301031,2
4,mut_1000,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.569029,3
...,...,...,...,...,...
5357,mut_995,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.569963,3
5358,mut_996,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.921040,2
5359,mut_997,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.691537,2
5360,mut_998,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLI...,237,3.689303,2


In [7]:
# Export every split of the fluorescence benchmark: the LMDB records become
# a metadata CSV and a FASTA file written next to the source database.
for split in ('test', 'train', 'valid'):
    lmdb_dir = f'data/benchmarks/fluorescence/fluorescence_{split}.lmdb'
    csv_path = f'data/benchmarks/fluorescence/fluorescence_{split}_metadata.csv'
    fasta_path = f'data/benchmarks/fluorescence/fluorescence_{split}_data.fasta'

    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(csv_path, index=False)
    save_fasta(meta_data, fasta_path)

## Subcellular localization (binary)

In [8]:
# Load the test split of the subcellular_localization_2 benchmark and display
# the resulting frame to inspect its columns.
lmdb_dir = f'data/benchmarks/subcellular_localization_2/subcellular_localization_2_test.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,localization
0,mut_0,MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...,0
1,mut_1,AGFPEQEPEPKFWNDWAQKTLDKALSLQTLNKNKAQNLILFLGDGM...,0
2,mut_10,MEDEAVLDRGASFLKHVCDEEEVEGHHTIYIGVHVPKSYRRRRRHK...,0
3,mut_100,MEHHNSHLLPGGSEKMYYIAHQQPMLRNEDDNYQEGYFIRPDPASL...,0
4,mut_1000,MARISCDLRFLLIPAAFMFIYIQMRLFQTQSQYADRLSSAIESENH...,0
...,...,...,...
1744,mut_995,MDKTKMFSAINLGVGGIFVLSGFIKLFSFSFVNALLALFIIVFGLG...,0
1745,mut_996,MFAPRLLDFQKTKYARFMNHRVPAHKRYQPTEYEHAANCATHAFWI...,0
1746,mut_997,MHPALLCGPILAIFLQFLVSSCSPLENDDLFLVQVEPEVDPVVAAE...,0
1747,mut_998,MSYGREDTTIEPDFIEPDAPLAASGGVADNIGGTMQNSGSRGTLDE...,0


In [9]:
# Export every split of the subcellular_localization_2 benchmark: the LMDB
# records become a metadata CSV and a FASTA file next to the source database.
for split in ('test', 'train', 'valid'):
    lmdb_dir = f'data/benchmarks/subcellular_localization_2/subcellular_localization_2_{split}.lmdb'
    csv_path = f'data/benchmarks/subcellular_localization_2/subcellular_localization_2_{split}_metadata.csv'
    fasta_path = f'data/benchmarks/subcellular_localization_2/subcellular_localization_2_{split}_data.fasta'

    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(csv_path, index=False)
    save_fasta(meta_data, fasta_path)

## Subcellular localizations

In [10]:
# Load the training split of the subcellular_localization benchmark and
# display the resulting frame to inspect its columns.
lmdb_dir = f'data/benchmarks/subcellular_localization/subcellular_localization_train.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,localization
0,mut_0,MEFRGSGATAVEQHLLQSETPGKNGLQATSSDQVGRTLRWFTTVVL...,0
1,mut_1,MKNSTAASSRWTKSRLSHFFPSYTNSSGMGAASTDQSSTQGEELHH...,0
2,mut_10,MAAMLMQPWPPFLPHLTLVFLTLILFFPNQSFSQSDSPRNIETFFP...,0
3,mut_100,MARGWVRPSRVPLCARAVWTAAALLLWTPWTAGEVEDSEAIDTLGQ...,0
4,mut_1000,MTAEEMKAAENGAQSAPLPLEGVDISPKQDEGVLKVIKREGTGTET...,1
...,...,...,...
8415,mut_995,MASEVIALCHSFEQELAKSLNVLPPVSASKPDAHDAHLNHHRLSQR...,1
8416,mut_996,MRIYQCHFCSSPCYPGHGIMFVRNDAKEFRFCRSKCHKAFKQRRNP...,1
8417,mut_997,MASRQGFSNVNEDEPELPPSVLSLKSKFESLSTGDLTNLDEKTAKR...,1
8418,mut_998,MGDTAKPYFVKRTKDRGTMDDDDFRRGHPQQDYLIIDDHAKGHGSK...,1


In [11]:
# Add the length of each sequence as a new `length` column (this modifies the
# `meta_data` frame loaded above in place) and summarise its distribution.
meta_data['length'] = meta_data['sequence'].apply(len)
meta_data['length'].describe()

count    8420.000000
mean      470.139430
std       277.797598
min        40.000000
25%       249.000000
50%       423.000000
75%       643.000000
max      1000.000000
Name: length, dtype: float64

In [12]:
# Export every split of the subcellular_localization benchmark: the LMDB
# records become a metadata CSV and a FASTA file next to the source database.
for split in ('test', 'train', 'valid'):
    lmdb_dir = f'data/benchmarks/subcellular_localization/subcellular_localization_{split}.lmdb'
    csv_path = f'data/benchmarks/subcellular_localization/subcellular_localization_{split}_metadata.csv'
    fasta_path = f'data/benchmarks/subcellular_localization/subcellular_localization_{split}_data.fasta'

    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(csv_path, index=False)
    save_fasta(meta_data, fasta_path)

## Solubility

In [10]:
# Load the training split of the solubility benchmark and display the
# resulting frame to inspect its columns.
lmdb_dir = f'data/benchmarks/solubility/solubility_train.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,solubility
0,mut_0,GMILKTNLFGHTYQFKSITDVLAKANEEKSGDRLAGVAAESAEERV...,1
1,mut_1,MAHHHHHHMSFFRMKRRLNFVVKRGIEELWENSFLDNNVDMKKIEY...,0
2,mut_10,MSLSHGKGTDMLPEIAAAVGFLSSLLRTRGCVSEQRLKVFSGALQE...,0
3,mut_100,MSSSTSSVESVEDESCSNECSASFTFDTNNNSRGNNQVNELAEETH...,0
4,mut_1000,SRILVSIGESFGTSEKFQKINQMVCNSDRVLKRSAEGSNPPKPLKK,1
...,...,...,...
62473,mut_9995,ASTTTPTLRLNWLQPPFNNQKVRQALLHAVSQRDYMDAQVGDPKAY...,0
62474,mut_9996,MAVPEGNSWTYTAASASITAPAQLVGNVGELQGAGSAVIWNVDVPV...,1
62475,mut_9997,MGSDKIHHHHHHMMAMITDPDDFFTKGCGRCARFDTPDCSTRPWID...,0
62476,mut_9998,AHQTKVDTLIVTNSKAWKPFSYVSQDGEPKGILIDFWREYAERNHV...,0


In [11]:
# Add the length of each sequence as a `protein_length` column (this modifies
# the `meta_data` frame loaded above in place) and summarise its distribution.
meta_data['protein_length'] = meta_data['sequence'].apply(len)
meta_data['protein_length'].describe()

count    62478.000000
mean       298.242437
std        158.115356
min         19.000000
25%        185.000000
50%        275.000000
75%        377.000000
max       1200.000000
Name: protein_length, dtype: float64

In [12]:
# Export every split of the solubility benchmark as a metadata CSV and a
# FASTA file. Unlike the other exports in this notebook, rows whose sequence
# is shorter than 40 characters are dropped before writing.
for split in ('test', 'train', 'valid'):
    lmdb_dir = f'data/benchmarks/solubility/solubility_{split}.lmdb'
    csv_path = f'data/benchmarks/solubility/solubility_{split}_metadata.csv'
    fasta_path = f'data/benchmarks/solubility/solubility_{split}_data.fasta'

    meta_data = load_data_from_lmdb(lmdb_dir)
    # The length column is needed by the filter and is also kept in the CSV.
    meta_data['protein_length'] = meta_data['sequence'].apply(len)
    meta_data = meta_data.query('protein_length >= 40')

    meta_data.to_csv(csv_path, index=False)
    save_fasta(meta_data, fasta_path)

## Stability

In [7]:
# Load the training split of the stability benchmark and display the
# resulting frame to inspect its columns.
lmdb_dir = f'data/benchmarks/stability/stability_train.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,id,sequence,protein_length,topology,parent,stability_score
0,mut_0,EEHEE_rd1_0001.pdb,GSQEVNSGTQTYKNASPEEAERIARKAGATTWTEKGNKWEIRI,43,EEHEE,EEHEE_rd1_0001,0.17
1,mut_1,EEHEE_rd1_0001.pdb_hp,GSTTIEEAQNKKYQAEPRSWTKAGRTIGGKNWETEVNRAEASI,43,EEHEE,EEHEE_rd1_0001,-0.18
2,mut_10,EEHEE_rd1_0004.pdb_hp,GSKEAYETITQRTARSEDNGEEWFRERAARQLETRGYTVTREG,43,EEHEE,EEHEE_rd1_0004,0.40
3,mut_100,EEHEE_rd1_0035.pdb_hp,GSVRGRKDATREEGSSKDEGTKGAKNWWAEEWKHEAYQVRVSR,43,EEHEE,EEHEE_rd1_0035,-0.13
4,mut_1000,EEHEE_rd1_0342.pdb,GSREVRSGPTTYTFDSRDEAREVASQLASGTVETDGDRIETRG,43,EEHEE,EEHEE_rd1_0342,-0.36
...,...,...,...,...,...,...,...
53609,mut_9995,HHH_rd1_0245.pdb_random,VATALNELLKADRSENPEGGKEARILIKKRFSDNLKPLEQDAK,43,HHH,HHH_rd1_0245,-0.42
53610,mut_9996,HHH_rd1_0246.pdb,PEDKLKESAKKAWRNGNKEKAERLLEKANASDDEKKKIKKEAG,43,HHH,HHH_rd1_0246,0.06
53611,mut_9997,HHH_rd1_0246.pdb_hp,GKKEAEKKADSALKEARSKEGKDWANKIELDKEKKKNLERNPA,43,HHH,HHH_rd1_0246,-0.69
53612,mut_9998,HHH_rd1_0246.pdb_random,NKNKGEKDEKKAKDKKKPEALIGRENSALEEKARSADKALKWE,43,HHH,HHH_rd1_0246,-0.49


In [9]:
# Summarise the `protein_length` column of the currently loaded `meta_data` frame.
meta_data['protein_length'].describe()

count    53614.000000
mean        45.241504
std          3.265944
min         43.000000
25%         43.000000
50%         43.000000
75%         50.000000
max         50.000000
Name: protein_length, dtype: float64

In [16]:
# Export every split of the stability benchmark: the LMDB records become
# a metadata CSV and a FASTA file written next to the source database.
for split in ('test', 'train', 'valid'):
    lmdb_dir = f'data/benchmarks/stability/stability_{split}.lmdb'
    csv_path = f'data/benchmarks/stability/stability_{split}_metadata.csv'
    fasta_path = f'data/benchmarks/stability/stability_{split}_data.fasta'

    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(csv_path, index=False)
    save_fasta(meta_data, fasta_path)