## Download datasets from TorchDrug

In [1]:
import os
import requests
import tarfile
import pandas as pd
import lmdb 
import pickle

[TorchProtein benchmark results](https://torchprotein.ai/benchmark)

[TorchDrug dataset API documentation](https://torchdrug.ai/docs/api/datasets.html#betalactamase)

In [12]:
def download_extract_file(url, output_dir='./'):
    """Download a ``.tar.gz`` archive and unpack it into ``output_dir``.

    The archive is saved under the last path component of ``url``, extracted
    in place, and then deleted again (also when the download or the
    extraction fails, so no partial archive is left behind).

    Parameters
    ----------
    url : str
        Location of the gzip-compressed tar archive.
    output_dir : str, default './'
        Directory that receives the extracted content. It is created when
        missing; a trailing slash is optional.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    tarfile.TarError
        If the downloaded file is not a readable ``.tar.gz`` archive.
    """
    # An empty string used to mean "current directory" with the old
    # string-concatenated paths; keep that meaning.
    target_dir = output_dir or '.'
    os.makedirs(target_dir, exist_ok=True)

    file_name = url.split("/")[-1]
    archive_path = os.path.join(target_dir, file_name)

    try:
        with requests.get(url, stream=True) as response:
            # Fail loudly on HTTP errors instead of continuing without a file.
            response.raise_for_status()
            with open(archive_path, "wb") as file:
                # Copy the raw body in 1 MiB chunks so the archive never has
                # to fit into memory as a whole.
                for chunk in iter(lambda: response.raw.read(1024 * 1024), b""):
                    file.write(chunk)

        # Extract the tar.gz file
        with tarfile.open(archive_path, "r:gz") as tar:
            # The archive comes from the network: when this Python supports
            # extraction filters, refuse members that would escape target_dir.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=target_dir, filter="data")
            else:
                tar.extractall(path=target_dir)
    finally:
        # Clean up the tar.gz file (complete or partial).
        if os.path.exists(archive_path):
            os.remove(archive_path)

In [6]:
# Archive URLs of the benchmark datasets processed in the sections below.
# Each one points to a .tar.gz file that the download loop fetches and unpacks.
url1 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/beta_lactamase.tar.gz"

url2 = "http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/stability.tar.gz"

url3 = "http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/fluorescence.tar.gz"

url4 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/solubility.tar.gz"

url5 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization_2.tar.gz"

url6 = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization.tar.gz"


In [17]:
# Fetch and unpack every benchmark archive into data/benchmarks/.
# The URL variables are listed explicitly rather than being looked up by
# name through eval(), which would execute whatever string it is handed.
for url in (url1, url2, url3, url4, url5, url6):
    print('Downloading: ', url)
    download_extract_file(url, output_dir='data/benchmarks/')

Downloading:  http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/stability.tar.gz
Downloading:  http://s3.amazonaws.com/songlabdata/proteindata/data_pytorch/fluorescence.tar.gz
Downloading:  https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/solubility.tar.gz
Downloading:  https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization_2.tar.gz
Downloading:  https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization.tar.gz


In [20]:
def load_data_from_lmdb(data_path):
    """Read every pickled dict record of an LMDB database into a DataFrame.

    Each LMDB entry whose unpickled value is a ``dict`` becomes one row;
    entries holding anything else are skipped. ``bytes`` values inside a
    record are decoded as UTF-8. Rows follow the cursor's iteration order
    (the previews in this notebook show IDs ordered as strings: 0, 1, 10,
    100, ...).

    Parameters
    ----------
    data_path : str
        Path of the LMDB database to open read-only.

    Returns
    -------
    pandas.DataFrame
        One row per record. The decoded LMDB key is stored in column ``ID``
        and a ``primary`` field, when present, is renamed to ``sequence``.
    """
    frames = []
    env = lmdb.open(data_path, readonly=True)
    try:
        with env.begin(write=False) as txn:
            # The cursor already yields the stored value, so no second
            # lookup per key is needed.
            for key, value in txn.cursor():
                # NOTE(security): pickle.loads can execute arbitrary code;
                # only point this at databases from a trusted source.
                data = pickle.loads(value)
                if not isinstance(data, dict):
                    continue

                # Decode the binary data if necessary
                record = {
                    k: v.decode('utf-8') if isinstance(v, bytes) else v
                    for k, v in data.items()
                }
                # One single-row frame per record, exactly as before, so
                # scalar and length-1 sequence values end up as one cell.
                frames.append(pd.DataFrame(record, index=[key.decode('utf-8')]))
    finally:
        # Release the environment (file handles / memory map) explicitly.
        env.close()

    # Concatenate once at the end instead of re-copying the frame per record.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    return df.reset_index().rename(columns={'index': 'ID', 'primary': 'sequence'})

In [3]:
def save_fasta(df, output_file):
    """Write the rows of ``df`` to ``output_file`` in FASTA format.

    Every row produces a header line ``>mut_<ID>`` followed by one line
    holding the value of its ``sequence`` column. An existing file is
    overwritten.
    """
    def _fasta_lines():
        # Header first, then the sequence, one pair per DataFrame row.
        for _, record in df.iterrows():
            yield f'>mut_{record["ID"]}\n'
            yield f'{record["sequence"]}\n'

    with open(output_file, 'w') as handle:
        handle.writelines(_fasta_lines())

## Beta lactamase benchmark

In [25]:
# Load the validation split and display it as a quick sanity check.
lmdb_dir = 'data/benchmarks/beta_lactamase/beta_lactamase_valid.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,scaled_effect1
0,0,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.884452
1,1,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,1.020624
2,10,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.740940
3,100,MSIQHFRVALIPFFAAFCLPVFAHPETLNKVKDAEDQLGARVGYIE...,1.055183
4,101,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.999580
...,...,...,...
515,95,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,-0.008031
516,96,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,0.927595
517,97,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,1.033065
518,98,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,1.003204


In [None]:
# Convert every split to a metadata CSV plus a FASTA file next to the LMDB.
for split in ('test', 'train', 'valid'):
    stem = f'data/benchmarks/beta_lactamase/beta_lactamase_{split}'
    lmdb_dir = f'{stem}.lmdb'
    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(f'{stem}_metadata.csv', index=False)
    save_fasta(meta_data, f'{stem}_data.fasta')

## Fluorescence

In [11]:
# Load the validation split and display it as a quick sanity check.
lmdb_dir = 'data/benchmarks/fluorescence/fluorescence_valid.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,protein_length,log_fluorescence,num_mutations
0,0,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.552324,3
1,1,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.689570,3
2,10,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,1.299777,3
3,100,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,1.301031,2
4,1000,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.569029,3
...,...,...,...,...,...
5357,995,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.569963,3
5358,996,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.921040,2
5359,997,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFI...,237,3.691537,2
5360,998,SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLI...,237,3.689303,2


In [12]:
# Convert every split to a metadata CSV plus a FASTA file next to the LMDB.
for split in ('test', 'train', 'valid'):
    stem = f'data/benchmarks/fluorescence/fluorescence_{split}'
    lmdb_dir = f'{stem}.lmdb'
    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(f'{stem}_metadata.csv', index=False)
    save_fasta(meta_data, f'{stem}_data.fasta')

## Subcellular localization (binary)

In [24]:
# Load the test split and display it as a quick sanity check.
lmdb_dir = 'data/benchmarks/subcellular_localization_2/subcellular_localization_2_test.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,localization
0,0,MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...,0
1,1,AGFPEQEPEPKFWNDWAQKTLDKALSLQTLNKNKAQNLILFLGDGM...,0
2,10,MEDEAVLDRGASFLKHVCDEEEVEGHHTIYIGVHVPKSYRRRRRHK...,0
3,100,MEHHNSHLLPGGSEKMYYIAHQQPMLRNEDDNYQEGYFIRPDPASL...,0
4,1000,MARISCDLRFLLIPAAFMFIYIQMRLFQTQSQYADRLSSAIESENH...,0
...,...,...,...
1744,995,MDKTKMFSAINLGVGGIFVLSGFIKLFSFSFVNALLALFIIVFGLG...,0
1745,996,MFAPRLLDFQKTKYARFMNHRVPAHKRYQPTEYEHAANCATHAFWI...,0
1746,997,MHPALLCGPILAIFLQFLVSSCSPLENDDLFLVQVEPEVDPVVAAE...,0
1747,998,MSYGREDTTIEPDFIEPDAPLAASGGVADNIGGTMQNSGSRGTLDE...,0


In [9]:
# Convert every split to a metadata CSV plus a FASTA file next to the LMDB.
for split in ('test', 'train', 'valid'):
    stem = f'data/benchmarks/subcellular_localization_2/subcellular_localization_2_{split}'
    lmdb_dir = f'{stem}.lmdb'
    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(f'{stem}_metadata.csv', index=False)
    save_fasta(meta_data, f'{stem}_data.fasta')

## Subcellular localizations

In [23]:
# Load the test split and display it as a quick sanity check.
lmdb_dir = 'data/benchmarks/subcellular_localization/subcellular_localization_test.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,localization
0,0,MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...,0
1,1,AGFPEQEPEPKFWNDWAQKTLDKALSLQTLNKNKAQNLILFLGDGM...,0
2,10,MEDEAVLDRGASFLKHVCDEEEVEGHHTIYIGVHVPKSYRRRRRHK...,0
3,100,MEHHNSHLLPGGSEKMYYIAHQQPMLRNEDDNYQEGYFIRPDPASL...,0
4,1000,MATAATIPSVATATAAALGEVEDEGLLASLFRDRFPEAQWRERPDV...,3
...,...,...,...
2768,995,MGLLRIMMPPKLQLLAVVAFAVAMLFLENQIQKLEESRAKLERAIA...,3
2769,996,MEYLTNLKTNIMDKQLGHREVSEGSTQPKPDPSGATMKACVWDGPL...,3
2770,997,MAPIFRNYRFAIGAFAVIMLILLIKTSSIGPPSIARTVTPNASIPK...,3
2771,998,MVWLVAMTPRQSSLCGLAAHGLWFLGLVLLMDATARPANHSSTRER...,3


In [7]:
# Convert every split to a metadata CSV plus a FASTA file next to the LMDB.
for split in ('test', 'train', 'valid'):
    stem = f'data/benchmarks/subcellular_localization/subcellular_localization_{split}'
    lmdb_dir = f'{stem}.lmdb'
    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(f'{stem}_metadata.csv', index=False)
    save_fasta(meta_data, f'{stem}_data.fasta')

## Solubility

In [22]:
# Load the validation split and display it as a quick sanity check.
lmdb_dir = 'data/benchmarks/solubility/solubility_valid.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,sequence,solubility
0,0,SRLYRHNLMEDVFNMENESFMQETRLMENEYSVNLPTRFYYKKRWN...,0
1,1,ATTYNAVVSKSSSDGKTFKTIADAIASAPAGSTPFVILIKNGVYNE...,1
2,10,MGHHHHHHSHMYAAGLGMLEPTKEPLKPLSAAEKIASIGQTATMSP...,0
3,100,MAHHHHHHMTKSYLNQDQQLRAQQVYTTHKHLKELSMATKEALTYH...,1
4,1000,LNKELQHERLAYITDSELIIFADFLPTEFVDEYLIDQSEESVAHIQ...,0
...,...,...,...
6937,995,EGLDYLTAPNPPSIREELCTASHDTITVHWISDDEFSISSYELQYT...,1
6938,996,MNADKFVETTISDFMRLNSSRDFEYFVSIAKQLYSRGCRDYYTEAF...,0
6939,997,SQEKVSIEQQLAVESIRKFLNSKTSYDVLPVSYRLIVLDTSLLVKK...,1
6940,998,MAHHHHHHMALISQSFAERYAELVARVPTHTELLLDVQRREVGERK...,1


In [14]:
# Convert every split to a metadata CSV plus a FASTA file next to the LMDB.
for split in ('test', 'train', 'valid'):
    stem = f'data/benchmarks/solubility/solubility_{split}'
    lmdb_dir = f'{stem}.lmdb'
    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(f'{stem}_metadata.csv', index=False)
    save_fasta(meta_data, f'{stem}_data.fasta')

## Stability

In [26]:
# Load the validation split and display it as a quick sanity check.
lmdb_dir = 'data/benchmarks/stability/stability_valid.lmdb'
meta_data = load_data_from_lmdb(lmdb_dir)
meta_data

Unnamed: 0,ID,id,sequence,protein_length,topology,parent,stability_score
0,0,EEHEE_rd4_0008.pdb,ITVDVNGVTYHFDNPEEAYKFVVRVARELNLTYEWHGNKVRVELES...,50,EEHEE,EEHEE_rd4_0008,1.45
1,1,EEHEE_rd4_0008.pdb_PG_hp,VEWEFNGVEVNVKHPETAFEYALDVVNKVRLHVRITGRTYNYDLES...,50,EEHEE,EEHEE_rd4_0008,-0.02
2,10,EEHEE_rd4_0038.pdb_PG_hp,HSYQLNGAQVDFNHPTKLIEVAFEAIDTMKFEVKVDGHRVKLTLES...,50,EEHEE,EEHEE_rd4_0038,-0.01
3,100,EEHEE_rd4_0153.pdb,FTVHMGNVTYHFSSPEEALRFALRMAKELGLQVEVHGETMKVKLES...,50,EEHEE,EEHEE_rd4_0153,1.73
4,1000,EHEE_rd4_0659.pdb,GSSAITIELEGEEQAKEVKKEAEKRNLEAEIQVHNGKWRVTLRLES...,50,EHEE,EHEE_rd4_0659,0.78
...,...,...,...,...,...,...,...
2507,995,EHEE_rd4_0645.pdb_PG_hp,GSSKAKVKIKGQEHVELFMEDVERDNIKAHADARGNEYVVRLDLES...,50,EHEE,EHEE_rd4_0645,0.55
2508,996,EHEE_rd4_0645.pdb_buryD,GSSRFRYEVHGKEQADAIAKEMKDRNLNVEVKVHGDKIVDDAELES...,50,EHEE,EHEE_rd4_0645,-0.20
2509,997,EHEE_rd4_0656.pdb,GSSSIHIETHGPEAADKIRKMMAKRNLEVHVHVHGNKVVIQIDLES...,50,EHEE,EHEE_rd4_0656,0.03
2510,998,EHEE_rd4_0656.pdb_PG_hp,GSSHVEINRHGPHAVQKMDKIAVNKSLHVDIHMEGERAVIKITLES...,50,EHEE,EHEE_rd4_0656,-0.80


In [27]:
# Convert every split to a metadata CSV plus a FASTA file next to the LMDB.
for split in ('test', 'train', 'valid'):
    stem = f'data/benchmarks/stability/stability_{split}'
    lmdb_dir = f'{stem}.lmdb'
    meta_data = load_data_from_lmdb(lmdb_dir)
    meta_data.to_csv(f'{stem}_metadata.csv', index=False)
    save_fasta(meta_data, f'{stem}_data.fasta')