In [1]:
from ivbase.utils.datasets.datacache import DataCache
import os
import tarfile
import pickle
import pandas as pd
import re
import numpy as np

### Retrieve paths to aws outputs and store locally

In [3]:
diff_cache1 = DataCache(cache_root='../diff')
diff_dir1 = diff_cache1.get_dir("s3://experiments-output/diff")

topk_cache1 = DataCache(cache_root='../topk')
topk_dir1 = topk_cache1.get_dir("s3://experiments-output/topk")

gcn_cache1 = DataCache(cache_root='../gcn')
gcn_dir1 = gcn_cache1.get_dir("s3://experiments-output/gcn")

download: s3://experiments-output/diff/7d087aba1c754cdeddc41cf18863fc0ee80971dd.zip to ../diff/experiments-output/diff/7d087aba1c754cdeddc41cf18863fc0ee80971dd.zip
download: s3://experiments-output/diff/b3a766327dcb1d334a5f9bc8862520f8d030983a.zip to ../diff/experiments-output/diff/b3a766327dcb1d334a5f9bc8862520f8d030983a.zip
download: s3://experiments-output/diff/384def1d1eb3cc61e1e699489028963e31d2735c.zip to ../diff/experiments-output/diff/384def1d1eb3cc61e1e699489028963e31d2735c.zip
download: s3://experiments-output/diff/bd61d6666b0f27c58a66dc447ef9789d5871bc49.zip to ../diff/experiments-output/diff/bd61d6666b0f27c58a66dc447ef9789d5871bc49.zip
download: s3://experiments-output/diff/1d11940ca20f0827ecdccc07c12a9f837188d9ce.zip to ../diff/experiments-output/diff/1d11940ca20f0827ecdccc07c12a9f837188d9ce.zip
download: s3://experiments-output/diff/2fce2ab832a2a97718368dcaa23bb0cfdb393d46.zip to ../diff/experiments-output/diff/2fce2ab832a2a97718368dcaa23bb0cfdb393d46.zip
download: s3://e

In [4]:
print(len(os.listdir(diff_dir1)))
print(len(os.listdir(topk_dir1)))

13
9


### Exctract files from unprocessed directories

In [21]:
import os
import glob
def extract_result_files(results_dir):
    for d in glob.glob(os.path.join(results_dir, "*zip")) :
        path = d.replace(".zip", "")
        if not os.path.isdir(path):
            cmd = "unzip {} -d {}/".format(d, path)
            os.system(cmd)


In [23]:
extract_result_files(topk_dir1)
extract_result_files(gcn_dir1)
extract_result_files(diff_dir1)
#extract_result_files(results_dir2)

### Collect results

In [94]:
DATASET_MAPPING = {
    'data/dataset_fragments.txt': 'tox_fragments',
    'data/dataset_functional.txt' : 'tox_functional',
    'data/dataset_alert.txt' : 'tox_alerts',
    "data/standard_data/tox21/tox21.csv" : 'tox21',
    'data/chembl_dataset_fragments.txt': 'chembl_fragments',
    'data/chembl_dataset_functional.txt' : 'chembl_functional',
    'data/chembl_dataset_alert.txt' : 'chembl_alerts'  
}

ALGO_MAPPING = {
    'laplacian' : 'Laplacian',
    'diff' : 'Diffpool',
    'topk' : 'Topk',
    "gcn": 'GIN'
}

def get_params(config, algo):
    if algo == 'Laplacian':
        reg = config['pool_arch']['reg_mode']
        hop = config['pool_arch']['hop']
        lap_hop = config['pool_arch']['lap_hop']
        return str(reg)+str(hop)+str(lap_hop)
        
    elif algo == 'Diffpool' or algo == 'Topk':
        return config['pool_arch']['hidden_dim']
    else:
        return ""

def get_results_key(config):
    dataset = config['data']
    try:
        algo = ALGO_MAPPING[config['pool_arch']['arch']]
    except KeyError as e:
        algo = 'gcn'
    params = get_params(config, algo)
    return (dataset,algo,params)


def get_results_dict(results_dir, return_paths = True):
    results_dict = {}
    path_dict = {}
    for d in os.listdir(results_dir):
        path = os.path.join(results_dir, d)
        if os.path.isdir(path):
            config_file = glob.glob(os.path.join(path, "config*pkl"))[0]
            with open(config_file, 'rb') as f:
                r = pickle.load(f)
                key = get_results_key(r) 
            
            data_file = glob.glob(os.path.join(path, "data*pkl"))[0]
            with open(data_file, 'rb') as f:
                r = pickle.load(f)
            if key not in results_dict:
                results_dict[key] = r
            else:
                print(results_dict[key])
            if return_paths:
                path_dict[key] = os.path.join(results_dir,d)
    if return_paths:
        return results_dict, path_dict
    return results_dict

### Format results into DataFrame

In [95]:
def populate_df(df, results_dict):
    locations = [(ix,val) for ix in df.index for val in df.columns]
    for (ix,val) in locations:
        df.loc[ix, val] = results_dict[ix][0][val][0]
    return df

In [96]:
def get_results_df(results_dict):
    index = pd.MultiIndex.from_tuples(results_dict.keys())
    results = pd.DataFrame(index = index, columns = ['acc', 'f1_micro', 'f1_macro', 'roc'], dtype=np.float64)
    print(results_dict)
    results = populate_df(results,results_dict)
    # Get combined score as geometric mean over column metrics of primary interest (not accuracy)
    results['overall'] = results.apply(lambda x : x[1:].prod()**(1/3), axis = 1)
    return results

In [97]:
topk_dict1, topk_path = get_results_dict(topk_dir1)
gcn_dict1, gcn_path = get_results_dict(gcn_dir1)
diff_dict1, diff_path = get_results_dict(diff_dir1)

#results_dict2, path_dict2 = get_results_dict(results_dir2)
#laplacian_results = get_results_df(results_dict1)
topk_results = get_results_df(topk_dict1)
diff_results = get_results_df(diff_dict1)
gcn_results = get_results_df(gcn_dict1)

{'pool_arch': {'arch': 'topk', 'hidden_dim': 10}, 'pool_loss': False, 'data': 'DD'}
{'pool_arch': {'arch': 'topk', 'hidden_dim': 10}, 'pool_loss': False, 'data': 'ENZYMES'}
{'pool_arch': {'arch': 'topk', 'hidden_dim': 10}, 'pool_loss': False, 'data': 'ENZYMES'}
{'pool_arch': {'arch': 'topk', 'hidden_dim': 10}, 'pool_loss': False, 'data': 'FRANKENSTEIN'}
{'pool_arch': {'arch': 'topk', 'hidden_dim': 10}, 'pool_loss': False, 'data': 'PROTEINS'}
[{'name': 'gcn', 'repeats': 5, 'acc': [0.7175141242937854, 0.7796610169491526, 0.7401129943502824, 0.7457627118644068, 0.7796610169491526], 'roc': [0.8924897119341564, 0.8914385399892646, 0.828686816050026, 0.794640522875817, 0.8696929238985314], 'f1_micro': [0.7175141242937854, 0.7796610169491526, 0.7401129943502824, 0.7457627118644068, 0.7796610169491526], 'f1_macro': [0.679440741813967, 0.7531557303772574, 0.7318888303477344, 0.7410008779631255, 0.7637010919795981], 'f1_weighted': [0.6888030489811353, 0.7709782506583592, 0.73852122067237, 0.7463

KeyError: 0

In [190]:
laplacian_results

Unnamed: 0,Unnamed: 1,Unnamed: 2,acc,f1_micro,f1_macro,roc,overall
tox_fragments,Laplacian,0-11,0.976561,0.845765,0.798066,0.939716,0.859202
tox_fragments,Laplacian,1-11,0.977629,0.852674,0.807830,0.946595,0.867140
tox_fragments,Laplacian,2-11,0.976883,0.848305,0.805258,0.943665,0.863843
tox_fragments,Laplacian,0-12,0.975984,0.842125,0.811699,0.940644,0.863109
tox_fragments,Laplacian,1-12,0.976583,0.845456,0.794890,0.942977,0.858947
tox_fragments,Laplacian,2-12,0.977102,0.850060,0.807027,0.947852,0.866349
tox_fragments,Laplacian,0-13,0.974742,0.832947,0.797435,0.937340,0.853893
tox_fragments,Laplacian,1-13,0.977658,0.854004,0.806965,0.942983,0.866176
tox_fragments,Laplacian,2-13,0.973631,0.823863,0.786002,0.933836,0.845635
tox_fragments,Laplacian,001,0.974435,0.829615,0.783307,0.941144,0.848832


In [191]:
baselines_results

Unnamed: 0,Unnamed: 1,Unnamed: 2,acc,f1_micro,f1_macro,roc,overall
tox_alerts,gcn,,0.998284,0.683438,0.788995,0.726423,0.731680
tox_alerts,Diffpool,3,0.998000,0.634656,0.777689,0.747806,0.717317
tox_alerts,Diffpool,5,0.998188,0.650334,0.776179,0.746057,0.722144
tox_alerts,Diffpool,7,0.998548,0.727273,0.804851,0.745758,0.758584
tox_alerts,Diffpool,9,0.998188,0.621984,0.776421,0.719514,0.703027
tox_alerts,Diffpool,4,0.997901,0.577367,0.761373,0.737356,0.686924
tox_alerts,Topk,3,0.996869,0.278947,0.715684,0.675943,0.512922
tox_alerts,Topk,4,0.997155,0.360825,0.719848,0.634107,0.548151
tox_alerts,Topk,5,0.997518,0.432718,0.718069,0.656927,0.588793
tox_alerts,Topk,7,0.997738,0.476190,0.711145,0.678529,0.612495


In [192]:
final_results = pd.concat([laplacian_results, baselines_results])
final_results

Unnamed: 0,Unnamed: 1,Unnamed: 2,acc,f1_micro,f1_macro,roc,overall
tox_fragments,Laplacian,0-11,0.976561,0.845765,0.798066,0.939716,0.859202
tox_fragments,Laplacian,1-11,0.977629,0.852674,0.807830,0.946595,0.867140
tox_fragments,Laplacian,2-11,0.976883,0.848305,0.805258,0.943665,0.863843
tox_fragments,Laplacian,0-12,0.975984,0.842125,0.811699,0.940644,0.863109
tox_fragments,Laplacian,1-12,0.976583,0.845456,0.794890,0.942977,0.858947
tox_fragments,Laplacian,2-12,0.977102,0.850060,0.807027,0.947852,0.866349
tox_fragments,Laplacian,0-13,0.974742,0.832947,0.797435,0.937340,0.853893
tox_fragments,Laplacian,1-13,0.977658,0.854004,0.806965,0.942983,0.866176
tox_fragments,Laplacian,2-13,0.973631,0.823863,0.786002,0.933836,0.845635
tox_fragments,Laplacian,001,0.974435,0.829615,0.783307,0.941144,0.848832


In [97]:
final_results.loc['chembl_functional']

Unnamed: 0,Unnamed: 1,acc,f1_micro,f1_macro,roc,overall
Laplacian,0-11,0.99848,0.986618,0.957586,0.866896,0.935617
Laplacian,1-11,0.99868,0.988373,0.951233,0.860071,0.931638
Laplacian,2-11,0.998702,0.988577,0.960127,0.867123,0.937145
Laplacian,0-12,0.998348,0.98547,0.9461,0.861206,0.929457
Laplacian,1-12,0.998657,0.988185,0.94937,0.868756,0.934094
Laplacian,2-12,0.998621,0.987869,0.947387,0.861647,0.930791
Laplacian,0-13,0.998517,0.986954,0.949014,0.866561,0.932802
Laplacian,1-13,0.998525,0.987015,0.955161,0.868681,0.935593
Laplacian,2-13,0.998385,0.985804,0.939353,0.868181,0.929844
Laplacian,001,0.998672,0.988312,0.968476,0.867008,0.939727


### Obtain best results for each algo for each dataset

In [239]:
indices = []
for d in DATASET_MAPPING.values():
    for a in ALGO_MAPPING.values():
        ix = final_results.loc[(d,a),'f1_macro'].idxmax(axis=0)
        indices.append((d, a, ix))

final_best_results = final_results.loc[indices]

Hyper-parameters go as regularization mode, hop and lap_hop for the Laplacian algorithm and represents the hidden dimension for the baseline algorithms.

In [246]:
final_best_results

Unnamed: 0,Unnamed: 1,Unnamed: 2,acc,f1_micro,f1_macro,roc,overall
tox_fragments,Laplacian,0-12,0.975984,0.842125,0.811699,0.940644,0.863109
tox_fragments,Diffpool,3,0.967996,0.786421,0.7736,0.935375,0.828678
tox_fragments,Topk,9,0.957712,0.723036,0.721351,0.882684,0.772154
tox_fragments,gcn,,0.975015,0.83487,0.795873,0.945414,0.856436
tox_functional,Laplacian,233,0.993956,0.946206,0.890445,0.907122,0.914295
tox_functional,Diffpool,3,0.987155,0.88471,0.851458,0.889732,0.875134
tox_functional,Topk,9,0.972166,0.730526,0.727361,0.783538,0.746704
tox_functional,gcn,,0.992869,0.935833,0.868604,0.904935,0.902705
tox_alerts,Laplacian,2-12,0.998617,0.754564,0.814177,0.739935,0.768899
tox_alerts,Diffpool,7,0.998548,0.727273,0.804851,0.745758,0.758584


In [129]:
log_path = path_dict1[('chembl_functional','Laplacian', '003')]

In [130]:
log_path = os.path.join(log_path, 'output', '.logs')
log_path

'../final_caches_laplacian/invivoai-sagemaker-artifacts/molg/sup/laplacian/laplacian-molg-2019-05-17-08-04-23-407/output/.logs'

In [189]:
regs = ['0','1','2']
hops = ['0','-1','3']
lap_hops = ['1','2','3']
params =[r+h+l for r in regs for h in hops for l in lap_hops]
datasets = DATASET_MAPPING.values()
algos = ['Laplacian']
total_keys = [(d,a,p) for d in datasets for a in algos for p in params]