In [1]:
from sklearn.model_selection import StratifiedKFold
import itertools
import os,sys
sys.path.append(os.path.realpath('../lib'))
import numpy as np
import ast
from utils import split_train_test,unique_repr,create_log_dir
from sklearn.metrics import accuracy_score
from custom_svc import Graph_RJW_SVC_Classifier
import time 
import utils 
import argparse
import random
from data_loader import load_local_data
from IPython.core.debugger import Tracer

In [2]:
class AlphaMustBeDefinedError(Exception):
    """Signals that no usable alpha was given while automatic alpha cross-validation is off."""

def explode_tuned_parameters(tuned_params):
    """Expand a parameter grid into the list of every parameter combination.

    Only the first element of ``tuned_params`` is read. It must be a dict
    mapping a parameter name to an iterable of candidate values. The result
    is a list of dicts, one per element of the cartesian product of the
    candidate values, with the names taken in sorted order.
    """
    grid=tuned_params[0]
    names=sorted(grid)
    candidate_values=[grid[name] for name in names]
    expanded=[]
    for chosen in itertools.product(*candidate_values):
        expanded.append(dict(zip(names,chosen)))
    return expanded

In [3]:
def filter_dict(old_dict,your_keys):
    """Return a new dict with the entries of ``old_dict`` for each key in ``your_keys``.

    A key of ``your_keys`` that is missing from ``old_dict`` raises KeyError.
    """
    selected={}
    for wanted_key in your_keys:
        selected[wanted_key]=old_dict[wanted_key]
    return selected
    
def filter_all_params(allparams,filtre_key):
    """Project every parameter dict onto ``filtre_key`` and drop duplicates.

    The keys kept are those of ``filtre_key`` that also appear in the first
    dict of ``allparams``. Each dict of ``allparams`` is reduced to those
    keys with ``filter_dict``; reduced dicts are returned in order of first
    appearance, each one only once.
    """
    kept_keys=set(filtre_key).intersection(set(allparams[0].keys()))
    distinct_params=[]
    for full_param in allparams:
        reduced_param=filter_dict(full_param,kept_keys)
        if reduced_param in distinct_params:
            continue
        distinct_params.append(reduced_param)
    return distinct_params

In [4]:
def nested_rjw(X,y,tuned_parameters,dataset_name,dict_index,logging,
               path=None,n_inner=10,n_iter=10,verbose=1,optionnal=""):

    """ Run a nested cross-validation of Graph_RJW_SVC_Classifier and log the scores.

        Every distance matrix required by the parameter grid is first obtained
        on the whole dataset (computed, or read from a cache when ``path`` is
        given). All later fits and predictions only slice those matrices.

        Parameters
        ----------
        X : array of Graph objects
        y : array of classes of each graph
        tuned_parameters : a list of dict
                           Parameters to cross validate. It is expanded with
                           explode_tuned_parameters, which reads the first dict only.
        dataset_name : string
                       Name of the dataset. Used as key into dict_index and as base name
                       of the precalculated distances file.
        dict_index : dict
                     Maps a dataset name to the value forwarded as ``index`` to split_train_test.
        logging : a logging object
                  Used to write the log (only ``.info`` is called). Can be instantiated
                  via utils.setup_logger
        path : string
               Path to the precalculated RJW distances. If not specified all distances are
               recalculated and nothing is saved. If specified, the file
               dataset_name+optionnal+'.pkl' in that folder is read through utils.load_obj,
               missing matrices are computed and the dict is written back with utils.save_obj.
        n_inner : integer
                  The number of inner folds (StratifiedKFold) in the nested cross validation
        n_iter : integer
                 The number of outer repetitions. Each repetition calls split_train_test
                 with ratio=0.9 and seed equal to the repetition number.
        verbose : integer
                  >1 prints progress of the distance precomputation,
                  >2 additionally prints the parameters and timing of every inner fit.
        optionnal : string
                    An optional suffix appended to dataset_name to build the name of the
                    precalculated distances file.
        Returns
        -------
        None. Writes the results in the log (accuracy of each outer repetition,
        then mean and standard deviation over the repetitions, in percent).
    """

    logging.info('############ Begin nested CV ############')
    logging.info('Inner : '+str(n_inner))
    logging.info('Outer : '+str(n_iter))
    logging.info('params : '+str(tuned_parameters))
    
    X=np.array(X)
    y=np.array(y)
    
    outer_score=[]
    allparams=explode_tuned_parameters(tuned_parameters)
    
    # Several parameter combinations share the same distance matrix (they only
    # differ by SVM parameters), so keep one combination per distance setting.
    filtre=set(Graph_RJW_SVC_Classifier().get_distances_params().keys())
    all_params_filtered=filter_all_params(allparams,filtre)
    index=dict_index[dataset_name]
    
    logging.info('Begin precomputing all distances matrices')
    logging.info(str(len(all_params_filtered))+' matrices to fit...')
    # Get the distances or calculate them
    dict_of_all_distances={}
    n_done=0
    for params in all_params_filtered:
        clf=Graph_RJW_SVC_Classifier(**params)
        if path is None:
            if verbose>1:
                print('Path is None : we precalculate distances but we are not saving them')
            clf.compute_all_distance(np.array(X),np.array(X))
            # NOTE(review): this branch stores the raw matrix while the cached
            # branch below stores D/np.max(D) -- confirm whether the missing
            # normalisation is intended before relying on path=None results.
            dict_of_all_distances[unique_repr(clf.get_distances_params())]=clf.D
        else:
            if optionnal!="":
                name=dataset_name+optionnal
            else:
                name=dataset_name
            if name+'.pkl' in os.listdir(path):
                if verbose>1:
                    print('Load dict')
                d=utils.load_obj(name+'.pkl',path=path)
            else:
                if verbose>1:
                    print('Create empty dict')
                d={}
            dist_key=unique_repr(clf.get_distances_params())
            if dist_key in d:
                D=d[dist_key]
                dict_of_all_distances[dist_key]=D/np.max(D)
            else:
                if verbose >1:
                    print('Recalculate distance')
                clf.compute_all_distance(np.array(X),np.array(X))
                # The key is rebuilt after the computation, exactly as before.
                dist_key=unique_repr(clf.get_distances_params())
                dict_of_all_distances[dist_key]=clf.D/np.max(clf.D)
                # The cache keeps the un-normalised matrix.
                d[dist_key]=clf.D
                utils.save_obj(d,name,path=path)
            logging.info('One distance done')                     
        n_done+=1
        if n_done%10==0 and verbose>1:
            print('Done params : ',n_done)
    logging.info('...Done')
            
    for i in range(n_iter):
        k_fold=StratifiedKFold(n_splits=n_inner,random_state=i,shuffle=True)
        G_train,y_train,idx_train,G_test,y_test,idx_test=split_train_test(list(zip(X, list(y))),ratio=0.9,seed=i,
                                                                          index=index)    
        # One list of inner-fold accuracies per parameter combination, stored
        # by position in allparams. Scores are not keyed by repr(param): the
        # best combination would then have to be rebuilt with
        # ast.literal_eval, which fails for values whose repr is not a Python
        # literal (nan, inf, numpy scalars with NumPy >= 2, ...).
        inner_scores=[[] for _ in allparams]
            
        for idx_subtrain, idx_valid in k_fold.split(G_train,y_train):
            # k_fold indexes into the training subset; map back to positions
            # in the full dataset so the precomputed matrices can be sliced.
            true_idx_subtrain=[idx_train[j] for j in idx_subtrain]
            true_idx_valid=[idx_train[j] for j in idx_valid]

            x_subtrain = np.array([X[j] for j in true_idx_subtrain])
            y_subtrain = np.array([y[j] for j in true_idx_subtrain])
            x_valid=np.array([X[j] for j in true_idx_valid])
            y_valid=np.array([y[j] for j in true_idx_valid])
                      
            # For all parameters fit on subtrain and test on valid
            for param_pos,param in enumerate(allparams):
                # Initialise an SVM and fit.
                clf = Graph_RJW_SVC_Classifier()
                clf.set_params(**param)
                                
                # Fit on the train Kernel                                  
                if unique_repr(clf.get_distances_params()) in dict_of_all_distances:
                    
                    if verbose>2:
                        print('--------------------------------------------------------')
                        print('Params all : ', str(unique_repr(clf.get_params())))  
                        print('Distance pram : ', str(unique_repr(clf.get_distances_params())))
                    
                    D=dict_of_all_distances[unique_repr(clf.get_distances_params())]
                    st=time.time() 
                    clf.fit(x_subtrain,y_subtrain,matrix=D[np.ix_(true_idx_subtrain,true_idx_subtrain)])
                        
                    # Predict and test.
                    y_pred = clf.predict(x_valid,matrix=D[np.ix_(true_idx_valid,true_idx_subtrain)])
                    ed=time.time()
                    
                    # Calculate accuracy of classification.
                    ac_score=accuracy_score(y_valid.reshape(-1,1), y_pred.reshape(-1,1))
                    if verbose>2:
                        print('Done in : ',ed-st)
                        print('--------------------------------------------------------')
                    inner_scores[param_pos].append(ac_score)
              
        # Find best params in the inner loop: highest mean accuracy, the first
        # combination of allparams winning ties.
        mean_inner_scores=[np.mean(scores) for scores in inner_scores]
        best_pos=max(range(len(allparams)),key=mean_inner_scores.__getitem__)
        param_best=allparams[best_pos]
        logging.info('Best params : '+str(repr(param_best)))

        clf = Graph_RJW_SVC_Classifier()
        clf.set_params(**param_best)  
        
        D=dict_of_all_distances[unique_repr(clf.get_distances_params())]
                    
        # Refit on the whole training part, evaluate on the held-out part.
        clf.fit(G_train, y_train,matrix=D[np.ix_(idx_train,idx_train)])
        
        y_pred = clf.predict(G_test,matrix=D[np.ix_(idx_test,idx_train)])
        
        ac_score_outer=accuracy_score(y_test.reshape(-1,1), y_pred.reshape(-1,1))
        outer_score.append(ac_score_outer)

        logging.info('Accuracy '+str(ac_score_outer*100))
        logging.info('############ One outer Done ############')
              
    logging.info('Nested mean score '+str(np.mean(outer_score)*100))
    logging.info('Nested std score '+str(np.std(outer_score)*100))

In [5]:
if __name__ == '__main__':
    # Script entry point: declare the options, create the log directory and
    # logger, load the dataset and run the nested cross-validation.

    def _parse_index(text):
        # Convert the command-line string into the {dataset_name: index} dict
        # that nested_rjw looks up. (`type=list` would instead split the
        # string into single characters.)
        try:
            parsed=ast.literal_eval(text)
        except (ValueError, SyntaxError):
            raise argparse.ArgumentTypeError('index must be a dict literal, got '+repr(text))
        if not isinstance(parsed,dict):
            raise argparse.ArgumentTypeError('index must be a dict literal, got '+repr(text))
        return parsed

    parser = argparse.ArgumentParser(description='Nested CV for rjw')
    parser.add_argument('-dn','--dataset_name', type=str,help='the name of the dataset',
                        choices=['mutag','ptc','enzymes','protein','cox2','bzr','nci1',
                                 'nci109','dd','collab','bzr_md','cox2_md'],
                        default='mutag')
    parser.add_argument('-d','--data_path',type=str,help='the path to the data',
                        default='../data/')
    parser.add_argument('-ni','--n_inner', default=10, type=int,
                        help='the number of folds in the inner cv')
    parser.add_argument('-no','--n_outer', default=10, type=int,
                        help='the number of folds in the outer cv')
    parser.add_argument('-dist','--distances_path',default='../distances/',
                        help='the path to the precalculated distances for rjw')
    parser.add_argument('-o','--optionnal_name',default="rjw",
                        help='optionnal name to add for the log file')
    parser.add_argument('-r','--log_dir',default='../log/', type=str,
                        help='the path to the directory where to write to')
    
    #change wl parameter for each dataset [wl=2-discrete attributes, wl=0-continuous attributes]
    #default=2->MUTAG,PTC-MR,NCI1,NCI109,D&D AND COLLAB
    #default=0->ENZYMES,PROTEINS,COX2,BZR,BZR-MD AND COX2-MD 
    parser.add_argument('-wl','--wl_feature',type=int,
                        help='Use the Weisfeler Lehman features if wl>0 ',default=2) 
    
    parser.add_argument('-at','--attributes',
                        help='wether to use continuous attributes of the graph',type=utils.str2bool,default=True)
    parser.add_argument('-fea','--feature_metric',type=str,
                        choices=['euclidean','sqeuclidean','dirac','hamming_dist'],
                        default='hamming_dist',
                        help='the metric to use for the features')
    parser.add_argument('-st','--structure_metric',type=str,
                        choices=['random_walk','adjency'],help='the metric to use for the structures',
                        default='random_walk')
    parser.add_argument('-C','--Csvm',default=10, type=float,
                        help='C parameter in Linear SVM. If not specified cross validated')
    parser.add_argument('-g','--gamma',default=1, type=float,
                        help='Gamma parameter in Gaussian SVM. If not specified cross validated')
    parser.add_argument('-v','--verbose', default=1, type=int,help='verbose')
    parser.add_argument('-am','--amijo',help='whether to use amijo linesearch',
                        type=utils.str2bool,default=True)
    
    #change alpha for each dataset.
    #default=[1]->MUTAG, BZR AND COX2
    #default=[0.9]->ENZYMES AND PTC-MR
    #default=[0.6]->PROTEINS
    # nargs='+' keeps the parsed value a list, like the default. With a bare
    # type=float a user-supplied alpha became a scalar, which cannot be
    # expanded by itertools.product in explode_tuned_parameters.
    parser.add_argument('-a','--alpha',type=float,nargs='+',
                        help='Alphas to cross validate. Ignored if cva is true',default=[1])
    
    # Same list-valued handling as --alpha.
    parser.add_argument('-b','--beta',type=float,nargs='+',
                        help='Betas to cross validate. Ignored if cva is true',default=[1e-4])
    
    parser.add_argument('-i','--index',type=_parse_index,
                        help='indexing datasets.',default={'mutag':0,'ptc':1,'cox2':2,
                                                           'bzr':3,'enzymes':4,'protein':5,
                                                           'nci1':6,'nci109':7,'dd':8,'collab':9,
                                                           'bzr_md':10,'cox2_md':11})
    
    parser.add_argument('-cva','--automatic_cv_alpha',
                        help='wether to use a predifined CV grid for alpha.',
                        type=utils.str2bool,default=False)

    
    # An explicit empty argument list: only the defaults above are used, the
    # process command line is not read.
    args = parser.parse_args(args=[])
    data_path=args.data_path
    
    # -8000 is treated as the "alpha not set" sentinel.
    if -8000 in args.alpha and not args.automatic_cv_alpha:
        raise AlphaMustBeDefinedError('You must set alpha via -a')
    
    
    name='rjw'+'_'+args.dataset_name+'_feature_metric_'+args.feature_metric+'_structure_metric_'+args.structure_metric
    if args.wl_feature>0:
        name=name+'_wl_'+str(args.wl_feature)
    name=name+args.optionnal_name
    
    # Create the log directory if needed; an OSError still propagates.
    os.makedirs(args.log_dir,exist_ok=True)

    log_dir=create_log_dir(args)
  
    # C and gamma are single values wrapped in lists; alpha and beta are
    # already lists.
    Clist=[args.Csvm]
    alpha_list=args.alpha
    beta_list=args.beta
    gamma_list=[args.gamma]
    
    logger = utils.setup_logger('outer_logger', log_dir+'/'+name+'_outer.log')
    logger.info('Let the Outer CV Begin for '+str(name))
    logger.info('n_outer : '+str(args.n_outer))
    logger.info('n_inner : '+str(args.n_inner)) 
      
    X,y=load_local_data(data_path,args.dataset_name,attributes=args.attributes,wl=args.wl_feature)

    tuned_parameters = [{'alpha':alpha_list,'C':Clist,'gamma':gamma_list,'beta':beta_list,
                         'features_metric':[args.feature_metric],
                         'method':[args.structure_metric],'wl':[args.wl_feature],'amijo':[args.amijo]}]
    
    nested_rjw(X,y
        ,tuned_parameters
        ,args.dataset_name
        ,args.index
        ,logger
        ,args.distances_path
        ,n_inner=args.n_inner
        ,n_iter=args.n_outer
        ,verbose=args.verbose
        ,optionnal=str(args.optionnal_name))

../log//protein_2021_10_07_19_33_10
