In [1]:
%load_ext autoreload
%autoreload 2

import traceback

from utils.prediction_models import *

In [2]:
# Root directory; every dataset lives at <DIR_data>/<name>/<name>.pk.
DIR_data="/home/zeqianli/project/zeqian/Carbon/data"

# Load the pickled datasets in a fixed order and key them by name.
datasets={
    name: pd.read_pickle(f"{DIR_data}/{name}/{name}.pk")
    for name in ("zeqian","matti","bacdive")
}

# Keep the individual module-level handles that later cells refer to.
zeqian,matti,bacdive=datasets["zeqian"],datasets["matti"],datasets["bacdive"]

# bacdive_sim=pd.read_pickle("/home/zeqianli/project/zeqianli/Kuehn/Carbon/data/20221031_bacdive_simulated_data.pk")

## Figure 3A: models

Left panel: Null, FBA, NN-genomes, NN-RF, NN-OOC, feature selection

Right panel: FI

In [3]:
# Worker count: used to size the shared pool below and also passed as the
# 'threads' setting in the model configurations defined later in the notebook.
threads=48

# Shared worker pool, handed to PredictionPipeline (as `p`) and to the
# GreedyFeatureSelection model parameters further down.
# NOTE(review): `Pool` comes from the star import of utils.prediction_models;
# presumably multiprocessing.Pool -- confirm. The pool is not closed anywhere
# in the visible cells.
p=Pool(threads)

In [4]:
# Output root for this figure's cached and final results.
DIR_figure=f"{DIR_data}/figure_data/3A"


def _splitter_params(split_method):
    """Return a fresh splitter-parameter dict for the given split method.

    A new dict is built on every call so that no two model entries share the
    same mutable object.
    """
    if split_method=='random':
        return {'test_set_ratio':0.2}
    if split_method=='ooc':
        return {'test_set_range':(0.1,0.3), 'min_zeros':0, 'min_ones':0, 'time_out_iter':None}
    raise ValueError(f"Unknown split method: {split_method}")


def _pipeline_params(model_params, split_method, n_splits=100, n_threads=None):
    """Assemble the keyword arguments forwarded to PredictionPipeline.

    Parameters
    ----------
    model_params : dict
        Hyper-parameters stored under the 'model_params' key.
    split_method : str
        'random' or 'ooc'; also selects the matching splitter parameters.
    n_splits : int
        Value stored under 'n_splits'.
    n_threads : int or None
        Value stored under 'threads'; None means the notebook-wide `threads`.
    """
    return {
        'model_params': model_params,
        'split_method': split_method,
        'splitter_params': _splitter_params(split_method),
        'n_splits': n_splits,
        'threads': threads if n_threads is None else n_threads,
        'save_models': False,
    }


def _build_models():
    """Build the name -> [model class, pipeline params] registry, in run order."""
    registry={}

    # Each baseline model is evaluated twice: with the 'random' split and
    # with the 'ooc' split (registered under the '<name>_ooc' key).
    baselines=(
        ('null', BernoulliNull, {}),
        ('NN', NearestNeighbor, {'n_neighbors':1,'metric':'hamming'}),
        ('RF', RF, {'n_estimators':100,'max_depth':None, 'max_features': 'sqrt'}),
    )
    for label, model_class, hyperparams in baselines:
        # dict(...) copies the hyper-parameters so both entries own their own dict.
        registry[label]=[model_class, _pipeline_params(dict(hyperparams), 'random')]
        registry[f'{label}_ooc']=[model_class, _pipeline_params(dict(hyperparams), 'ooc')]

    # Greedy feature selection carries its own inner split configuration and
    # receives the shared pool `p`; the outer pipeline is given a single thread.
    greedy_params={
        "Model":DecisionTree,
        'tree':zeqian['tree'],
        "model_params":{},
        "n_max_features":3,
        "n_feature_subsample":None,
        "n_meta_split":10,
        "split_method":'ooc',
        "splitter_params":_splitter_params('ooc'),
        "threads":threads,
        "save_meta_models":False,
        'verbose':False,
        'p':p,
    }
    registry['GreedyFeatureSelection']=[
        GreedyFeatureSelection,
        _pipeline_params(greedy_params, 'ooc', n_splits=10, n_threads=1),
    ]
    return registry


models=_build_models()

# Datasets to evaluate, keyed by the prefix used in the output file names.
matrices={'zeqian_genome': zeqian}


In [5]:
def _save_results_atomically(obj, path):
    """Pickle `obj` to `path` without ever exposing a partially written file.

    The data is written to a sibling temporary file and then moved into place
    with os.replace, so `path` only exists once the dump has completed. This
    matters because the run loop treats the existence of `path` as "finished".
    """
    tmp_path=f"{path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)
    finally:
        # Only present here if the dump or the move failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Run every configured model on every dataset. A combination whose results
# file already exists is skipped; a failure in one combination is reported
# with its full traceback and does not stop the remaining ones.
for model_name, (Model, pipe_params) in models.items():
    for dataset_name,dataset in matrices.items():
        try:
            print(f"Running {dataset_name} {model_name}")
            ff_cache=os.path.join(DIR_figure,'cache',f'{dataset_name}_{model_name}.pk')
            ff_results=os.path.join(DIR_figure,'results',f'{dataset_name}_{model_name}.pk')
            if os.path.exists(ff_results):
                print("Already exists. Skipping. ")
                continue
            # Create the output directories up front so a long run cannot fail
            # at the very end merely because a folder is missing.
            for ff in (ff_cache, ff_results):
                os.makedirs(os.path.dirname(ff), exist_ok=True)
            ko_data,growth_data, tree,carbons=dataset['ko_data'],dataset['growth_data'],dataset['tree'],dataset['carbons']
            pipe=PredictionPipeline(Model, tree=tree, carbons=carbons, p=p,  ff_results=ff_cache, **pipe_params)
            pipe.generate_splits(ko_data,growth_data)
            results=pipe.run()
            _save_results_atomically(results, ff_results)
            print(f"Finished {dataset_name} {model_name} ")
        except Exception:
            # Best-effort batch: keep going, but show where the failure came from.
            traceback.print_exc()
            print(f"Failed to run {dataset_name} {model_name}")


Running zeqian_genome null
Already exists. Skipping. 
Running zeqian_genome null_ooc
Already exists. Skipping. 
Running zeqian_genome NN
Already exists. Skipping. 
Running zeqian_genome NN_ooc
Already exists. Skipping. 
Running zeqian_genome RF
Already exists. Skipping. 
Running zeqian_genome RF_ooc
Already exists. Skipping. 
Running zeqian_genome GreedyFeatureSelection
Changed to /home/zeqianli/project/zeqian/Carbon/data/figure_data/3A/cache/zeqian_genome_GreedyFeatureSelection_2.pk. 


Generating splits...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 554.41it/s]
Training models...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [8:11:39<00:00, 294.99s/it]


Finished zeqian_genome GreedyFeatureSelection 
