In [8]:
%matplotlib inline
from __future__ import absolute_import, division, print_function, unicode_literals
from builtins import range

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style("white")

from functools import partial
from joblib import delayed, Parallel
from mmit import MaxMarginIntervalTree
from mmit.core.solver import compute_optimal_costs
from mmit.metrics import mean_squared_error, zero_one_loss
from mmit.model_selection import GridSearchCV
from os import listdir, mkdir
from os.path import abspath, basename, exists, join
from shutil import rmtree as rmdir
from time import time

import logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)

In [3]:
class Dataset(object):
    """In-memory copy of one dataset stored as a directory of CSV files.

    The directory must contain ``features.csv``, ``targets.csv`` and
    ``folds.csv``; each one is read with ``pandas.read_csv`` using its
    default options.

    Parameters
    ----------
    path : str
        Directory holding the three CSV files.

    Attributes
    ----------
    path : str
        The directory exactly as it was passed in.
    X : numpy.ndarray
        Values of ``features.csv`` (one row per example, one column per
        feature).
    feature_names : numpy.ndarray
        Column labels of ``features.csv``.
    y : numpy.ndarray
        Values of ``targets.csv``.
        NOTE(review): presumably one (lower, upper) interval per row --
        confirm against the data files.
    folds : numpy.ndarray
        Values of ``folds.csv`` flattened to one dimension.
    name : str
        Last component of the dataset directory.
    """

    def __init__(self, path):
        self.path = path
        feature_data = pd.read_csv(join(path, "features.csv"))
        self.X = feature_data.values
        self.feature_names = feature_data.columns.values
        # Drop the DataFrame wrapper; only the arrays extracted above are kept.
        del feature_data
        self.y = pd.read_csv(join(path, "targets.csv")).values
        self.folds = pd.read_csv(join(path, "folds.csv")).values.reshape(-1, )
        # basename() of a path ending with a separator is "", which would
        # give the dataset an empty name. abspath() normalises the path
        # (including stripping a trailing separator) before the last
        # component is taken.
        self.name = basename(abspath(path))

    @property
    def n_examples(self):
        """Number of rows in ``X``."""
        return self.X.shape[0]

    @property
    def n_features(self):
        """Number of columns in ``X``."""
        return self.X.shape[1]
    
def find_datasets(path):
    """Lazily yield a ``Dataset`` for every complete dataset folder in `path`.

    An entry of `path` counts as a dataset when it contains all three of
    ``features.csv``, ``targets.csv`` and ``folds.csv``. Other entries are
    skipped silently. Entries are visited in the order ``listdir`` returns
    them, and each ``Dataset`` is built from the entry's absolute path.
    """
    required_files = ("features.csv", "targets.csv", "folds.csv")
    for entry in listdir(path):
        candidate = join(path, entry)
        if not all(exists(join(candidate, fname)) for fname in required_files):
            continue
        yield Dataset(abspath(candidate))

# Materialise the generator so the datasets can be iterated more than once.
datasets = [dataset for dataset in find_datasets("./data")]

In [4]:
def evaluate_on_dataset(d, metric, result_dir, param_grid=None):
    """Evaluate a grid-searched MaxMarginIntervalTree on the predefined folds of `d`.

    For every distinct value in ``d.folds`` the matching examples are held
    out, a ``GridSearchCV`` (10 inner folds, all cores) is fitted on the
    remaining examples, and its predictions for the held-out examples are
    recorded. Once every fold has been processed, the MSE and accuracy of
    the collected predictions are printed and the predictions are written,
    one per line, to ``<result_dir>/<d.name>/predictions.csv``.

    Parameters
    ----------
    d : object
        Dataset exposing ``name``, ``X``, ``y``, ``folds`` and ``n_examples``.
    metric : callable
        Passed to ``GridSearchCV`` as ``scoring``.
    result_dir : str
        Existing directory in which the per-dataset folder is created
        (``mkdir`` is not recursive, so `result_dir` itself must exist).
    param_grid : dict, optional
        Hyper-parameter grid for the search. When omitted, the module-level
        ``params`` dict is used, looked up at call time.

    Returns
    -------
    None
    """
    start_time = time()

    # Fall back to the notebook-wide grid so existing 3-argument calls keep working.
    if param_grid is None:
        param_grid = params

    ds_result_dir = join(result_dir, d.name)
    if not exists(ds_result_dir):
        mkdir(ds_result_dir)

    fold_predictions = np.zeros(d.n_examples)
    for fold in np.unique(d.folds):
        is_train = d.folds != fold
        is_test = ~is_train

        cv = GridSearchCV(estimator=MaxMarginIntervalTree(), param_grid=param_grid, cv=10, n_jobs=-1,
                          scoring=metric)
        cv.fit(d.X[is_train], d.y[is_train])
        # Each example belongs to exactly one fold, so every slot is filled once.
        fold_predictions[is_test] = cv.predict(d.X[is_test])

    print("MSE:", mean_squared_error(d.y, fold_predictions))
    print("ACC:", 1.0 - zero_one_loss(d.y, fold_predictions))
    # Context manager guarantees the file is flushed and closed.
    with open(join(ds_result_dir, "predictions.csv"), "w") as predictions_file:
        predictions_file.write("\n".join(str(x) for x in fold_predictions))
    print("Took", time() - start_time, "seconds.")

In [10]:
# Hyper-parameter grid shared by every experiment below: a single depth and
# split threshold, and ten log-spaced margins between 1e-3 and 1.
params = {
    "max_depth": [10],
    "min_samples_split": [0],
    "margin": np.logspace(-3, 0, num=10),
}

def prep_result_dir(result_dir):
    """Leave a fresh, empty directory at `result_dir`.

    WARNING: anything already stored under `result_dir` is deleted
    recursively before the directory is recreated.
    """
    already_there = exists(result_dir)
    if already_there:
        rmdir(result_dir)  # `rmdir` is shutil.rmtree under an alias
    mkdir(result_dir)

def mse_metric(estimator, X, y):
    """Scorer callable: mean squared error of `estimator`'s predictions for `X` against `y`.

    NOTE(review): this returns the raw MSE, where lower is better. If the
    GridSearchCV it is handed to maximises its scoring function (as
    scikit-learn's does), the value should be negated -- confirm against
    mmit.model_selection before relying on the selected hyper-parameters.
    """
    predictions = estimator.predict(X)
    return mean_squared_error(y_true=y, y_pred=predictions)

# Run the full evaluation once per loss function. Each run gets its own
# freshly emptied result directory, and the shared `params` grid is updated
# in place so evaluate_on_dataset picks up the selected loss.
for loss, result_dir in [("hinge", "mmit.linear.hinge"),
                         ("squared_hinge", "mmit.squared.hinge")]:
    params["loss"] = [loss]
    prep_result_dir(result_dir)
    for d in datasets:
        print(d.name)
        evaluate_on_dataset(d, mse_metric, result_dir)
        print()

H3K27ac-H3K4me3_TDHAM_BP_FPOP
MSE: 0.735491692741
ACC: 0.804278074866
Took 344.886688948 seconds.

H3K27ac-H3K4me3_TDHAM_BP_joint
MSE: 0.312839561755
ACC: 0.935018050542
Took 50.6701509953 seconds.

H3K36me3_AM_immune_FPOP
MSE: 0.152840906699
ACC: 0.85
Took 111.722827911 seconds.

H3K36me3_AM_immune_PDPA
MSE: 0.119073766409
ACC: 0.929460580913
Took 90.9721779823 seconds.

H3K36me3_TDH_immune_FPOP
MSE: 0.159665076283
ACC: 0.821428571429
Took 21.1072010994 seconds.

H3K36me3_TDH_immune_joint
MSE: 0.153445103742
ACC: 0.888888888889
Took 3.06977796555 seconds.

H3K36me3_TDH_immune_PDPA
MSE: 0.0116450732655
ACC: 0.904761904762
Took 19.9555151463 seconds.

H3K36me3_TDH_other_FPOP
MSE: 0.479629236311
ACC: 0.575
Took 10.7924818993 seconds.

H3K36me3_TDH_other_PDPA
MSE: 0.04380704961
ACC: 0.90625
Took 5.75361895561 seconds.

H3K4me3_PGP_immune_PDPA
MSE: 0.21055520913
ACC: 0.747524752475
Took 412.952256918 seconds.

H3K4me3_TDH_immune_FPOP
MSE: 0.551525301118
ACC: 0.470899470899
Took 351.8476769