In [1]:
import sys
if '../' not in sys.path: sys.path.append('../')

In [2]:
%load_ext autoreload
%autoreload 2

from itertools import izip

import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import log_loss

from RandomForestMultivariate import RandomForest
from data.dataset import IrisDataSet, WineDataSet, BupaDataSet

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# data = IrisDataSet().load()
data = WineDataSet().load()
# data = BupaDataSet().load()

X_train, Y_train, X_test, Y_test = data.split(p_test=0.25)

x.shape: (178, 13)
y.shape: (178,) 

y unique: [1.0, 2.0, 3.0]
Y_train:  [ 1.  2.  3.]
Y_test:   [ 1.  2.  3.]


In [4]:
chosen_class = 2
Y_train = (Y_train == chosen_class).astype(int)
Y_test = (Y_test == chosen_class).astype(int)

# Attempt 1

In [5]:
from sklearn.ensemble import VotingClassifier

In [6]:
cls = [('rf', RandomForest(n_trees=100, p_items=1.0, p_features=0.8)),
       ('lr', LogisticRegression(C=1e7, tol=1e-7)),
       ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1))]

for i, cl in cls:
    cl.fit(X_train, Y_train)

Fit is complete!


In [7]:
Y_pred = []

for i, cl in cls:
    Y_pred.append(cl.predict_proba(X_test)[:, 1])
    print log_loss(Y_test, Y_pred[-1])

0.0814365749773
0.882731934696
0.12226796034


In [8]:
log_loss(Y_test, np.average(Y_pred, axis=0))

0.092277486662292713

# Attempt 2

In [9]:
class WeightedModel:
    def __init__(self, X, Y):
        self.cls = [
            RandomForest(n_trees=100, p_items=1.0, p_features=0.8),
            LogisticRegression(C=1e7, tol=1e-7),
            GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
        ]
        
        X_train, Y_train, X_test, Y_test = self.generate(X, Y, frac_test=0.3)
        
        Y_pred = []
        for cl in self.cls:
            cl.fit(*self.generate(X_train, Y_train, frac_test=0.2)[:2])
            Y_pred.append(cl.predict_proba(X_test)[:, 1])
            print Y_pred[-1].shape
        
        Y_pred = self.change_dataset(Y_pred)
        print Y_pred.shape
        
        self.predictor = RandomForest(n_trees=100, p_items=1.0, p_features=0.8)
        self.predictor.fit(Y_pred, Y_test)
        
    def generate(self, X, Y, frac_test=0.1):
        # boarder = int(len(Y) * (1.0 - frac_test))
        
        # index = np.random.permutation(len(Y))
        # index_train, index_test = index[:boarder], index[boarder:]
        
        border_p = int(sum(Y == 1) * (1.0 - frac_test))
        border_n = int(sum(Y == 0) * (1.0 - frac_test))
        
        index_p = np.random.permutation(np.where(np.asarray(Y) == 1)[0])
        index_n = np.random.permutation(np.where(np.asarray(Y) == 0)[0])
        
        index_train = np.hstack((index_p[:border_p], index_n[:border_n]))
        index_test  = np.hstack((index_p[border_p:], index_n[border_n:]))
        
        X_train, X_test = X[index_train], X[index_test]
        Y_train, Y_test = Y[index_train], Y[index_test]
        
        return X_train, Y_train, X_test, Y_test
    
    def change_dataset(self, x):
        if np.asarray(x).ndim == 3:
            return np.hstack(x)
        else:
            x = np.asarray(x).T
            x = np.hstack([x, (x > 0.5).sum(axis=1).reshape(-1, 1)])
            return x
    
    def predict_proba(self, X_test):
        Y_pred = []
        for cl in self.cls:
            Y_pred.append(cl.predict_proba(X_test)[:, 1])
            print Y_pred[-1].shape
        
        Y_pred = self.change_dataset(Y_pred)
        print Y_pred.shape
        
        return self.predictor.predict_proba(Y_pred)

In [10]:
model = WeightedModel(X_train, Y_train)

Fit is complete!
(41,)
(41,)
(41,)
(41, 4)
Fit is complete!


In [11]:
Y_pred = model.predict_proba(X_test)
log_loss(Y_test, Y_pred)

(45,)
(45,)
(45,)
(45, 4)


0.083250221227927884