Skip to content

Commit

Permalink
State check
Browse files Browse the repository at this point in the history
  • Loading branch information
yzhao062 authored and yuezhao@cs.toronto.edu committed Jun 3, 2018
1 parent 5374e33 commit fec5e56
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 17 deletions.
49 changes: 37 additions & 12 deletions examples/feat_bagging_example.py
Expand Up @@ -26,31 +26,56 @@
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.hbos import HBOS
from pyod.models.base import clone
from pyod.utils.data import generate_data
from pyod.utils.utility import precision_n_scores
from sklearn.utils.estimator_checks import check_estimator
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from scipy.io import loadmat
from pyod.models.combination import average
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import LocalOutlierFactor
import numpy as np

if __name__ == "__main__":
    # Example driver: load a benchmark data set (or fall back to generated
    # data), hold out part of it, and fit a FeatureBagging detector.
    #
    # Alternative fully synthetic setup kept for reference:
    #   X_train, y_train, X_test, y_test = generate_data(
    #       n_train=1000, n_test=500, contamination=0.1)
    mat_file = 'cardio.mat'

    try:
        # Only the load itself is guarded so unrelated errors are not
        # mistaken for a missing data file.
        mat = loadmat(os.path.join('example_data', mat_file))
    except (TypeError, IOError):
        # Both exception types are treated as "data file unavailable";
        # fall back to generated data.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    # TODO: place holder only
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.4,
                                                        random_state=2)

    clf_name = 'FBagging'
    clf = FeatureBagging(base_estimator=KNN(), random_state=2)
    clf.fit(X_train)
95 changes: 90 additions & 5 deletions pyod/models/feat_bagging.py
Expand Up @@ -11,20 +11,105 @@
from pyod.utils.data import generate_data
from sklearn.base import BaseEstimator

from sklearn.ensemble import RandomForestClassifier

import numpy as np
from sklearn.utils.validation import check_random_state
from sklearn.utils import check_array
from sklearn.utils.random import sample_without_replacement
from .base import BaseDetector

MAX_INT = np.iinfo(np.int32).max


def _generate_indices(random_state, bootstrap, n_population, n_samples):
    """Draw ``n_samples`` random indices out of ``n_population`` candidates.

    Adapted from sklearn/ensemble/bagging.py.

    Parameters
    ----------
    random_state : object
        Random number generator; its ``randint`` method is used when
        ``bootstrap`` is true, otherwise it is handed to
        ``sample_without_replacement``.
    bootstrap : bool
        If true, indices are drawn with replacement; otherwise without.
    n_population : int
        Indices are drawn from ``0`` up to, but excluding, this value.
    n_samples : int
        Number of indices to draw.

    Returns
    -------
    indices : array of drawn indices
    """
    if not bootstrap:
        # Without replacement: every index appears at most once.
        return sample_without_replacement(n_population, n_samples,
                                          random_state=random_state)

    # With replacement: independent uniform draws, repeats are possible.
    return random_state.randint(0, n_population, n_samples)


def _generate_bagging_indices(random_state, bootstrap_features, n_features,
                              min_features, max_features):
    """Randomly pick a feature subset of random size.

    Modified from sklearn/ensemble/bagging.py.

    Parameters
    ----------
    random_state : None, int or RandomState
        Seed or generator; normalised through ``check_random_state``.
    bootstrap_features : bool
        Whether the feature indices are drawn with replacement.
    n_features : int
        Total number of features to choose from.
    min_features : int
        Smallest subset size that may be drawn (inclusive).
    max_features : int
        Upper bound on the subset size. It is passed as the ``high``
        argument of ``randint`` and is therefore exclusive.

    Returns
    -------
    feature_indices : array of drawn feature indices
    """
    rng = check_random_state(random_state)

    # The subset size is drawn first, then that many feature indices.
    subset_size = rng.randint(min_features, max_features)

    return _generate_indices(rng, bootstrap_features, n_features,
                             subset_size)


# TODO: place holder only
class FeatureBagging(BaseDetector):
    """Feature bagging detector (place holder, work in progress).

    ``fit`` currently only draws one random feature subset per ensemble
    member; the base estimators are not fitted yet and
    ``decision_function`` is a stub.

    Parameters
    ----------
    base_estimator : object
        Detector intended to be fitted on each feature subset (stored only
        for now).
    n_estimators : int, optional (default=10)
        Number of feature subsets to draw.
    contamination : float, optional (default=0.1)
        Forwarded to ``BaseDetector.__init__``.
    min_features : float, optional (default=0.5)
        Fraction of the input features used as the lower bound of the
        subset size (``int(n_features * min_features)``).
    max_features : float, optional (default=1)
        Fraction of the input features used as the upper bound of the
        subset size (``int(n_features * max_features)``). The bound is
        exclusive because it is used as ``high`` in ``randint``.
    bootstrap_features : bool, optional (default=False)
        Whether features are drawn with replacement.
    random_state : None, int or RandomState, optional (default=None)
        Seed or generator controlling the feature draws.
    """

    def __init__(self, base_estimator, n_estimators=10, contamination=0.1,
                 min_features=0.5, max_features=1,
                 bootstrap_features=False, random_state=None):
        super(FeatureBagging, self).__init__(contamination=contamination)
        # Constructor arguments are stored unmodified; derived values
        # (trailing underscore) are only created in ``fit``.
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.min_features = min_features
        self.max_features = max_features
        self.bootstrap_features = bootstrap_features
        self.random_state = random_state

    def fit(self, X, y=None):
        """Draw the feature subset of every ensemble member.

        Parameters
        ----------
        X : array-like
            Input samples; validated with ``check_array``. The number of
            columns defines the feature count.
        y : ignored

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``n_estimators`` is negative, or if the derived feature
            bounds leave no subset size to draw.
        """
        random_state = check_random_state(self.random_state)

        X = check_array(X)
        self.n_features_ = X.shape[1]

        if self.n_estimators < 0:
            raise ValueError('n_estimators=%d must be non-negative'
                             % self.n_estimators)

        # Fractions are converted to absolute feature counts (truncated).
        self.min_features_ = int(self.n_features_ * self.min_features)
        self.max_features_ = int(self.n_features_ * self.max_features)

        # The subset size is drawn from [min_features_, max_features_), so
        # an empty range is rejected here with an explicit message instead
        # of surfacing later as an opaque randint error.
        if self.min_features_ >= self.max_features_:
            raise ValueError(
                'min_features_=%d must be smaller than max_features_=%d '
                '(n_features=%d)' % (self.min_features_,
                                     self.max_features_,
                                     self.n_features_))

        self.estimators_ = []
        self.estimators_features_ = []

        # One independent seed per ensemble member, kept for inspection.
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)
        self._seeds = seeds

        for seed in seeds:
            features = _generate_bagging_indices(
                np.random.RandomState(seed),
                self.bootstrap_features,
                self.n_features_,
                self.min_features_,
                self.max_features_)
            self.estimators_features_.append(features)

        return self

    def decision_function(self, X):
        """Place holder: not implemented yet, returns ``None``."""
        pass

0 comments on commit fec5e56

Please sign in to comment.