State check

yzhao062 · Jun 3, 2018 · fec5e56 · fec5e56
1 parent 5374e33
commit fec5e56
Show file tree

Hide file tree

Showing 2 changed files with 127 additions and 17 deletions.
diff --git a/examples/feat_bagging_example.py b/examples/feat_bagging_example.py
@@ -26,31 +26,56 @@
 from pyod.models.knn import KNN
 from pyod.models.lof import LOF
 from pyod.models.iforest import IForest
+from pyod.models.hbos import HBOS
 from pyod.models.base import clone
 from pyod.utils.data import generate_data
 from pyod.utils.utility import precision_n_scores
 from sklearn.utils.estimator_checks import check_estimator
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import IsolationForest
+from sklearn.model_selection import train_test_split
 
+from scipy.io import loadmat
 from pyod.models.combination import average
+from sklearn.ensemble import RandomForestClassifier
 
 from sklearn.neighbors import LocalOutlierFactor
 import numpy as np
 
 if __name__ == "__main__":
-    contamination = 0.1  # percentage of outliers
-    n_train = 100
-    n_test = 50
+    # Benchmark data set, looked up inside the ``example_data`` directory
+    mat_file = 'cardio.mat'
 
-    X_train, y_train, c_train, X_test, y_test, c_test = generate_data(
-        n_train=n_train, n_test=n_test, contamination=contamination)
+    try:
+        mat = loadmat(os.path.join('example_data', mat_file))
 
-    X = np.asarray([[1, 2],
-                    [3, 4],
-                    [5, 6]])
-    w = [[0.2], [0.6]]
+    except (TypeError, IOError):
+        # Both failure modes are handled identically: report the problem
+        # and fall back to synthetic data.
+        print('{data_file} does not exist. Use generated data'.format(
+            data_file=mat_file))
+        X, y = generate_data(train_only=True)  # load data
+    else:
+        X = mat['X']
+        y = mat['y'].ravel()
 
-    average(X, w)
-
-# TODO: place holder only
+    # Hold out 40% of the samples; the fixed seed keeps the split repeatable
+    X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                        test_size=0.4,
+                                                        random_state=2)
+
+    # Only fitting is exercised here; no scoring or evaluation follows yet
+    clf_name = 'FBagging'
+    clf = FeatureBagging(base_estimator=KNN(), random_state=2)
+    clf.fit(X_train)
diff --git a/pyod/models/feat_bagging.py b/pyod/models/feat_bagging.py
@@ -11,20 +11,105 @@
 from pyod.utils.data import generate_data
 from sklearn.base import BaseEstimator
 
+from sklearn.ensemble import RandomForestClassifier
+
+import numpy as np
+from sklearn.utils.validation import check_random_state
+from sklearn.utils import check_array
+from sklearn.utils.random import sample_without_replacement
 from .base import BaseDetector
 
+MAX_INT = np.iinfo(np.int32).max
+
+
+def _generate_indices(random_state, bootstrap, n_population, n_samples):
+    """
+    Pick ``n_samples`` random indices out of ``range(n_population)``.
+
+    When ``bootstrap`` is true the indices come from
+    ``random_state.randint`` and may repeat; otherwise they are drawn with
+    ``sample_without_replacement``.
+
+    Adapted from sklearn/ensemble/bagging.py
+    """
+    if not bootstrap:
+        # Sampling without replacement
+        return sample_without_replacement(n_population, n_samples,
+                                          random_state=random_state)
+
+    # Sampling with replacement
+    return random_state.randint(0, n_population, n_samples)
+
+
+def _generate_bagging_indices(random_state, bootstrap_features, n_features,
+                              min_features, max_features):
+    """
+    Draw a random-sized, random subset of feature indices.
+
+    The subset size is chosen with ``randint(min_features, max_features)``;
+    the upper bound of ``randint`` is exclusive, so the size lies in
+    ``[min_features, max_features)``. The indices themselves are then drawn
+    from ``range(n_features)`` by ``_generate_indices``.
+
+    Modified from sklearn/ensemble/bagging.py
+    """
+    # Normalise whatever was passed into a usable random state
+    rng = check_random_state(random_state)
+
+    # How many features this draw will contain
+    n_drawn = rng.randint(min_features, max_features)
+
+    # Which features this draw will contain
+    return _generate_indices(rng, bootstrap_features, n_features, n_drawn)
+
 
 # TODO: place holder only
 class FeatureBagging(BaseDetector):
+    """
+    place holder only
+
+    """
+
     def __init__(self, base_estimator, n_estimators=10, contamination=0.1,
-                 min_features=0.5):
+                 min_features=0.5, max_features=1,
+                 bootstrap_features=False, random_state=None):
+        """
+        Store the ensemble configuration; nothing is computed here.
+
+        :param base_estimator: object kept as ``self.base_estimator``
+        :param n_estimators: number of feature subsets drawn by ``fit``
+        :param contamination: forwarded to ``BaseDetector.__init__``
+        :param min_features: multiplied by the number of input features in
+            ``fit`` to get the lower bound of the subset size
+        :param max_features: multiplied by the number of input features in
+            ``fit`` to get the upper bound of the subset size
+        :param bootstrap_features: if true, feature indices are drawn with
+            ``randint`` (repeats possible); otherwise with
+            ``sample_without_replacement``
+        :param random_state: value handed to ``check_random_state`` in
+            ``fit``
+        """
         super(FeatureBagging, self).__init__(contamination=contamination)
-        self.base_estimator_ = base_estimator
-        self.n_estimators_ = n_estimators
-        self.min_features_ = min_features
+        self.base_estimator = base_estimator
+        self.n_estimators = n_estimators
+        self.min_features = min_features
+        self.max_features = max_features
+        self.bootstrap_features = bootstrap_features
+        self.random_state = random_state
 
     def fit(self, X, y=None):
-        pass
+        """
+        Draw one random feature subset per estimator.
+
+        Only the feature subsets are generated and stored in
+        ``estimators_features_``; no base estimator is fitted here yet.
+
+        :param X: training samples, validated with ``check_array``
+        :param y: not used
+        :return: self
+        :raises ValueError: if ``n_estimators`` is smaller than the number
+            of estimators already stored
+        """
+        random_state = check_random_state(self.random_state)
+
+        X = check_array(X)
+        self.n_features_ = X.shape[1]
+
+        # TODO add a check for min_features, e.g. d<=3 & max_features as well
+        # at least 0.5 of total
+        self.min_features_ = int(self.n_features_ * self.min_features)
+        self.max_features_ = int(self.n_features_ * self.max_features)
+
+        self.estimators_ = []
+        self.estimators_features_ = []
+
+        n_more_estimators = self.n_estimators - len(self.estimators_)
+
+        if n_more_estimators < 0:
+            # This class has no warm_start option, so the message must not
+            # refer to one.
+            raise ValueError('n_estimators=%d must be larger or equal to '
+                             'len(estimators_)=%d'
+                             % (self.n_estimators, len(self.estimators_)))
+
+        # One seed per estimator, all derived from ``random_state``
+        seeds = random_state.randint(MAX_INT, size=n_more_estimators)
+        self._seeds = seeds
+
+        for seed in seeds:
+            random_state = np.random.RandomState(seed)
+
+            features = _generate_bagging_indices(random_state,
+                                                 self.bootstrap_features,
+                                                 self.n_features_,
+                                                 self.min_features_,
+                                                 self.max_features_)
+
+            self.estimators_features_.append(features)
+
+        # Return the detector itself so calls can be chained
+        return self
 
     def decision_function(self, X):
+        # Placeholder: no scoring is implemented, so this returns None.
         pass