diff --git a/README.rst b/README.rst
index 529a89c..293b520 100644
--- a/README.rst
+++ b/README.rst
@@ -114,15 +114,23 @@ combo is featured for:
     y_test_proba = clf.predict_proba(X_test)  # probability prediction

+**Key Links and Resources**\ :
+
+
+* `View the latest code on GitHub <https://github.com/yzhao062/combo>`_
+* `View the documentation & API <https://combo.readthedocs.io>`_
+* `View all examples <https://github.com/yzhao062/combo/tree/master/examples>`_
+
+
 **Table of Contents**\ :

 * `Installation <#installation>`_
 * `API Cheatsheet & Reference <#api-cheatsheet--reference>`_
 * `Implemented Algorithms <#implemented-algorithms>`_
-* `An Example of Stacking <#an-example-of-stacking>`_
-* `Quick Start for Classifier Combination <#quick-start-for-classifier-combination>`_
-* `Quick Start for Clustering Combination <#quick-start-for-clustering-combination>`_
+* `Example of Stacking <#example-of-stacking>`_
+* `Example of Classifier Combination <#example-of-classifier-combination>`_
+* `Example of Clustering Combination <#example-of-clustering-combination>`_
 * `Development Status <#development-status>`_

@@ -220,8 +228,14 @@ Anomaly Detection     Locally Selective Combination (LSCP)

 ----

-An Example of Stacking
-^^^^^^^^^^^^^^^^^^^^^^
+**All implemented models** are associated with examples; see
+`"combo examples" <https://github.com/yzhao062/combo/tree/master/examples>`_
+for more information.
+
+
+Example of Stacking
+^^^^^^^^^^^^^^^^^^^
+

 `"examples/stacking_example.py" <https://github.com/yzhao062/combo/blob/master/examples/stacking_example.py>`_
 demonstrates the basic API of stacking (meta ensembling).

@@ -233,9 +247,11 @@ demonstrates the basic API of stacking (meta ensembling).

        # initialize a group of classifiers
-       classifiers = [DecisionTreeClassifier(), LogisticRegression(),
-                      KNeighborsClassifier(), RandomForestClassifier(),
-                      GradientBoostingClassifier()]
+       classifiers = [DecisionTreeClassifier(random_state=random_state),
+                      LogisticRegression(random_state=random_state),
+                      KNeighborsClassifier(),
+                      RandomForestClassifier(random_state=random_state),
+                      GradientBoostingClassifier(random_state=random_state)]

 #. Initialize, fit, predict, and evaluate with Stacking

@@ -255,7 +271,7 @@ demonstrates the basic API of stacking (meta ensembling).

 #. See a sample output of stacking_example.py

-    .. code-block:: python
+    .. code-block:: bash

        Decision Tree   | Accuracy:0.9386, ROC:0.9383, F1:0.9521

@@ -270,8 +286,9 @@ demonstrates the basic API of stacking (meta ensembling).

 ----

-Quick Start for Classifier Combination
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Example of Classifier Combination
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+

 `"examples/classifier_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/classifier_comb_example.py>`_
 demonstrates the basic API of predicting with multiple classifiers. **It is noted that the API across all other algorithms is consistent/similar**.

@@ -282,9 +299,11 @@ demonstrates the basic API of predicting with multiple classifiers. **It is note

        # initialize a group of classifiers
-       classifiers = [DecisionTreeClassifier(), LogisticRegression(),
-                      KNeighborsClassifier(), RandomForestClassifier(),
-                      GradientBoostingClassifier()]
+       classifiers = [DecisionTreeClassifier(random_state=random_state),
+                      LogisticRegression(random_state=random_state),
+                      KNeighborsClassifier(),
+                      RandomForestClassifier(random_state=random_state),
+                      GradientBoostingClassifier(random_state=random_state)]

 #. Initialize, fit, predict, and evaluate with a simple aggregator (average)

@@ -303,7 +322,7 @@ demonstrates the basic API of predicting with multiple classifiers. **It is note

 #. See a sample output of classifier_comb_example.py

-    .. code-block:: python
+    .. code-block:: bash

       Decision Tree   | Accuracy:0.9386, ROC:0.9383, F1:0.9521
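
The aggregator in this hunk is not limited to averaging. As a minimal sketch
(the ``method='maximization'`` option is taken from the comparison script
removed at the end of this patch; the dataset and split here are illustrative
choices, not part of the patch), the same ``SimpleClassifierAggregator`` can
keep the most confident prediction instead:

    .. code-block:: python

       # Sketch: combine classifiers by maximization instead of averaging.
       from sklearn.datasets import load_breast_cancer
       from sklearn.model_selection import train_test_split
       from sklearn.tree import DecisionTreeClassifier
       from sklearn.linear_model import LogisticRegression
       from sklearn.metrics import accuracy_score

       from combo.models.classifier_comb import SimpleClassifierAggregator

       X, y = load_breast_cancer(return_X_y=True)
       X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size=0.4, random_state=42)

       classifiers = [DecisionTreeClassifier(random_state=42),
                      LogisticRegression(random_state=42)]

       # 'maximization' keeps the most confident probability across the
       # base classifiers rather than their mean
       clf = SimpleClassifierAggregator(base_estimators=classifiers,
                                        method='maximization')
       clf.fit(X_train, y_train)
       print('Accuracy:', accuracy_score(y_test, clf.predict(X_test)))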
@@ -322,8 +341,9 @@ demonstrates the basic API of predicting with multiple classifiers. **It is note

 ----

-Quick Start for Clustering Combination
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Example of Clustering Combination
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+

 `"examples/cluster_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/cluster_comb_example.py>`_
 demonstrates the basic API of combining multiple base clustering estimators.

@@ -333,8 +353,6 @@ demonstrates the basic API of combining multiple base clustering estimators.

     .. code-block:: python

-       from combo.models.cluster_comb import ClustererEnsemble
-
        # Initialize a set of estimators
        estimators = [KMeans(n_clusters=n_clusters),
                      MiniBatchKMeans(n_clusters=n_clusters),

@@ -346,6 +364,7 @@ demonstrates the basic API of combining multiple base clustering estimators.

     .. code-block:: python

+       from combo.models.cluster_comb import ClustererEnsemble
        # combine by Clusterer Ensemble
        clf = ClustererEnsemble(estimators, n_clusters=n_clusters)
        clf.fit(X)

@@ -361,6 +380,63 @@ demonstrates the basic API of combining multiple base clustering estimators.

        predicted_labels = clf.labels_

+
+Example of Outlier Detection Combination
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+`"examples/detector_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/detector_comb_example.py>`_
+demonstrates the basic API of combining multiple base outlier detectors.
+
+#. Initialize a group of outlier detection methods as base estimators
+
+    .. code-block:: python
+
+
+       # Initialize a set of estimators
+       detectors = [KNN(), LOF(), OCSVM()]
+
+
+#. Initialize a simple averaging aggregator, fit the model, and make
+   the prediction.
+
+    .. code-block:: python
+
+
+       from combo.models.detector_comb import SimpleDetectorAggregator
+       clf = SimpleDetectorAggregator(base_estimators=detectors)
+       clf_name = 'Aggregation by Averaging'
+       clf.fit(X_train)
+
+       y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+       y_train_scores = clf.decision_scores_  # raw outlier scores
+
+       # get the prediction on the test data
+       y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+       y_test_scores = clf.decision_function(X_test)  # outlier scores
+
+
+#. Evaluate the prediction using ROC and Precision @ Rank n.
+
+    .. code-block:: python
+
+       # evaluate and print the results
+       print("\nOn Training Data:")
+       evaluate_print(clf_name, y_train, y_train_scores)
+       print("\nOn Test Data:")
+       evaluate_print(clf_name, y_test, y_test_scores)
+
+#. See sample outputs on both training and test data.
+
+    .. code-block:: bash
+
+       On Training Data:
+       Aggregation by Averaging ROC:0.9994, precision @ rank n:0.95
+
+       On Test Data:
+       Aggregation by Averaging ROC:1.0, precision @ rank n:1.0
+
+
 ----
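
One clarifying note on the metric used above before moving to the
documentation changes: "precision @ rank n" is the fraction of true outliers
among the n highest-scored points, where n is the number of outliers in the
ground truth. A minimal, self-contained sketch (default averaging aggregator
and the same synthetic data as the patched example script; the by-hand metric
computation is illustrative):

    .. code-block:: python

       import numpy as np
       from sklearn.metrics import roc_auc_score

       from pyod.models.knn import KNN
       from pyod.models.lof import LOF
       from pyod.utils.data import generate_data

       from combo.models.detector_comb import SimpleDetectorAggregator

       # same synthetic setup as examples/temp_do_not_use.py below
       X_train, y_train, X_test, y_test = generate_data(
           n_train=200, n_test=100, n_features=2,
           contamination=0.1, random_state=42)

       clf = SimpleDetectorAggregator(base_estimators=[KNN(), LOF()])
       clf.fit(X_train)
       y_test_scores = clf.decision_function(X_test)

       # ROC via scikit-learn; precision @ rank n by hand: take the n
       # points with the highest outlier scores and count how many of
       # them are labeled outliers
       n = int(np.sum(y_test))
       top_n = np.argsort(y_test_scores)[-n:]
       print('ROC: %.4f' % roc_auc_score(y_test, y_test_scores))
       print('precision @ rank n: %.4f' % (np.sum(y_test[top_n]) / n))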
diff --git a/docs/example.rst b/docs/example.rst
index 0830da8..463d0a9 100644
--- a/docs/example.rst
+++ b/docs/example.rst
@@ -2,11 +2,17 @@ Examples by Tasks
 =================

+**All implemented models** are associated with examples; see
+`"combo examples" <https://github.com/yzhao062/combo/tree/master/examples>`_
+for more information.
+
+
 ----

-An Example of Stacking
-^^^^^^^^^^^^^^^^^^^^^^
+Example of Stacking
+^^^^^^^^^^^^^^^^^^^
+

 `"examples/stacking_example.py" <https://github.com/yzhao062/combo/blob/master/examples/stacking_example.py>`_
 demonstrates the basic API of stacking (meta ensembling).

@@ -42,7 +48,7 @@ demonstrates the basic API of stacking (meta ensembling).

 #. See a sample output of stacking_example.py

-    .. code-block:: python
+    .. code-block:: bash

       Decision Tree   | Accuracy:0.9386, ROC:0.9383, F1:0.9521

@@ -57,8 +63,9 @@ demonstrates the basic API of stacking (meta ensembling).

 ----

-Quick Start for Classifier Combination
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Example of Classifier Combination
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+

 `"examples/classifier_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/classifier_comb_example.py>`_
 demonstrates the basic API of predicting with multiple classifiers. **It is noted that the API across all other algorithms is consistent/similar**.

@@ -92,7 +99,7 @@ demonstrates the basic API of predicting with multiple classifiers. **It is note

 #. See a sample output of classifier_comb_example.py

-    .. code-block:: python
+    .. code-block:: bash

       Decision Tree   | Accuracy:0.9386, ROC:0.9383, F1:0.9521

@@ -111,8 +118,9 @@ demonstrates the basic API of predicting with multiple classifiers. **It is note

 ----

-Quick Start for Clustering Combination
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Example of Clustering Combination
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+

 `"examples/cluster_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/cluster_comb_example.py>`_
 demonstrates the basic API of combining multiple base clustering estimators.

@@ -122,8 +130,6 @@ demonstrates the basic API of combining multiple base clustering estimators.

     .. code-block:: python

-       from combo.models.cluster_comb import ClustererEnsemble
-
        # Initialize a set of estimators
        estimators = [KMeans(n_clusters=n_clusters),
                      MiniBatchKMeans(n_clusters=n_clusters),

@@ -135,6 +141,7 @@ demonstrates the basic API of combining multiple base clustering estimators.

     .. code-block:: python

+       from combo.models.cluster_comb import ClustererEnsemble
        # combine by Clusterer Ensemble
        clf = ClustererEnsemble(estimators, n_clusters=n_clusters)
        clf.fit(X)

@@ -151,4 +158,59 @@ demonstrates the basic API of combining multiple base clustering estimators.


+Example of Outlier Detection Combination
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+`"examples/detector_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/detector_comb_example.py>`_
+demonstrates the basic API of combining multiple base outlier detectors.
+
+#. Initialize a group of outlier detection methods as base estimators
+
+    .. code-block:: python
+
+
+       # Initialize a set of estimators
+       detectors = [KNN(), LOF(), OCSVM()]
+
+
+#. Initialize a simple averaging aggregator, fit the model, and make
+   the prediction.
+
+    .. code-block:: python
+
+
+       from combo.models.detector_comb import SimpleDetectorAggregator
+       clf = SimpleDetectorAggregator(base_estimators=detectors)
+       clf_name = 'Aggregation by Averaging'
+       clf.fit(X_train)
+
+       y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+       y_train_scores = clf.decision_scores_  # raw outlier scores
+
+       # get the prediction on the test data
+       y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+       y_test_scores = clf.decision_function(X_test)  # outlier scores
+
+
+#. Evaluate the prediction using ROC and Precision @ Rank n.
+
+    .. code-block:: python
+
+       # evaluate and print the results
+       print("\nOn Training Data:")
+       evaluate_print(clf_name, y_train, y_train_scores)
+       print("\nOn Test Data:")
+       evaluate_print(clf_name, y_test, y_test_scores)
+
+#. See sample outputs on both training and test data.
+
+    .. code-block:: bash
+
+       On Training Data:
+       Aggregation by Averaging ROC:0.9994, precision @ rank n:0.95
+
+       On Test Data:
+       Aggregation by Averaging ROC:1.0, precision @ rank n:1.0
+
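
For completeness, the clustering-combination snippets above can be run
end-to-end. A self-contained sketch (the iris data, ``n_clusters=3``, and the
particular base estimators are illustrative choices; the
``ClustererEnsemble`` calls follow the API documented in the hunks above):

    .. code-block:: python

       from sklearn.cluster import KMeans, MiniBatchKMeans
       from sklearn.datasets import load_iris

       from combo.models.cluster_comb import ClustererEnsemble

       X, _ = load_iris(return_X_y=True)
       n_clusters = 3

       # base clusterings to be combined
       estimators = [KMeans(n_clusters=n_clusters, random_state=42),
                     MiniBatchKMeans(n_clusters=n_clusters, random_state=42),
                     KMeans(n_clusters=n_clusters, init='random',
                            random_state=1)]

       # combine by Clusterer Ensemble; the aligned cluster assignment
       # is exposed via labels_ after fitting
       clf = ClustererEnsemble(estimators, n_clusters=n_clusters)
       clf.fit(X)
       print(clf.labels_[:10])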
diff --git a/docs/index.rst b/docs/index.rst
index d4d13c9..023b1d9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,7 +1,7 @@
 .. combo documentation master file, created by
-   sphinx-quickstart on Tue Jul 16 15:42:55 2019.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+sphinx-quickstart on Tue Jul 16 15:42:55 2019.
+You can adapt this file completely to your liking, but it should at least
+contain the root `toctree` directive.

 Welcome to combo's documentation!
 =================================

@@ -119,6 +119,14 @@ combo is featured for:

     y_test_proba = clf.predict_proba(X_test)  # probability prediction

+**Key Links and Resources**\ :
+
+
+* `View the latest code on GitHub <https://github.com/yzhao062/combo>`_
+* `View the documentation & API <https://combo.readthedocs.io>`_
+* `View all examples <https://github.com/yzhao062/combo/tree/master/examples>`_
+
+
 ----
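
The comparison script removed in the next diff also exercises a stacking
variant with a custom meta-classifier, a usage that disappears with this
patch. A minimal sketch of it (``Stacking``, ``shuffle_data``, and
``meta_clf`` are taken from the removed code below; the dataset and split are
illustrative):

    .. code-block:: python

       from sklearn.datasets import load_breast_cancer
       from sklearn.model_selection import train_test_split
       from sklearn.linear_model import LogisticRegression
       from sklearn.neighbors import KNeighborsClassifier
       from sklearn.ensemble import RandomForestClassifier
       from sklearn.metrics import roc_auc_score

       from combo.models.stacking import Stacking

       X, y = load_breast_cancer(return_X_y=True)
       X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size=0.4, random_state=42)

       base_clfs = [LogisticRegression(random_state=42),
                    KNeighborsClassifier()]

       # replace the default meta-classifier with a random forest
       clf = Stacking(base_estimators=base_clfs, shuffle_data=True,
                      meta_clf=RandomForestClassifier(random_state=42))
       clf.fit(X_train, y_train)
       y_proba = clf.predict_proba(X_test)[:, 1]
       print('Stacking ROC: %.4f' % roc_auc_score(y_test, y_proba))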
diff --git a/examples/temp_do_not_use.py b/examples/temp_do_not_use.py
index 703d23e..04173fd 100644
--- a/examples/temp_do_not_use.py
+++ b/examples/temp_do_not_use.py
@@ -20,118 +20,45 @@ import warnings

 warnings.filterwarnings("ignore")

-import numpy as np
-from numpy import percentile
-import matplotlib.pyplot as plt
-import matplotlib.font_manager
-
-# Import all models
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import AdaBoostClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.svm import SVC
-from sklearn.neighbors import KNeighborsClassifier
-
-from combo.models.classifier_comb import SimpleClassifierAggregator
-from combo.models.stacking import Stacking
-
-# Define the number of inliers and outliers
-n_samples = 300
-outliers_fraction = 0.5
-clusters_separation = [3]
-
-# Compare given detectors under given settings
-# Initialize the data
-xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
-n_inliers = int((1. - outliers_fraction) * n_samples)
-n_outliers = int(outliers_fraction * n_samples)
-ground_truth = np.zeros(n_samples, dtype=int)
-ground_truth[-n_outliers:] = 1
-
-# Show the statics of the data
-print('Number of Class 0: %i' % n_inliers)
-print('Number of Class 1: %i' % n_outliers)
-print('Ground truth shape is {shape}.\n'.format(shape=ground_truth.shape))
-print(ground_truth, '\n')
-
-random_state = np.random.RandomState(42)
-
-classifiers = [LogisticRegression(), GaussianNB(), SVC(probability=True),
-               KNeighborsClassifier()]
-
-# Define nine outlier detection tools to be compared
-classifiers = {
-    # 'Decision Tree': DecisionTreeClassifier(),
-    'Logistic Regression': LogisticRegression(),
-    # 'Ada': AdaBoostClassifier(random_state=random_state),
-    # 'Random Forest': RandomForestClassifier(random_state=random_state),
-    'Gaussian NB': GaussianNB(),
-    'Support Vector Machine': SVC(probability=True),
-    'k Nearst Neighbors': KNeighborsClassifier(),
-    'Simple Average': SimpleClassifierAggregator(base_estimators=classifiers,
-                                                 method='average'),
-    'Simple Maximization': SimpleClassifierAggregator(
-        base_estimators=classifiers, method='maximization'),
-    'Stacking': Stacking(base_estimators=classifiers, shuffle_data=True),
-    'Stacking_RF': Stacking(base_estimators=classifiers, shuffle_data=True,
-                            meta_clf=RandomForestClassifier(
-                                random_state=random_state))
-}
-
-# Show all detectors
-for i, clf in enumerate(classifiers.keys()):
-    print('Model', i + 1, clf)
-
-# Fit the models with the generated data and
-# compare model performances
-for i, offset in enumerate(clusters_separation):
-    np.random.seed(42)
-    # Data generation
-    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
-    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
-    X = np.r_[X1, X2]
-    # Add outliers
-    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
-
-    # Fit the model
-    plt.figure(figsize=(15, 8))
-    for i, (clf_name, clf) in enumerate(classifiers.items()):
-        print()
-        print(i + 1, 'fitting', clf_name)
-        # fit the data and tag outliers
-
-        clf.fit(X, ground_truth)
-        scores_pred = clf.predict_proba(X)[:, 1] * -1
-
-        y_pred = clf.predict(X)
-        threshold = percentile(scores_pred, 100 * outliers_fraction)
-        n_errors = (y_pred != ground_truth).sum()
-        # plot the levels lines and the points
-        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] * -1
-        Z = Z.reshape(xx.shape)
-        subplot = plt.subplot(2, 4, i + 1)
-        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
-                         cmap=plt.cm.Blues_r)
-        a = subplot.contour(xx, yy, Z, levels=[threshold],
-                            linewidths=2, colors='red')
-        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
-                         colors='orange')
-        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',
-                            s=20, edgecolor='k')
-        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',
-                            s=20, edgecolor='k')
-        subplot.axis('tight')
-        subplot.legend(
-            [a.collections[0], b, c],
-            ['learned boundary', 'class 0', 'class 1'],
-            prop=matplotlib.font_manager.FontProperties(size=10),
-            loc='lower right')
-        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
-        subplot.set_xlim((-7, 7))
-        subplot.set_ylim((-7, 7))
-    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
-    plt.suptitle("Model Combination")
-plt.savefig('ALL.png', dpi=300)
-plt.show()
+
+from pyod.models.knn import KNN
+from pyod.models.lof import LOF
+from pyod.models.ocsvm import OCSVM
+from pyod.utils.data import generate_data
+from pyod.utils.data import evaluate_print
+from pyod.utils.example import visualize
+
+from combo.models.detector_comb import SimpleDetectorAggregator
+
+contamination = 0.1  # percentage of outliers
+n_train = 200  # number of training points
+n_test = 100  # number of testing points
+
+# Generate sample data
+X_train, y_train, X_test, y_test = \
+    generate_data(n_train=n_train,
+                  n_test=n_test,
+                  n_features=2,
+                  contamination=contamination,
+                  random_state=42)
+
+detectors = [KNN(), LOF(), OCSVM()]
+
+clf = SimpleDetectorAggregator(base_estimators=detectors)
+clf.fit(X_train)
+
+# get the prediction labels and outlier scores of the training data
+y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+y_train_scores = clf.decision_scores_  # raw outlier scores
+
+# get the prediction on the test data
+y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+y_test_scores = clf.decision_function(X_test)  # outlier scores
+
+# evaluate and print the results
+print("\nOn Training Data:")
+evaluate_print('Average', y_train, y_train_scores)
+print("\nOn Test Data:")
+evaluate_print('Average', y_test, y_test_scores)
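
One loose end in the rewritten script: it imports pyod's ``visualize`` helper
but never calls it. If a plot of the combined detector is wanted, a call like
the one below could be appended to the script; the argument order is assumed
from pyod's own example scripts, and plotting works here because the
generated data is 2-D:

    .. code-block:: python

       # assumed pyod convention:
       # visualize(clf_name, X_train, y_train, X_test, y_test,
       #           y_train_pred, y_test_pred, show_figure, save_figure)
       visualize('Average', X_train, y_train, X_test, y_test,
                 y_train_pred, y_test_pred, show_figure=True,
                 save_figure=False)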