Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Imbalanced data stream ensembles. #27

Merged
merged 6 commits into from
Jun 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci_scripts/travis/install.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
pip install --upgrade pytest pytest-cov codecov matplotlib coverage==4.4 coveralls tqdm numpy
pip install --upgrade pytest pytest-cov codecov matplotlib coverage==4.4 coveralls tqdm numpy imbalanced-learn
python setup.py develop
195 changes: 195 additions & 0 deletions strlearn/ensembles/KMC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
from sklearn.base import ClassifierMixin, clone
from sklearn.ensemble import BaseEnsemble
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
import numpy as np


class KMC(ClassifierMixin, BaseEnsemble):

"""
References
----------
.. [1] Wang, Yi, Yang Zhang, and Yong Wang. "Mining data streams
with skewed distribution by static classifier ensemble."
Opportunities and Challenges for Next-Generation Applied
Intelligence. Springer, Berlin, Heidelberg, 2009. 65-71.
"""

def __init__(self,
base_estimator=None,
n_estimators=10):

self.base_estimator = base_estimator
self.n_estimators = n_estimators

def fit(self, X, y):
"""Fitting."""
self.partial_fit(X, y)
return self

def partial_fit(self, X, y, classes=None):
"""Partial fitting."""
X, y = check_X_y(X, y)

if not hasattr(self, "ensemble_"):
self.ensemble_ = []
self.weights_ = []

# Check if is more than one class
if len(np.unique(y)) == 1:
raise ValueError("Only one class in data chunk.")

# Check feature consistency
if hasattr(self, "X_"):
if self.X_.shape[1] != X.shape[1]:
raise ValueError("number of features does not match")
self.X_, self.y_ = X, y

# Check classes
self.classes_ = classes
if self.classes_ is None:
self.classes_, _ = np.unique(y, return_inverse=True)

# Find minority and majority names
if not hasattr(self, "minority_name") or not hasattr(self, "majority_name"):
self.minority_name, self.majority_name = self.minority_majority_name(y)

# Resample data
res_X, res_y = self._resample(X, y)

# Train new model
new_classifier = clone(self.base_estimator).fit(res_X, res_y)

if len(self.ensemble_) < self.n_estimators:

# Append new estimator
self.ensemble_.append(new_classifier)
self.weights_.append(1)

else:

# Remove the worst model when ensemble becomes too large
auc_array = []

for i in range(len(self.ensemble_)):
y_score = self.ensemble_[i].predict_proba(res_X)
auc_array.append(roc_auc_score(res_y, y_score[:, 1]))

j = np.argmin(auc_array)

y_score = new_classifier.predict_proba(res_X)
new_auc = roc_auc_score(res_y, y_score[:, 1])

if new_auc > auc_array[j]:
self.ensemble_[j] = new_classifier
auc_array[j] = new_auc

for i in range(len(self.ensemble_)):
self.weights_[i] = auc_array[i]

def _resample(self, X, y):
minority, majority = self.minority_majority_split(X, y,
self.minority_name,
self.majority_name)

# Undersample majority array
km = KMeans(n_clusters=len(minority)).fit(X)
majority = km.cluster_centers_

res_X = np.concatenate((majority, minority), axis=0)
res_y = len(majority)*[self.majority_name] + len(minority)*[self.minority_name]

return res_X, res_y

def ensemble_support_matrix(self, X):
"""Ensemble support matrix."""
return np.array([member_clf.predict_proba(X) for member_clf in self.ensemble_])

def predict_proba(self, X):
esm = self.ensemble_support_matrix(X)
average_support = np.mean(esm, axis=0)
return average_support

def predict(self, X):
"""
Predict classes for X.

Parameters
----------
X : array-like, shape (n_samples, n_features)
The training input samples.

Returns
-------
y : array-like, shape (n_samples, )
The predicted classes.
"""

# Check is fit had been called
check_is_fitted(self, "classes_")
X = check_array(X)
if X.shape[1] != self.X_.shape[1]:
raise ValueError("number of features does not match")

esm = self.ensemble_support_matrix(X)
esm = esm * np.array(self.weights_)[:, np.newaxis, np.newaxis]
average_support = np.mean(esm, axis=0)
prediction = np.argmax(average_support, axis=1)

# Return prediction
return self.classes_[prediction]

def minority_majority_split(self, X, y, minority_name, majority_name):
"""Returns minority and majority data

Parameters
----------
X : array-like, shape = [n_samples, n_features]
The training input samples.
y : array-like, shape = [n_samples]
The target values.

Returns
-------
minority : array-like, shape = [n_samples, n_features]
Minority class samples.
majority : array-like, shape = [n_samples, n_features]
Majority class samples.
"""

minority_ma = np.ma.masked_where(y == minority_name, y)
minority = X[minority_ma.mask]

majority_ma = np.ma.masked_where(y == majority_name, y)
majority = X[majority_ma.mask]

return minority, majority

def minority_majority_name(self, y):
"""Returns the name of minority and majority class

Parameters
----------
y : array-like, shape = [n_samples]
The target values.

Returns
-------
minority_name : object
Name of minority class.
majority_name : object
Name of majority class.
"""

unique, counts = np.unique(y, return_counts=True)

if counts[0] > counts[1]:
majority_name = unique[0]
minority_name = unique[1]
else:
majority_name = unique[1]
minority_name = unique[0]

return minority_name, majority_name
195 changes: 195 additions & 0 deletions strlearn/ensembles/OUSE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
from sklearn.base import ClassifierMixin, clone
from sklearn.ensemble import BaseEnsemble
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
import numpy as np
import random


class OUSE(ClassifierMixin, BaseEnsemble):

"""
References
----------
.. [1] Gao, Jing, et al. "Classifying Data Streams with Skewed Class
Distributions and Concept Drifts." IEEE Internet Computing 12.6
(2008): 37-49.
"""

def __init__(self,
base_estimator=None,
n_estimators=10,
n_chunks=10):

self.base_estimator = base_estimator
self.n_estimators = n_estimators
self.minority_data = []
self.chunk_time_stamp = []
self.chunk_sample_proba = []
self.n_chunks = n_chunks
self.time_stamp = 1

def fit(self, X, y):
"""Fitting."""
self.partial_fit(X, y)
return self

def partial_fit(self, X, y, classes=None):
"""Partial fitting."""
X, y = check_X_y(X, y)

if not hasattr(self, "ensemble_"):
self.ensemble_ = []

# Check if is more than one class
if len(np.unique(y)) == 1:
raise ValueError("Only one class in data chunk.")

# Check feature consistency
if hasattr(self, "X_"):
if self.X_.shape[1] != X.shape[1]:
raise ValueError("number of features does not match")
self.X_, self.y_ = X, y

# Check classes
self.classes_ = classes
if self.classes_ is None:
self.classes_, _ = np.unique(y, return_inverse=True)

# Find minority and majority names
if not hasattr(self, "minority_name") or not hasattr(self, "majority_name"):
self.minority_name, self.majority_name = self.minority_majority_name(y)

new_minority = self._resample(X, y)
minority, majority = self.minority_majority_split(X, y, self.minority_name, self.majority_name)

majority_split = np.array_split(majority, self.n_estimators)

self.ensemble_ = []
for m_s in majority_split:
res_X = np.concatenate((m_s, new_minority), axis=0) # maj = self.label_encoder.inverse_transform(maj)

res_y = len(m_s)*[self.majority_name] + len(new_minority)*[self.minority_name]
new_classifier = clone(self.base_estimator).fit(res_X, res_y)
self.ensemble_.append(new_classifier)

self.time_stamp += 1

def _resample(self, X, y):
y = np.array(y)
X = np.array(X)

minority, majority = self.minority_majority_split(X, y, self.minority_name, self.majority_name)

self.minority_data.append(minority.tolist())
self.chunk_time_stamp.append(self.time_stamp)

if len(self.minority_data) > self.n_chunks:
del self.minority_data[0]
del self.chunk_time_stamp[0]

self.chunk_sample_proba = np.arange(len(self.minority_data))+1
self.chunk_sample_proba = self.chunk_sample_proba / self.chunk_sample_proba.sum()

number_of_instances = len(majority)/self.n_estimators

chunk_indexes = np.random.choice(len(self.chunk_sample_proba), int(number_of_instances), p=self.chunk_sample_proba)
cia, cca = np.unique(chunk_indexes, return_counts=True)

new_minority = []
for chunk_index, chunk_count in zip(cia, cca):
if len(self.minority_data[chunk_index]) > chunk_count:
new_minority.extend(random.sample(self.minority_data[chunk_index], chunk_count))
else:
new_minority.extend(self.minority_data[chunk_index])

return new_minority

def ensemble_support_matrix(self, X):
"""Ensemble support matrix."""
return np.array([member_clf.predict_proba(X) for member_clf in self.ensemble_])

def predict_proba(self, X):
esm = self.ensemble_support_matrix(X)
average_support = np.mean(esm, axis=0)
return average_support

def predict(self, X):
"""
Predict classes for X.

Parameters
----------
X : array-like, shape (n_samples, n_features)
The training input samples.

Returns
-------
y : array-like, shape (n_samples, )
The predicted classes.
"""

# Check is fit had been called
check_is_fitted(self, "classes_")
X = check_array(X)
if X.shape[1] != self.X_.shape[1]:
raise ValueError("number of features does not match")

esm = self.ensemble_support_matrix(X)
average_support = np.mean(esm, axis=0)
prediction = np.argmax(average_support, axis=1)

# Return prediction
return self.classes_[prediction]

def minority_majority_split(self, X, y, minority_name, majority_name):
"""Returns minority and majority data

Parameters
----------
X : array-like, shape = [n_samples, n_features]
The training input samples.
y : array-like, shape = [n_samples]
The target values.

Returns
-------
minority : array-like, shape = [n_samples, n_features]
Minority class samples.
majority : array-like, shape = [n_samples, n_features]
Majority class samples.
"""

minority_ma = np.ma.masked_where(y == minority_name, y)
minority = X[minority_ma.mask]

majority_ma = np.ma.masked_where(y == majority_name, y)
majority = X[majority_ma.mask]

return minority, majority

def minority_majority_name(self, y):
"""Returns the name of minority and majority class

Parameters
----------
y : array-like, shape = [n_samples]
The target values.

Returns
-------
minority_name : object
Name of minority class.
majority_name : object
Name of majority class.
"""

unique, counts = np.unique(y, return_counts=True)

if counts[0] > counts[1]:
majority_name = unique[0]
minority_name = unique[1]
else:
majority_name = unique[1]
minority_name = unique[0]

return minority_name, majority_name
Loading