In [2]:
# Import necessary libraries
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load a sample dataset (Iris dataset in this example)
data = load_iris()
X, y = data.data, data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a base classifier (Decision Tree in this example)
base_classifier = DecisionTreeClassifier(random_state=42)

# Create a BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Fit the BaggingClassifier on the training data
bagging_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bagging_classifier.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Create a synthetic dataset (binary classification)
np.random.seed(42)
X = np.random.rand(100, 2)  # 100 samples with 2 features
y = (X[:, 0] + X[:, 1] > 1).astype(int)  # Binary target variable

# Number of base classifiers (trees) in the ensemble
num_classifiers = 5

# Number of data points to use for each bootstrap sample
bootstrap_size = len(X)

# Create an ensemble of Decision Tree classifiers
classifiers = []

for _ in range(num_classifiers):
    # Create a bootstrap sample by randomly selecting data points with replacement
    bootstrap_indices = np.random.choice(range(bootstrap_size), size=bootstrap_size, replace=True)
    X_bootstrap = X[bootstrap_indices]
    y_bootstrap = y[bootstrap_indices]
    
    # Create and train a Decision Tree classifier on the bootstrap sample
    classifier = DecisionTreeClassifier()
    classifier.fit(X_bootstrap, y_bootstrap)
    
    # Append the trained classifier to the ensemble
    classifiers.append(classifier)

# Make predictions using each classifier and aggregate the results by majority voting
def ensemble_predict(classifiers, X):
    predictions = np.array([classifier.predict(X) for classifier in classifiers])
    # Use np.mean to obtain the majority vote (mode) of the predictions
    ensemble_predictions = np.mean(predictions, axis=0)
    return ensemble_predictions.round().astype(int)

# Test the ensemble on a new dataset
new_data_point = np.array([[0.6, 0.4]])  # A new data point to classify
ensemble_prediction = ensemble_predict(classifiers, new_data_point)
print("Ensemble Prediction:", ensemble_prediction)


Ensemble Prediction: [1]


In [4]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Create a synthetic dataset (binary classification)
np.random.seed(42)
X = np.random.rand(100, 2)  # 100 samples with 2 features
y = (X[:, 0] + X[:, 1] > 1).astype(int)  # Binary target variable

# Number of base classifiers (trees) in the ensemble
num_classifiers = 5

# Create an ensemble of Random Forest classifiers
classifiers = []

for _ in range(num_classifiers):
    # Create and train a Random Forest classifier
    classifier = RandomForestClassifier()
    classifier.fit(X, y)  # No need to create bootstrap samples
    
    # Append the trained classifier to the ensemble
    classifiers.append(classifier)

# Make predictions using each classifier and aggregate the results by majority voting
def ensemble_predict(classifiers, X):
    predictions = np.array([classifier.predict(X) for classifier in classifiers])
    # Use np.mean to obtain the majority vote (mode) of the predictions
    ensemble_predictions = np.mean(predictions, axis=0)
    return ensemble_predictions.round().astype(int)

# Test the ensemble on a new dataset
new_data_point = np.array([[0.6, 0.4]])  # A new data point to classify
ensemble_prediction = ensemble_predict(classifiers, new_data_point)
print("Ensemble Prediction:", ensemble_prediction)


Ensemble Prediction: [1]


In [5]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class RandomForestClassifierCustom:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.ensemble = []

    def fit(self, X, y):
        n_samples, n_features = X.shape

        for _ in range(self.n_estimators):
            # Create a bootstrap sample
            bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_bootstrap = X[bootstrap_indices]
            y_bootstrap = y[bootstrap_indices]

            # Train a decision tree on the bootstrap sample
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_bootstrap, y_bootstrap)

            # Add the trained tree to the ensemble
            self.ensemble.append(tree)

    def predict(self, X):
        # Make predictions using each tree in the ensemble
        predictions = np.array([tree.predict(X) for tree in self.ensemble])

        # Aggregate predictions by majority voting
        ensemble_predictions = np.mean(predictions, axis=0)
        return (ensemble_predictions >= 0.5).astype(int)

# Example usage
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # Load a sample dataset (Iris dataset in this example)
    data = load_iris()
    X, y = data.data, (data.target == 2).astype(int)  # Binary classification

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create and train the custom Random Forest classifier
    custom_rf = RandomForestClassifierCustom(n_estimators=100)
    custom_rf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = custom_rf.predict(X_test)

    # Calculate and print the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)


Accuracy: 1.0


In [6]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class RandomForestClassifierCustom:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.ensemble = []
        self.oob_predictions = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.oob_predictions = np.zeros((n_samples, self.n_estimators))

        for i in range(self.n_estimators):
            # Create a bootstrap sample
            bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_bootstrap = X[bootstrap_indices]
            y_bootstrap = y[bootstrap_indices]

            # Train a decision tree on the bootstrap sample
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_bootstrap, y_bootstrap)

            # Add the trained tree to the ensemble
            self.ensemble.append(tree)

            # Record OOB predictions for this tree
            oob_indices = np.setdiff1d(np.arange(n_samples), bootstrap_indices)
            oob_predictions_i = tree.predict(X[oob_indices])
            self.oob_predictions[oob_indices, i] = oob_predictions_i

    def predict(self, X):
        # Make predictions using each tree in the ensemble
        predictions = np.array([tree.predict(X) for tree in self.ensemble])

        # Aggregate predictions by majority voting
        ensemble_predictions = np.mean(predictions, axis=0)
        return (ensemble_predictions >= 0.5).astype(int)

    def oob_error(self, X, y):
        # Calculate the OOB error
        predicted_classes = np.round(self.oob_predictions.mean(axis=1)).astype(int)
        return 1.0 - np.mean(predicted_classes == y)

# Example usage
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    # Load a sample dataset (Iris dataset in this example)
    data = load_iris()
    X, y = data.data, (data.target == 2).astype(int)  # Binary classification

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create and train the custom Random Forest classifier
    custom_rf = RandomForestClassifierCustom(n_estimators=100)
    custom_rf.fit(X_train, y_train)

    # Calculate and print the out-of-bag error
    oob_error = custom_rf.oob_error(X_train, y_train)
    print("Out-of-Bag Error:", oob_error)


Out-of-Bag Error: 0.35238095238095235


In [7]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Generate a synthetic imbalanced dataset
X, y = make_classification(n_samples=1000, n_features=20, weights=[0.9, 0.1], random_state=42)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calculate class weights (inversely proportional to class frequencies)
class_weights = {0: 1.0, 1: 10.0}  # Adjust the weights as needed based on the class imbalance

# Create a Random Forest classifier with class weights
rf_classifier = RandomForestClassifier(class_weight=class_weights, random_state=42)

# Train the model using oversampling with class weights
# This will create synthetic samples for the minority class
X_train_oversampled, y_train_oversampled = X_train, y_train  # Initialize with the original training data

# Find the minority class samples
minority_class_indices = np.where(y_train == 1)[0]

# Oversample the minority class to match the majority class size
oversampling_factor = int(class_weights[0] / class_weights[1])
for _ in range(oversampling_factor - 1):
    X_train_oversampled = np.vstack((X_train_oversampled, X_train[minority_class_indices]))
    y_train_oversampled = np.hstack((y_train_oversampled, y_train[minority_class_indices]))

# Train the model on the oversampled training data
rf_classifier.fit(X_train_oversampled, y_train_oversampled)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.94      0.99      0.96       270
           1       0.78      0.47      0.58        30

    accuracy                           0.93       300
   macro avg       0.86      0.73      0.77       300
weighted avg       0.93      0.93      0.93       300



In [9]:
import numpy as np
import pandas as pd

def process_classweights(minority_class, data, balance_percentage):
    """
    Balances a dataset by oversampling the minority class to a specified percentage.

    Parameters:
    - minority_class: int
        The class label of the minority class.
    - data: numpy array or pandas DataFrame
        The input dataset with features and labels.
    - balance_percentage: float (0 to 1)
        The desired percentage of the minority class samples in the balanced dataset.

    Returns:
    - balanced_data: numpy array or pandas DataFrame
        The balanced dataset with the specified percentage of the minority class.
    """
    # Separate the dataset into features (X) and labels (y)
    if isinstance(data, pd.DataFrame):
        X = data.drop(columns='label').values
        y = data['label'].values
    elif isinstance(data, np.ndarray):
        X = data[:, :-1]
        y = data[:, -1]
    else:
        raise ValueError("Unsupported data type. Use a numpy array or pandas DataFrame.")

    # Find indices of the minority class samples
    minority_indices = np.where(y == minority_class)[0]

    # Calculate the number of minority class samples to achieve the desired balance percentage
    num_minority_samples_needed = int(balance_percentage * len(minority_indices) / (1 - balance_percentage))

    # Oversample the minority class to meet the desired balance percentage
    oversampled_indices = np.random.choice(minority_indices, size=num_minority_samples_needed, replace=True)
    
    # Combine the oversampled minority class samples with the majority class samples
    balanced_indices = np.concatenate((oversampled_indices, np.where(y != minority_class)[0]))

    # Create the balanced dataset
    if isinstance(data, pd.DataFrame):
        balanced_data = data.iloc[balanced_indices]
    elif isinstance(data, np.ndarray):
        balanced_data = np.hstack((X[balanced_indices], y[balanced_indices].reshape(-1, 1)))
    else:
        balanced_data = None
    
    return balanced_data

# Example usage:
X, y = make_classification(n_samples=1000, n_features=20, weights=[0.9, 0.1], random_state=42)
balanced_data = process_classweights(minority_class=1, data=X, balance_percentage=0.5)


In [10]:
balanced_data

array([[-0.6693561 , -0.19806908, -0.87076638, ..., -1.26733697,
         0.26173564,  1.01664321],
       [ 0.09337237,  0.78584826,  0.10575379, ..., -0.12270893,
         0.6934308 ,  0.91136272],
       [-0.90579721,  1.03575674,  0.29514098, ...,  0.83049813,
         0.95404926, -0.5782121 ],
       ...,
       [-0.20013455, -1.46108168,  1.79701652, ..., -1.50280171,
        -1.27473745,  1.60111869],
       [ 0.03935575,  0.24868361, -0.47532342, ...,  0.09912579,
         0.54269228,  1.20827474],
       [ 0.76921528,  0.47076539,  0.16994471, ...,  0.6561162 ,
         0.64333186, -2.02100232]])

In [11]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Generate a synthetic dataset
X, y = make_classification(n_samples=1000, n_features=20, weights=[0.9, 0.1], random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)

# Train the Random Forest model
rf_classifier.fit(X, y)

# Access the OOB error score
oob_error = 1 - rf_classifier.oob_score_
print(f"Out-of-Bag Error: {oob_error:.2%}")


Out-of-Bag Error: 4.30%
