In [None]:
# Mount Google Drive so that the dataset and model paths under /content/drive/MyDrive
# used by the cells below resolve. Only applicable when running on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Grid Search CV

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from joblib import dump

# Load the datasets.
# NOTE(review): the validation and test splits are loaded and split below but are not
# used by the search in this cell; they are kept as module-level names.
train_data = pd.read_csv('/content/drive/MyDrive/D2/training.csv')
val_data = pd.read_csv('/content/drive/MyDrive/D2/validation.csv')
test_data = pd.read_csv('/content/drive/MyDrive/D2/testmod.csv')

# Separate features and labels: the raw text column is the only feature.
X_train = train_data['statement']
y_train = train_data['label']
X_val = val_data['statement']
y_val = val_data['label']
X_test = test_data['statement']
y_test = test_data['label']

# Define the pipeline: TF-IDF features (vocabulary capped at 1000 terms) feeding a
# random forest. random_state is fixed so that re-running the cell tunes and saves the
# same forest, consistent with the seeded forests in the other tuning cells.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('rf', RandomForestClassifier(random_state=42))
])

# Define the parameter grid (2 * 3 * 2 * 2 = 24 candidates); the 'rf__' prefix routes
# each parameter to the 'rf' pipeline step.
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2]
}

# Setup GridSearchCV: 5-fold cross-validation scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model (the vectorizer is refit inside every fold because it is a pipeline step).
grid_search.fit(X_train, y_train)

# Save the best model: the whole refit pipeline (vectorizer + forest) is persisted.
dump(grid_search.best_estimator_, '/content/drive/MyDrive/D2/D2_Random/grid_rf_model.pkl')


['/content/drive/MyDrive/D2/D2_Random/grid_rf_model.pkl']

# Bayesian Optimization

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.2


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from skopt import BayesSearchCV
from skopt.space import Integer
from joblib import dump

# Read the three CSV splits from Drive (train, validation, test - in that order).
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/D2/training.csv',
        '/content/drive/MyDrive/D2/validation.csv',
        '/content/drive/MyDrive/D2/testmod.csv',
    )
)

# 'label' is the target; every remaining column is treated as a feature.
y_train, X_train = train_data['label'], train_data.drop(columns=['label'])
y_val, X_val = val_data['label'], val_data.drop(columns=['label'])
y_test, X_test = test_data['label'], test_data.drop(columns=['label'])

# The search cross-validates internally, so train and validation rows are pooled.
X_train_combined = pd.concat([X_train, X_val])
y_train_combined = pd.concat([y_train, y_val])

# Route columns by dtype: numeric columns are standardised, object columns are one-hot
# encoded. handle_unknown='ignore' keeps unseen categories from raising at predict time.
# NOTE(review): any free-text object column is one-hot encoded as a whole string here -
# confirm that this is intended.
categorical_features = X_train_combined.select_dtypes(include=['object']).columns
numeric_features = X_train_combined.select_dtypes(include=['int64', 'float64']).columns
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Seeded forest plus the integer ranges explored for it; the 'classifier__' prefix
# targets the 'classifier' pipeline step.
rf_model = RandomForestClassifier(random_state=42)
param_space = dict(
    classifier__n_estimators=Integer(100, 300),
    classifier__max_depth=Integer(10, 30),
    classifier__min_samples_split=Integer(2, 10),
    classifier__min_samples_leaf=Integer(1, 4),
)

# Preprocessing and classifier chained so both are refit inside each CV fold.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

# Bayesian search: 30 sampled configurations, 3-fold CV, all cores, seeded.
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    n_iter=30,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
bayes_search.fit(X_train_combined, y_train_combined)

# Persist the refit best pipeline, then score it once on the untouched test split.
model_save_path = '/content/drive/MyDrive/D2/D2_Random/bayes_rf_model.pkl'
best_bayes_model = bayes_search.best_estimator_
dump(best_bayes_model, model_save_path)

y_pred = best_bayes_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

# Population Based Training (PBT)

In [None]:
!pip install ray[tune]



In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining
from joblib import dump
import ray

# Start the local Ray runtime; ignore_reinit_error makes re-running this cell harmless.
ray.init(ignore_reinit_error=True)

# Read the three CSV splits from Drive (replace paths with your actual paths).
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/D2/training.csv',
        '/content/drive/MyDrive/D2/validation.csv',
        '/content/drive/MyDrive/D2/testmod.csv',
    )
)

# 'label' is the target; every remaining column is treated as a feature.
y_train, X_train = train_data['label'], train_data.drop(columns=['label'])
y_val, X_val = val_data['label'], val_data.drop(columns=['label'])
y_test, X_test = test_data['label'], test_data.drop(columns=['label'])

# Column routing is decided from the training split's dtypes only.
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# The preprocessor is fitted on train + validation rows (train rows first, validation
# rows last); the test split is only transformed.
X_train_combined = pd.concat([X_train, X_val])
X_train_combined_processed = preprocessor.fit_transform(X_train_combined)
X_test_processed = preprocessor.transform(X_test)

# Place the large objects in Ray's object store; trials fetch them with ray.get.
X_train_combined_ref = ray.put(X_train_combined_processed)
y_train_combined_ref = ray.put(pd.concat([y_train, y_val]))

# Initial hyperparameter distributions sampled for each trial.
param_space = dict(
    n_estimators=tune.randint(100, 300),
    max_depth=tune.randint(10, 30),
    min_samples_split=tune.randint(2, 10),
    min_samples_leaf=tune.randint(1, 4),
)

# Resampling callables handed to PBT for perturbing a trial's hyperparameters; they
# draw from the same integer ranges as param_space.
mutation_samplers = {
    "n_estimators": lambda: np.random.randint(100, 300),
    "max_depth": lambda: np.random.randint(10, 30),
    "min_samples_split": lambda: np.random.randint(2, 10),
    "min_samples_leaf": lambda: np.random.randint(1, 4),
}

# PBT scheduler maximising 'mean_accuracy', perturbing every 5 training iterations.
pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="mean_accuracy",
    mode="max",
    perturbation_interval=5,
    hyperparam_mutations=mutation_samplers,
)

# Define the objective function
def train_model(config):
    """Train one random forest for a Tune trial and score it on held-out validation rows.

    Args:
        config: dict of RandomForestClassifier keyword arguments sampled by Tune
            (n_estimators, max_depth, min_samples_split, min_samples_leaf).

    Returns:
        dict with a single key, "mean_accuracy": accuracy on the validation rows.

    The combined feature matrix and label series in the object store were both built as
    concat([train, val]), so the validation rows are the trailing len(y_val) rows. The
    forest is fitted on the leading (training) rows only: fitting on the full combined
    data and then scoring on validation rows it has already seen would leak the labels
    and inflate the metric used to pick the best trial.
    """
    # Get the data from the object store
    X_all = ray.get(X_train_combined_ref)
    y_all = ray.get(y_train_combined_ref)

    # Split the combined data back into its training and validation parts
    n_train = len(y_all) - len(y_val)
    X_fit, y_fit = X_all[:n_train], y_all.iloc[:n_train]
    X_holdout, y_holdout = X_all[n_train:], y_all.iloc[n_train:]

    # Build the model from the trial's hyperparameters (seeded for repeatability)
    rf_model = RandomForestClassifier(**config, random_state=42)

    # Fit on training rows only
    rf_model.fit(X_fit, y_fit)

    # Evaluate on the validation rows, which the forest has not seen
    y_val_pred = rf_model.predict(X_holdout)
    accuracy = accuracy_score(y_holdout, y_val_pred)

    return {"mean_accuracy": accuracy}

# Perform Population Based Training: 10 trials sampled from param_space.
# NOTE(review): train_model returns a single result, so each trial reports exactly one
# training iteration (see the "completed after 1 iterations" log); the scheduler's
# perturbation_interval of 5 is therefore never reached - confirm whether that is intended.
analysis = tune.run(
    train_model,
    config=param_space,
    scheduler=pbt_scheduler,
    stop={"training_iteration": 5},
    num_samples=10,
    # Only CPUs are reserved: scikit-learn's RandomForestClassifier does not use a GPU,
    # and requesting a GPU share would keep every trial PENDING on a runtime without one.
    resources_per_trial={"cpu": 2}  # Adjust based on your available resources
)

# Get the best performing trial by its last reported mean_accuracy
best_trial = analysis.get_best_trial("mean_accuracy", "max", "last")
best_config = best_trial.config

# Define the best model from the winning hyperparameters
best_model = RandomForestClassifier(**best_config, random_state=42)

# Fit the best model on the combined training and validation set
best_model.fit(X_train_combined_processed, pd.concat([y_train, y_val]))

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test_processed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

# Save the best model.
# NOTE: only the forest is saved; inputs must be passed through the fitted
# `preprocessor` before calling predict on the loaded model.
model_save_path = '/content/drive/MyDrive/D2/D2_Random/pbt_rf_model.pkl'
dump(best_model, model_save_path)

2024-07-18 07:18:18,330	INFO worker.py:1621 -- Calling ray.init() again after it has already been called.


+--------------------------------------------------------------------+
| Configuration for experiment     train_model_2024-07-18_07-18-18   |
+--------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator             |
| Scheduler                        PopulationBasedTraining           |
| Number of trials                 10                                |
+--------------------------------------------------------------------+

View detailed results here: /root/ray_results/train_model_2024-07-18_07-18-18
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-07-18_07-11-13_646063_16086/artifacts/2024-07-18_07-18-18/train_model_2024-07-18_07-18-18/driver_artifacts`

Trial status: 10 PENDING
Current time: 2024-07-18 07:18:19. Total running time: 0s
Logical resource usage: 0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------

2024-07-18 07:19:03,835	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_model_2024-07-18_07-18-18' in 0.0093s.



Trial train_model_e8294_00009 finished iteration 1 at 2024-07-18 07:19:03. Total running time: 45s
+--------------------------------------------------+
| Trial train_model_e8294_00009 result             |
+--------------------------------------------------+
| checkpoint_dir_name                              |
| time_this_iter_s                         0.46951 |
| time_total_s                             0.46951 |
| training_iteration                             1 |
| mean_accuracy                            0.54723 |
+--------------------------------------------------+

Trial train_model_e8294_00009 completed after 1 iterations at 2024-07-18 07:19:03. Total running time: 45s

Trial status: 10 TERMINATED
Current time: 2024-07-18 07:19:03. Total running time: 45s
Logical resource usage: 2.0/2 CPUs, 0.5/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------

['/content/drive/MyDrive/D2/D2_Random/pbt_rf_model.pkl']

# Genetic Algorithm (TPOT)

In [None]:
!pip install deap

Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deap
Successfully installed deap-1.4.1


In [None]:
!pip install tpot

Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn>=1.4.1 (from tpot)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
Collecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25l[?25hdone
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11938 sha256=537

In [None]:
!pip install --upgrade scikit-learn



In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tpot import TPOTClassifier
import joblib

# Load datasets
train_data = pd.read_csv('/content/drive/MyDrive/D2/training.csv')
val_data = pd.read_csv('/content/drive/MyDrive/D2/validation.csv')
test_data = pd.read_csv('/content/drive/MyDrive/D2/testmod.csv')

# Assume 'label' is the target and other columns are features
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']
X_val = val_data.drop(columns=['label'])
y_val = val_data['label']
X_test = test_data.drop(columns=['label'])
y_test = test_data['label']

# Combine training and validation data (TPOT cross-validates internally)
X_train_combined = pd.concat([X_train, X_val])
y_train_combined = pd.concat([y_train, y_val])

# Preprocessing: standardise numeric columns, one-hot encode object columns
numeric_features = X_train_combined.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train_combined.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': categories seen only in the test split are encoded as
        # all zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Encode the features up front. Previously the preprocessor was built but never applied,
# so TPOT received the raw frame, object columns included, which a random forest cannot fit.
# NOTE(review): the encoded matrix may be sparse; TPOT warns for sparse input with a
# custom config_dict - confirm the run on real data.
X_train_combined_processed = preprocessor.fit_transform(X_train_combined)

# Define the TPOTClassifier for Genetic Algorithm optimization, restricted to random
# forests with the listed hyperparameter choices
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, random_state=42, config_dict={
    'sklearn.ensemble.RandomForestClassifier': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
})

# Fit the TPOT classifier on the encoded features
tpot.fit(X_train_combined_processed, y_train_combined)

# Chain the fitted preprocessor with TPOT's fitted pipeline so the resulting model
# accepts raw feature frames, both below and after being reloaded from disk
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', tpot.fitted_pipeline_)])

# Get the best pipeline
best_pipeline = pipeline

# Save the best model
model_save_path = '/content/drive/MyDrive/D2/D2_Random/genetic_rf_model.pkl'
joblib.dump(best_pipeline, model_save_path)

# Evaluate the best model on the test set
y_pred = best_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")


ImportError: cannot import name '_fit_context' from 'sklearn.base' (/usr/local/lib/python3.10/dist-packages/sklearn/base.py)

# Hyperband

In [None]:
!pip install hyperopt

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from ray import tune
from ray.tune.schedulers import HyperBandScheduler
from ray.tune.sklearn import TuneSearchCV
import joblib

# Load datasets
train_data = pd.read_csv('/content/drive/MyDrive/D2/training.csv')
val_data = pd.read_csv('/content/drive/MyDrive/D2/validation.csv')
test_data = pd.read_csv('/content/drive/MyDrive/D2/testmod.csv')

# Assume 'label' is the target and other columns are features
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']
X_val = val_data.drop(columns=['label'])
y_val = val_data['label']
X_test = test_data.drop(columns=['label'])
y_test = test_data['label']

# Combine training and validation data (the search cross-validates internally)
X_train_combined = pd.concat([X_train, X_val])
y_train_combined = pd.concat([y_train, y_val])

# Preprocessing: standardise numeric columns, one-hot encode object columns
numeric_features = X_train_combined.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train_combined.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': categories that appear only in a CV validation fold or
        # in the test split are encoded as all zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the parameter search space; the 'classifier__' prefix targets the
# 'classifier' pipeline step.
# NOTE(review): with early stopping on a warm-started ensemble, tune-sklearn may manage
# n_estimators itself - confirm that tuning it here has an effect.
param_space = {
    'classifier__n_estimators': tune.randint(100, 300),
    'classifier__max_depth': tune.randint(10, 30),
    'classifier__min_samples_split': tune.randint(2, 10),
    'classifier__min_samples_leaf': tune.randint(1, 4)
}

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Define the Hyperband scheduler. HyperBandScheduler takes no `grace_period` argument
# (that belongs to the asynchronous ASHA variant), so only max_t is passed.
# NOTE(review): max_t=50 exceeds max_iters=10 below - confirm which cap is intended.
scheduler = HyperBandScheduler(max_t=50)

# Initialize TuneSearchCV with Hyperband. A scheduler instance is supplied through
# `early_stopping`; TuneSearchCV has no `scheduler` keyword.
tune_search = TuneSearchCV(
    estimator=pipeline,
    param_distributions=param_space,
    n_trials=30,
    early_stopping=scheduler,
    max_iters=10,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=2
)

# Fit the model
tune_search.fit(X_train_combined, y_train_combined)

# Save the best model (the refit pipeline: preprocessor + forest)
model_save_path = '/content/drive/MyDrive/D2/D2_Random/hyperband_rf_model.pkl'
joblib.dump(tune_search.best_estimator_, model_save_path)

# Evaluate the best model on the test set
y_pred = tune_search.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

# BERT + Grid Search CV ensemble

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from joblib import load

# Load the BERT model: build the bert-base-uncased architecture with a 2-label head,
# then overwrite its weights with the saved state dict.
bert_model_path = '/content/drive/MyDrive/D2/bert_model/bert_model.pth'
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# map_location='cpu' lets a checkpoint saved from CUDA tensors load on a CPU-only
# runtime; the model is never moved to a GPU in this cell, so CPU tensors are what it
# needs anyway.
bert_model.load_state_dict(torch.load(bert_model_path, map_location='cpu'))

# Ensure BERT model is in evaluation mode (disables dropout)
bert_model.eval()

# Load the tokenizer matching the base checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the best Random Forest model (the TF-IDF + forest pipeline saved by the grid search).
# SECURITY: joblib.load unpickles arbitrary objects - only load artifacts you created.
rf_model = load('/content/drive/MyDrive/D2/D2_Random/grid_rf_model.pkl')

# Function to get BERT predictions in batches
def get_bert_predictions_batch(model, data, batch_size=32):
    """Predict a class index for every text in `data`, one mini-batch at a time.

    Args:
        model: sequence classifier; called with the tokenizer's output as keyword
            arguments and expected to return an object with a `.logits` tensor.
        data: sliceable collection of texts whose slices expose `.tolist()`
            (e.g. a pandas Series).
        batch_size: number of texts tokenized and scored per forward pass.

    Returns:
        list with one predicted class index (argmax over the logits) per input text,
        in input order.

    Relies on the module-level `tokenizer`.
    """
    model.eval()
    collected = []
    total = len(data)
    for start in range(0, total, batch_size):
        texts = data[start:start + batch_size].tolist()
        # Pad to the longest text in the batch and truncate to BERT's 512-token limit.
        encoded = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            logits = model(**encoded).logits
        batch_labels = logits.argmax(dim=1).cpu().numpy()
        collected.extend(batch_labels)
    return collected

# Both models must score the same texts, so the statement column is taken explicitly
# from test_data. The previous code fed the forest the module-level X_test, which
# several earlier cells rebind to a DataFrame; a text vectorizer iterates a DataFrame's
# column labels rather than its rows, so the two models would see different inputs.
test_statements = test_data['statement']

# Get BERT predictions for the test data
bert_predictions = get_bert_predictions_batch(bert_model, test_statements)

# Get Random Forest predictions from the saved pipeline's two steps
X_test_tfidf = rf_model.named_steps['tfidf'].transform(test_statements)
rf_predictions = rf_model.named_steps['rf'].predict(X_test_tfidf)

# Combine predictions. With two voters and 0/1 labels, "mean > 0.5" is a logical AND:
# the ensemble outputs 1 only when both models predict 1, and 0 on any disagreement.
final_predictions = (bert_predictions + rf_predictions) / 2
final_predictions = (final_predictions > 0.5).astype(int)

# Evaluate the ensemble model against the test labels
accuracy = accuracy_score(test_data['label'], final_predictions)
print(f'Ensemble Model Accuracy: {accuracy}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ensemble Model Accuracy: 0.3898973954222573


# Testing with sampling

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from joblib import load
import numpy as np

# Load the BERT model: build the bert-base-uncased architecture with a 2-label head,
# then overwrite its weights with the saved state dict.
bert_model_path = '/content/drive/MyDrive/D2/bert_model/bert_model.pth'
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# map_location='cpu' lets a checkpoint saved from CUDA tensors load on a CPU-only
# runtime; the model is never moved to a GPU in this cell, so CPU tensors are what it
# needs anyway.
bert_model.load_state_dict(torch.load(bert_model_path, map_location='cpu'))

# Ensure BERT model is in evaluation mode (disables dropout)
bert_model.eval()

# Load the tokenizer matching the base checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the best Random Forest model (the TF-IDF + forest pipeline saved by the grid search).
# SECURITY: joblib.load unpickles arbitrary objects - only load artifacts you created.
rf_model = load('/content/drive/MyDrive/D2/D2_Random/grid_rf_model.pkl')

# Function to get BERT predictions in batches with subset sampling
def get_bert_predictions_batch(model, data, batch_size=32, subset_size=0.1, random_state=None):
    """Predict class indices for a random subset of `data`, one mini-batch at a time.

    Args:
        model: sequence classifier; called with the tokenizer's output as keyword
            arguments and expected to return an object with a `.logits` tensor.
        data: pandas Series of texts (positional `.iloc` indexing is used).
        batch_size: number of texts tokenized and scored per forward pass.
        subset_size: fraction of `data` to sample, without replacement.
        random_state: optional integer seed. When None (the default) the draw comes
            from NumPy's global RNG, as before; when given, a private seeded generator
            is used so that the same subset is drawn on every call.

    Returns:
        (predictions, indices): a NumPy array of predicted class indices and the array
        of positional indices into `data` that were sampled, in matching order.

    Raises:
        ValueError: from NumPy if more samples are requested than `data` contains.

    Relies on the module-level `tokenizer`.
    """
    model.eval()
    predictions = []
    num_samples = int(len(data) * subset_size)
    # np.random (global state) keeps the original behaviour; a RandomState gives an
    # isolated, repeatable stream when a seed is supplied.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = rng.choice(len(data), num_samples, replace=False)
    sampled_data = data.iloc[indices]
    for i in range(0, len(sampled_data), batch_size):
        # .iloc keeps the slice positional regardless of the Series' (shuffled) index.
        batch_data = sampled_data.iloc[i:i+batch_size].tolist()
        # Pad to the longest text in the batch and truncate to BERT's 512-token limit.
        inputs = tokenizer(batch_data, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(batch_predictions)
    return np.array(predictions), indices

# Seed NumPy's global RNG so the sampled subset (drawn inside
# get_bert_predictions_batch) and the reported accuracy are the same on every run.
np.random.seed(42)

# Both models must score the same texts, so the statement column is taken explicitly
# from test_data. The previous code indexed the module-level X_test, which several
# earlier cells rebind to a DataFrame; a text vectorizer iterates a DataFrame's column
# labels rather than its rows, so the two models would see different inputs.
test_statements = test_data['statement']

# Get BERT predictions for a random subset of the test data
bert_predictions, sampled_indices = get_bert_predictions_batch(bert_model, test_statements)

# Get Random Forest predictions for the same subset
X_test_sampled = rf_model.named_steps['tfidf'].transform(test_statements.iloc[sampled_indices])
rf_predictions = rf_model.named_steps['rf'].predict(X_test_sampled)

# Combine predictions. With two voters and 0/1 labels, "mean > 0.5" is a logical AND:
# the ensemble outputs 1 only when both models predict 1, and 0 on any disagreement.
final_predictions = (bert_predictions + rf_predictions) / 2
final_predictions = (final_predictions > 0.5).astype(int)

# Evaluate the ensemble model on the labels of the sampled rows
accuracy = accuracy_score(test_data['label'].iloc[sampled_indices], final_predictions)
print(f'Ensemble Model Accuracy: {accuracy}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ensemble Model Accuracy: 0.3888888888888889
