# Experiment 4: Handling Imbalanced Data

## Configuration

In [1]:
import getpass
import os

import mlflow

# MLflow reads the tracking-server credentials from these two environment
# variables. Secrets must never be written into the notebook: values already
# present in the environment are kept, and a missing token is requested
# interactively so it is never stored in the file or its outputs.
# NOTE(security): an access token was previously hardcoded in this cell. Treat
# it as leaked: revoke it in DagsHub and generate a new one.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "yahiaehab10")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass(
        "DagsHub access token: "
    ).strip()

# Set the correct MLflow tracking URI for DagsHub
mlflow.set_tracking_uri(
    "https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow"
)

# Select the experiment for every run started below (MLflow creates it on
# first use); as the last expression, the Experiment object is displayed.
mlflow.set_experiment("Handling Imbalanced Data")

2025/09/20 14:27:32 INFO mlflow.tracking.fluent: Experiment with name 'Handling Imbalanced Data' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/239177cda3e940598b5be08c122452da', creation_time=1758367652296, experiment_id='3', last_update_time=1758367652296, lifecycle_stage='active', name='Handling Imbalanced Data', tags={}>

## Experiment

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os

import mlflow.sklearn
import mlflow

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [3]:
# Load the preprocessed Reddit comments, then discard rows that have no cleaned
# text, since those cannot be vectorized later on.
df = pd.read_csv("processed_reddit_data.csv")
df = df.dropna(subset=["clean_comment"])
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [5]:
def run_imbalanced_experiment(imbalance_method):
    """Train and log one RandomForest run for a single imbalance-handling strategy.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    makes a stratified 80/20 train/test split, vectorizes the text with TF-IDF,
    optionally rebalances the *training* portion only, fits a
    ``RandomForestClassifier`` and logs parameters, metrics, a confusion-matrix
    figure and the fitted model to MLflow.

    Parameters
    ----------
    imbalance_method : str
        One of ``"none"``, ``"class_weight"``, ``"smote"``, ``"adasyn"``,
        ``"undersample"`` or ``"smoteenn"``.

    Raises
    ------
    ValueError
        If ``imbalance_method`` is not a supported name. Previously a typo was
        silently run (and logged) as if no rebalancing had been requested.
    """
    supported_methods = (
        "none",
        "class_weight",
        "smote",
        "adasyn",
        "undersample",
        "smoteenn",
    )
    # Validate before doing any expensive work or opening an MLflow run.
    if imbalance_method not in supported_methods:
        raise ValueError(
            f"Unknown imbalance_method {imbalance_method!r}; "
            f"expected one of {supported_methods}"
        )

    # Configurations
    ngrams = (1, 3)
    max_features = 10000
    n_estimators = 100
    max_depth = 20
    random_state = 42  # single seed shared by the split, the samplers and the model

    # Split the raw text BEFORE vectorizing. Fitting TF-IDF on the full corpus
    # lets the test comments influence the vocabulary and IDF weights, which
    # leaks test information into training and inflates the reported metrics.
    text_train, text_test, y_train, y_test = train_test_split(
        df["clean_comment"],
        df["category"],
        test_size=0.2,
        random_state=random_state,
        stratify=df["category"],
    )

    # Vectorization: learn vocabulary/IDF on the training text only, then apply
    # that fitted transform unchanged to the held-out text.
    vectorizer = TfidfVectorizer(ngram_range=ngrams, max_features=max_features)
    X_train = vectorizer.fit_transform(text_train).toarray()
    X_test = vectorizer.transform(text_test).toarray()

    # Handle imbalance. "class_weight" reweights inside the model; the sampler
    # methods rewrite the training set; "none" is the untouched baseline.
    class_weights = "balanced" if imbalance_method == "class_weight" else None
    sampler_classes = {
        "smote": SMOTE,
        "adasyn": ADASYN,
        "undersample": RandomUnderSampler,
        "smoteenn": SMOTEENN,
    }
    sampler_class = sampler_classes.get(imbalance_method)
    if sampler_class is not None:
        # Only the training data is resampled; the test set keeps the original
        # class distribution so the metrics reflect real-world balance.
        sampler = sampler_class(random_state=random_state)
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    # Define and train the model
    with mlflow.start_run():
        # Set tags
        mlflow.set_tag("mlflow.runName", f"Imbalance Method: {imbalance_method}")
        mlflow.set_tag("experiment_type", "Handling Imbalanced Data")
        mlflow.set_tag("model_type", "RandomForestClassifier")
        mlflow.set_tag(
            "description", "Handling class imbalance using various techniques"
        )

        # Log all parameters in one batched call (one request to the remote
        # tracking server instead of one per parameter).
        mlflow.log_params(
            {
                "vectorizer_type": "TF-IDF",
                "ngram_range": ngrams,
                "max_features": max_features,
                "n_estimators": n_estimators,
                "max_depth": max_depth,
                "random_state": random_state,
                "imbalance_method": imbalance_method,
            }
        )

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
            class_weight=class_weights,
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Collect metrics: overall accuracy plus every per-class entry of the
        # classification report (the aggregate rows are skipped).
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        run_metrics = {
            "accuracy": accuracy,
            "accuracy_report": classification_rep["accuracy"],
        }
        for label, metrics in classification_rep.items():
            if label not in ["accuracy", "macro avg", "weighted avg"]:
                for metric_name, metric_value in metrics.items():
                    run_metrics[f"{label}_{metric_name}"] = metric_value
        mlflow.log_metrics(run_metrics)

        # Log Confusion Matrix. The label order is fixed explicitly and reused
        # for the tick labels so the axes show the real class values rather
        # than positional indices 0..k-1.
        class_labels = np.union1d(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt="d",
                cmap="Blues",
                cbar=False,
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            # log_figure uploads the figure directly, so no temporary PNG is
            # left in the working directory if the upload fails.
            mlflow.log_figure(fig, f"confusion_matrix_{imbalance_method}.png")
        finally:
            # Always release the figure; this function is called in a loop.
            plt.close(fig)

        # Log the model
        mlflow.sklearn.log_model(model, "random_forest_model")


# Launch one tracked run per imbalance-handling strategy, starting with the
# untreated baseline ("none") so the other runs can be compared against it.
imbalance_methods = [
    "none", "class_weight",
    "smote", "adasyn",
    "undersample", "smoteenn",
]
for method in imbalance_methods:
    run_imbalanced_experiment(method)



🏃 View run Imbalance Method: none at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3/runs/80d19457e683419da6abcd3403303c7c
🧪 View experiment at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3




🏃 View run Imbalance Method: class_weight at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3/runs/9ba7e38a1af94dcc9e2ff7770a32ca1a
🧪 View experiment at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3




🏃 View run Imbalance Method: smote at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3/runs/4dcb123ffbb24a24ba9e53fa3e07acaf
🧪 View experiment at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3




🏃 View run Imbalance Method: adasyn at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3/runs/c5955411026747f38edaeef3d3a068e9
🧪 View experiment at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3




🏃 View run Imbalance Method: undersample at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3/runs/8247e6471ded400885b6ac636be9db30
🧪 View experiment at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3




🏃 View run Imbalance Method: smoteenn at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3/runs/d12a16faa5ce4d5ab5692a342d529ddc
🧪 View experiment at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/3
