# Experiment (4) XGBoost with HyperParam Tuning

Configuration

In [9]:
import mlflow
import os

# Set DagsHub credentials (you'll need to set these environment variables)
os.environ["MLFLOW_TRACKING_USERNAME"] = (
    "yahiaehab10"  # Replace with your DagsHub username
)
os.environ["MLFLOW_TRACKING_PASSWORD"] = (
    "b1e53999e80aecdcf2b9a9e6d0e2fea6ae02fa4c"  # Replace with your DagsHub token
)

# Set the correct MLflow tracking URI for DagsHub
mlflow.set_tracking_uri(
    "https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow"
)

# Set the experiment name
mlflow.set_experiment("XGBoost with HyperParam Tuning")

<Experiment: artifact_location='mlflow-artifacts:/67446475e5ed4f5caace8a4ebe9e8517', creation_time=1758369371939, experiment_id='4', last_update_time=1758369371939, lifecycle_stage='active', name='XGBoost with HyperParam Tuning', tags={}>

## Experiment

In [10]:

import optuna

import mlflow
import mlflow.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [11]:
df = pd.read_csv("processed_reddit_data.csv").dropna(subset=["clean_comment"])
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [14]:
# 1. Remap the class labels
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})
    
# 2. Remove rows where targets are NaN
df = df.dropna(subset=['category'])

# 3. Define n-gram range and max features
ngram_range = (1, 3)
max_features = 10000

# 4. Train test split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
)

# 5. Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 6. Initialize SMOTE
smote = SMOTE(random_state=42)
X_train_vec, y_train = smote.fit_resample(X_train_tfidf, y_train)

# 7. Log to MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run(run_name=model_name):
        # Set tags
        mlflow.set_tags("mlflow.runName", model_name)
        mlflow.set_tags("experiment", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algorithm", model_name)

        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if label not in ['accuracy', 'macro avg', 'weighted avg'] and isinstance(metrics, dict):
                for metric_name, metric_value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", metric_value)

        # Log model
        mlflow.sklearn.log_model(model, "model")


# 8. Optuna objective function
def objective_xgboost(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    max_depth = trial.suggest_int("max_depth", 3, 10)

    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
    )
    return accuracy_score(y_test, model.fit(X_train_vec, y_train).predict(X_test_tfidf))

# 9. Run Optuna study
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_xgboost, n_trials=20)

    print("Best trial:")
    trial = study.best_trial
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Log the best model to MLflow
    best_model = XGBClassifier(
        n_estimators=trial.params["n_estimators"],
        learning_rate=trial.params["learning_rate"],
        max_depth=trial.params["max_depth"],
    )
    log_mlflow("XGBoost_Optuna", best_model, X_train_vec, X_test_tfidf, y_train, y_test)

run_optuna_experiment()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})
[I 2025-09-20 15:11:44,586] A new study created in memory with name: no-name-528cdbfb-2c9d-40be-8c05-3954a0bbe1ca
[I 2025-09-20 15:11:47,769] Trial 0 finished with value: 0.9139387539598732 and parameters: {'n_estimators': 195, 'learning_rate': 0.25879315045514734, 'max_depth': 5}. Best is trial 0 with value: 0.9139387539598732.
[I 2025-09-20 15:11:49,951] Trial 1 finished with value: 0.7247448081661387 and parameters: {'n_estimators': 64, 'learning_rate': 0.01200191778566759, 'max_depth': 5}. Best is trial 0 with value: 0.9139387539598732.
[I 2025-09-20 15:11:58,582] Trial 2 finished with value: 0.8864836325237593 and parameters: {'n_estimators': 242, 'learning_rate': 

Best trial:
  Value: 0.9376979936642027
  Params: 
    n_estimators: 298
    learning_rate: 0.2546238945992379
    max_depth: 9
🏃 View run XGBoost_Optuna at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/4/runs/d0044864d80841ca8c076a9ed07eb6aa
🧪 View experiment at: https://dagshub.com/yahiaehab10/end-to-end-sentiment-analysis.mlflow/#/experiments/4


AttributeError: 'str' object has no attribute 'items'