## Hyperparameter tuning on XGBoost & Logistics Regression on LLM embedding

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import (f1_score, accuracy_score, precision_score,
                             recall_score, classification_report)

In [2]:
# --- 1. Load and Prepare the Data ---
df = pd.read_csv('../../data/tweets.csv.gz', compression="gzip")
# Keep only the relevant columns and drop rows with missing text
df = df[['airline_sentiment', 'text']].dropna()

In [3]:
# We will perform multiclass classification (positive, negative, neutral)
# Map the sentiment labels to numerical values for the model
df["airline_sentiment_encoded"] = df["airline_sentiment"].map({"negative": 0, "neutral": 1, "positive": 2})

In [4]:
# Select the features (X) and target variable (y)
X = df['text']
y = df['airline_sentiment_encoded'] # Use the encoded labels

In [5]:
# --- 2. Split Data for Training and Testing ---
# Split the data into training and testing sets.
# Using stratify=y ensures the proportion of each sentiment is the same in both sets.
# The random_state parameter ensures that the data is split in the same way every time.

from sklearn.model_selection import train_test_split

# Split before SMOTE
X_train, X_temp, y_train, y_temp = train_test_split(X, y, 
                                        stratify=y,            
                                        train_size=0.7, 
                                        random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, 
                                        stratify=y_temp,            
                                        train_size=0.5,
                                        random_state=42)

In [None]:
# --- 3. Generate SBERT Embeddings for train test validation split ---
# We will use a pre-trained SBERT model to convert the tweet text into numerical vectors.

import os
import pandas as pd 
from utils.prepare_llm_embedding import generate_embeddings_from_series 

# generate llm embedding for train data (or load existing one if exists)
EMBEDDING_TRAIN = "../../data/hyperparameter_tuning_llm_embedding_train.csv.gz"
if os.path.exists(EMBEDDING_TRAIN):
    pass
else:
    processed_text_series = pd.Series(X_train.to_list(),
                                    index=X_train.index.to_list()) 
    llm_embedding_train = generate_embeddings_from_series(processed_text_series,
                            additional_data={"encoded_sentiment": y_train.to_list()},
                            output_csv_path="../../data/hyperparameter_tuning_llm_embedding_train.csv.gz",
                            max_workers=20)
    print(llm_embedding_train)

# generate llm embedding for valid data (or load existing one if exists)
EMBEDDING_VALID = "../../data/hyperparamter_tuning_llm_embedding_valid.csv.gz"
if os.path.exists(EMBEDDING_VALID):
    pass
else:
    processed_text_series = pd.Series(X_valid.to_list(), 
                                    index=X_valid.index.to_list()) 
    llm_embedding_valid = generate_embeddings_from_series(processed_text_series,
                            additional_data={"encoded_sentiment": y_valid.to_list()},
                            output_csv_path="../../data/hyperparamter_tuning_llm_embedding_valid.csv.gz",
                            max_workers=20) 
    print(llm_embedding_valid)
    
# generate llm embedding for test data (or load existing one if exists)
EMBEDDING_TEST = "../../data/hyperparamter_tuning_llm_embedding_test.csv.gz"
if os.path.exists(EMBEDDING_TEST):
    pass
else:
    processed_text_series = pd.Series(X_test.to_list(), 
                                    index=X_test.index.to_list()) 
    llm_embedding_test = generate_embeddings_from_series(processed_text_series,
                            additional_data={"encoded_sentiment": y_test.to_list()},
                            output_csv_path="../../data/hyperparamter_tuning_llm_embedding_test.csv.gz",
                            max_workers=20)
    print(llm_embedding_test)

In [7]:
import numpy
import json

train_vectorized = pd.read_csv("../../data/hyperparameter_tuning_llm_embedding_train.csv.gz", compression="gzip")
valid_vectorized = pd.read_csv("../../data/hyperparamter_tuning_llm_embedding_valid.csv.gz", compression="gzip")
test_vectorized = pd.read_csv("../../data/hyperparamter_tuning_llm_embedding_test.csv.gz", compression="gzip")

X_train_vectorized = train_vectorized["embedding_json"].apply(json.loads) # convert string into a list of 765 items in 1 column
X_train_vectorized = numpy.vstack(X_train_vectorized) # turn that list of 765 items into 765 features / columns
y_train = train_vectorized["encoded_sentiment"]

X_valid_vectorized = valid_vectorized["embedding_json"].apply(json.loads)
X_valid_vectorized = numpy.vstack(X_valid_vectorized)

X_test_vectorized = test_vectorized["embedding_json"].apply(json.loads)
X_test_vectorized = numpy.vstack(X_test_vectorized)

y_valid = valid_vectorized["encoded_sentiment"]
y_test = test_vectorized["encoded_sentiment"]

In [8]:
# ---- 3. Handling class imbalance issue with SMOTE --- 
from imblearn.over_sampling import SMOTE

# Handling imbalanced using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectorized, y_train)

In [9]:
# --- 4. Hyperparameter Tuning with RandomizedSearchCV ---
print("\nStarting Hyperparameter Tuning...")

import joblib
# Define the XGBoost classifier.
xgb_clf = XGBClassifier(max_depth=10,
                            random_state=42,
                            # Introduce randomness to make training faster and reduce overfitting
                            subsample=0.8, ## Uses 80% of the data for each tree.
                            colsample_bytree=0.8, ## Uses 80% of the features for each tree.
                            # the parameters below make the model trained faster by enabling parallelism
                            n_jobs = -1)

# Define the parameter grid to search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Set up RandomizedSearchCV to find the best parameters.
# cv=3 specifies 3-fold cross-validation.
# n_jobs=-1 will use all available CPU cores to speed up the process.
# verbose=2 will print progress updates.
# Note: The cross-validation splitting in RandomizedSearchCV is also a random process.
# By default, it uses StratifiedKFold for classifiers, which has a shuffle=True default.
# While not strictly necessary if train_test_split is seeded, it's good practice to also control this.
from sklearn.model_selection import StratifiedKFold
cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    scoring='accuracy',
    cv=cv_strategy, # Use the defined cross-validation strategy
    verbose=2,
    n_jobs=-1
)

# Fit RandomizedSearchCV to the training data. This will train and evaluate the model with random parameter combinations.
grid_search.fit(X_train_resampled, y_train_resampled)



Starting Hyperparameter Tuning...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [10]:
# --- 5. Evaluate the Best Model ---
print("\nHyperparameter tuning complete.")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Get the best model found by RandomizedSearchCV
best_model = grid_search.best_estimator_


Hyperparameter tuning complete.
Best parameters found: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.2, 'colsample_bytree': 1.0}
Best cross-validation accuracy: 0.9336


In [11]:
# Make predictions on the test set
# predict_proba() gives probabilities for each class.
# We use np.argmax to get the class with the highest probability.

### (A) Evaluate the model's performance for Valid set
y_valid_proba_pred = best_model.predict_proba(X_valid_vectorized)
y_valid_pred = np.argmax(y_valid_proba_pred, axis=1)

print("\n--- Final Model Performance on Validation Set ---")

print(f"Validation Accuracy: {accuracy_score(y_valid, y_valid_pred):.4f}")
print(f"Validation F1-score: {f1_score(y_valid, y_valid_pred, average='weighted'):.4f}")
print(f"Validation Precision:  {precision_score(y_valid, y_valid_pred, average='weighted'):.4f}")
print(f"Validation Recall:  {recall_score(y_valid, y_valid_pred, average='weighted'):.4f}\n")

print("\nClassification Report (Validation):")
print(classification_report(y_valid, y_valid_pred))


--- Final Model Performance on Validation Set ---
Validation Accuracy: 0.8520
Validation F1-score: 0.8529
Validation Precision:  0.8541
Validation Recall:  0.8520


Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      1376
           1       0.69      0.73      0.71       465
           2       0.81      0.79      0.80       355

    accuracy                           0.85      2196
   macro avg       0.81      0.81      0.81      2196
weighted avg       0.85      0.85      0.85      2196



In [12]:
### (B) Evaluate the model's performance for Test Set
y_test_proba_pred = best_model.predict_proba(X_test_vectorized)
y_test_pred = np.argmax(y_test_proba_pred, axis=1)

print("\n--- Final Model Performance on Test Set ---")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Test F1-score: {f1_score(y_test, y_test_pred, average='weighted')}")
print(f"Test Precision: {precision_score(y_test, y_test_pred, average='weighted'):.4f}")
print(f"Test Recall:  {recall_score(y_test, y_test_pred, average='weighted'):.4f}\n")

print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred))


--- Final Model Performance on Test Set ---
Test Accuracy: 0.8552
Test F1-score: 0.8565059466421867
Test Precision: 0.8585
Test Recall:  0.8552


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1377
           1       0.69      0.75      0.72       465
           2       0.82      0.78      0.80       354

    accuracy                           0.86      2196
   macro avg       0.81      0.81      0.81      2196
weighted avg       0.86      0.86      0.86      2196

