In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# --- 1. Load and Prepare the Data ---
df = pd.read_csv('../../data/tweets.csv.gz', compression="gzip")
# Keep only the relevant columns and drop rows with missing text
df = df[['airline_sentiment', 'text']].dropna()

In [3]:
# We will perform multiclass classification (positive, negative, neutral)
# Map the sentiment labels to numerical values for the model
df["airline_sentiment_encoded"] = df["airline_sentiment"].map({"negative": 0, "neutral": 1, "positive": 2})

In [4]:
# Select the features (X) and target variable (y)
X = df['text']
y = df['airline_sentiment_encoded'] # Use the encoded labels

In [5]:
# --- 2. Split Data for Training and Testing ---
# Split the data into training and testing sets.
# Using stratify=y ensures the proportion of each sentiment is the same in both sets.
# The random_state parameter ensures that the data is split in the same way every time.

from sklearn.model_selection import train_test_split

# Split before SMOTE
X_train, X_temp, y_train, y_temp = train_test_split(X, y, 
                                        stratify=y,            
                                        train_size=0.7, 
                                        random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, 
                                        stratify=y_temp,            
                                        train_size=0.5, 
                                        random_state=42)

In [6]:
# --- 3. Generate SBERT Embeddings for train test validation split ---
# We will use a pre-trained SBERT model to convert the tweet text into numerical vectors.
import os
import pandas as pd
from sentence_transformers import SentenceTransformer

EMBEDDING_TRAIN = "../../data/sbert_embedding_train.csv.gz"

## Load sbert embedding for train set
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_train_sbert_embedding = sbert_model.encode(X_train.tolist(), show_progress_bar=True)

## Load sbert embedding for validation set
X_valid_sbert_embedding = sbert_model.encode(X_valid.tolist(), show_progress_bar=True)    

## Load sbert embedding for test validation set
X_test_sbert_embedding = sbert_model.encode(X_test.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/321 [00:00<?, ?it/s]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]

In [7]:
# ---- 3. Handling class imbalance issue with SMOTE --- 
from imblearn.over_sampling import SMOTE

# Handling imbalanced using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sbert_embedding, y_train)

In [8]:
y_train_resampled.shape
X_train_resampled.shape

(19275, 384)

In [None]:
# --- 4. Hyperparameter Tuning with GridSearchCV ---
print("\nStarting Hyperparameter Tuning...")

# Define the XGBoost classifier.
# We set a random_state for reproducibility of the model's internal processes.
# The 'objective' is set to 'multi:softprob' for multi-class classification.
# We also set 'use_label_encoder=False' to avoid a deprecation warning.
xgb_clf = XGBClassifier(
    objective='multi:softprob',
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42 # Set random_state for reproducibility
)

xgb_clf = XGBClassifier(max_depth=10,
                            random_state=42,
                            # Introduce randomness to make training faster and reduce overfitting
                            subsample=0.8, ## Uses 80% of the data for each tree.
                            colsample_bytree=0.8, ## Uses 80% of the features for each tree.
                            # the parameters below make the model trained faster by enabling parallelism
                            n_jobs = -1)

# Define the parameter grid to search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Set up GridSearchCV to find the best parameters.
# cv=3 specifies 3-fold cross-validation.
# n_jobs=-1 will use all available CPU cores to speed up the process.
# verbose=2 will print progress updates.
# Note: The cross-validation splitting in GridSearchCV is also a random process.
# By default, it uses StratifiedKFold for classifiers, which has a shuffle=True default.
# While not strictly necessary if train_test_split is seeded, it's good practice to also control this.
from sklearn.model_selection import StratifiedKFold
cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    # cv=cv_strategy, # Use the defined cross-validation strategy
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV to the training data. This will train and evaluate the model with all parameter combinations.
grid_search.fit(X_train_resampled, y_train_resampled)



Starting Hyperparameter Tuning...
Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:

# --- 5. Evaluate the Best Model ---
print("\nHyperparameter tuning complete.")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Get the best model found by GridSearchCV
best_model = grid_search.best_estimator_





In [None]:
# Make predictions on the test set
# For 'multi:softprob', predict_proba() gives probabilities for each class.
# We use np.argmax to get the class with the highest probability.

### (A) Evaluate the model's performance for Valid set
y_valid_proba = best_model.predict_proba(X_valid)
y_valid = np.argmax(y_valid_proba, axis=1)

print("\n--- Final Model Performance on Test Set ---")
print(f"Accuracy: {accuracy_score(y_test, y_valid):.4f}")
print("\nClassification Report (Validation):")
print(classification_report(y_test, y_valid))

### (B) Evaluate the model's performance for Test set
y_pred_proba = best_model.predict_proba(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

print("\n--- Final Model Performance on Test Set ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred))