<a href="https://colab.research.google.com/github/veradureke/Technology-Product-Categorization/blob/main/Supervised_Learning_(XGboost).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

# Make the shared Drive folder visible to this Colab runtime
drive.mount('/content/drive')

# Locations of the three input tables inside the project folder
categories_file_path = '/content/drive/MyDrive/Verizon 1 2024-2025/categories.csv'
products_file_path = '/content/drive/MyDrive/Verizon 1 2024-2025/products.csv'
sub_categories_file_path = '/content/drive/MyDrive/Verizon 1 2024-2025/sub-categories.csv'

# Read each CSV into its own DataFrame (same order as the paths above)
categories_df, products_df, sub_categories_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        categories_file_path,
        products_file_path,
        sub_categories_file_path,
    )
)

# Split / vectorizer settings, named so they can be tuned in one place
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_TFIDF_FEATURES = 5000

# Data preparation
# TfidfVectorizer rejects NaN documents, so a missing description is treated
# as empty text and every document is coerced to a plain string.
X = products_df['product_description'].fillna('').astype(str)
y = products_df['taxonomy_category']

# Encode target labels as integers so the classifier can consume them;
# label_encoder is kept to map predictions back to category names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split, stratified on the label so both sides keep the same
# category proportions
X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
    X,
    y_encoded,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_encoded,
)

# TF-IDF vectorization: the vocabulary and IDF statistics are fitted on the
# training text only, then reused unchanged for the test text.
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Calculate class weights for balancing
# With class_weight='balanced', rarer classes receive larger weights.
train_classes = np.unique(y_train_encoded)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train_encoded
)

# Key the weights by the actual class label rather than by array position, so
# the lookup stays correct even if an encoded label is missing from the
# training split.
class_weight_dict = dict(zip(train_classes, class_weights))

# One weight per training row, looked up from that row's class
train_sample_weight = np.array(
    [class_weight_dict[label] for label in y_train_encoded]
)

# Train XGBoost model
# scale_pos_weight is intentionally not passed: XGBoost reports it as unused
# for this model, and class imbalance is handled through sample_weight instead.
xgb_model = XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(
    X_train_tfidf,
    y_train_encoded,
    sample_weight=train_sample_weight
)

# Evaluation
y_pred_encoded = xgb_model.predict(X_test_tfidf)

# Pass the full list of encoded labels explicitly. Without `labels`,
# classification_report infers the classes from y_true/y_pred and raises a
# ValueError when a rare category is absent from both, because the inferred
# count no longer matches len(target_names).
all_class_ids = list(range(len(label_encoder.classes_)))
xgb_report = classification_report(
    y_test_encoded,
    y_pred_encoded,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0  # report 0.0 instead of warning for classes never predicted
)

print("XGBoost Classification Report (Adjusted):")
print(xgb_report)


Mounted at /content/drive


Parameters: { "scale_pos_weight" } are not used.



XGBoost Classification Report (Adjusted):
                                      precision    recall  f1-score   support

                             AI & ML       0.00      0.00      0.00         6
                 BI & Data Analytics       0.32      0.48      0.39        23
      Customer Operational Platforms       0.46      0.43      0.45        30
      Design, Development & Delivery       0.55      0.51      0.53        35
                   Digital Workplace       0.29      0.31      0.30        13
               Emerging Technologies       0.00      0.00      0.00         1
                Enterprise Platforms       0.47      0.56      0.51        48
Geographic Information Systems (GIS)       0.83      0.50      0.62        10
 IT Infrastructure Software Services       0.51      0.48      0.50        56
             Management & Governance       0.43      0.23      0.30        13
                Marketing Management       0.60      0.50      0.55        18
   Performance, Monit

In [None]:
# Function to make predictions for a description
def predict_category(sample_description, text_vectorizer=None, classifier=None, encoder=None):
    """
    Predict the category of a given product description.

    By default the function uses the notebook-level objects created by the
    training cell (``vectorizer``, ``xgb_model`` and ``label_encoder``).
    Fitted objects can also be passed in explicitly, which removes the
    dependency on kernel state and makes the function usable from other code.

    Args:
        sample_description (str): The product description to categorize.
        text_vectorizer: Optional fitted vectorizer exposing ``transform``.
            Defaults to the global ``vectorizer``.
        classifier: Optional fitted model exposing ``predict``.
            Defaults to the global ``xgb_model``.
        encoder: Optional fitted label encoder exposing ``inverse_transform``.
            Defaults to the global ``label_encoder``.

    Returns:
        str: Predicted category.

    Raises:
        TypeError: If ``sample_description`` is not a string.
        NameError: If a default object is needed but the training cell has
            not been run in this kernel.
    """
    if not isinstance(sample_description, str):
        raise TypeError(
            "sample_description must be a str, got "
            f"{type(sample_description).__name__}"
        )

    # Fall back to the notebook globals only for the pieces not supplied.
    # A missing global is reported with an actionable message rather than a
    # bare "name 'vectorizer' is not defined".
    try:
        if text_vectorizer is None:
            text_vectorizer = vectorizer
        if classifier is None:
            classifier = xgb_model
        if encoder is None:
            encoder = label_encoder
    except NameError as exc:
        raise NameError(
            "predict_category needs the fitted `vectorizer`, `xgb_model` and "
            "`label_encoder`; run the training cell first or pass them explicitly."
        ) from exc

    # Transform the input description using the trained TF-IDF vectorizer
    # (wrapped in a list because transform expects an iterable of documents)
    sample_tfidf = text_vectorizer.transform([sample_description])

    # Predict the encoded label using the trained model
    predicted_encoded = classifier.predict(sample_tfidf)

    # Decode the predicted label back to the original category
    predicted_category = encoder.inverse_transform(predicted_encoded)

    return predicted_category[0]

# Demo: classify one hand-written product description
sample_description = (
    "A voice, video, and text communication platform designed for creating communities. "
    "Offers real-time messaging, screen sharing, and integrations with gaming and productivity tools."
)
predicted_category = predict_category(sample_description)

# Echo the input next to the model's answer, one per line
print(
    f"Sample Description: {sample_description}",
    f"Predicted Category: {predicted_category}",
    sep="\n",
)

NameError: name 'vectorizer' is not defined