In [None]:
# Example of XGBoost ML program designed with Amazon Q Developer prompts
# 2025-03-13

"""
Initial prompt used in Amazon Q Developer:

Please write a Python XGBoost
program to predict order count by
sku_description. Please also use the
xgbfir library to identify feature
importance, and rank the importance
of 3-deep interactions among the
predictor variables.
"""


import os
import sys
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgbfir
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
data_folder = "/My/Local/Path/To/Data"  # add your link here
data_path = os.path.join(data_folder, 'demo_data.csv')  # bring in your data here
df = pd.read_csv(data_path, sep=',')
df.head(2)  # Uses prejoined data from the March 13th AWS seminar

Unnamed: 0,sku_description,sku_vendor,sku_category,price_per_item_usd,order_month,order_day_of_week,shipper_id,order_count,avg_quantity_per_order
0,Wobble Wand v15,Wo,Wobbl,34,10,3,1,1,9.0
1,Five Star Frame,Fi,Five,25,2,5,5,2,2.5


In [None]:

df['dt_order_placed'] = pd.to_datetime(df.date_order_placed)
df['dt_sku_created'] = pd.to_datetime(df.date_sku_created)
df['dt_sku_updated'] = pd.to_datetime(df.date_sku_updated)
df['order_month'] = df.dt_order_placed.dt.month
df['order_day_of_week'] = df.dt_order_placed.dt.dayofweek
df['sales_amount'] = df.quantity_ordered * df.price_per_item_usd
keep_cols = ['sku_description','sku_vendor','sku_category','price_per_item_usd','quantity_ordered',
             'order_month','order_day_of_week','shipper_id','sales_amount']
df = df[keep_cols]
df.iloc[0]

sku_description           Wobble Wand v15
sku_vendor                             Wo
sku_category                        Wobbl
price_per_item_usd                     34
order_month                            10
order_day_of_week                       3
shipper_id                              1
order_count                             1
avg_quantity_per_order                9.0
Name: 0, dtype: object

# Amazon Q Developer XGBoost Demo

In [27]:
df.head(5)

Unnamed: 0,sku_description,sku_vendor,sku_category,price_per_item_usd,order_month,order_day_of_week,shipper_id,order_count,avg_quantity_per_order
0,Wobble Wand v15,Wo,Wobbl,34,10,3,1,1,9.0
1,Five Star Frame,Fi,Five,25,2,5,5,2,2.5
2,Squishinator Extra v21,Sq,Squis,35,1,5,5,2,7.0
3,Five Star Frame,Fi,Five,25,7,1,5,8,4.875
4,Lightup Wand v25,Li,Light,18,10,3,1,2,3.5


In [28]:
# Prepare features
label_encoders = {}
categorical_columns = ['sku_description', 'sku_vendor', 'sku_category']

# Label encode categorical variables
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [None]:
# Prepare features and target
feature_columns = [
    'sku_description', 'sku_vendor', 'sku_category',
    'price_per_item_usd', 'order_month', 'order_day_of_week', 
    'shipper_id', 'avg_quantity_per_order'
]

X = df[feature_columns]
y = df['order_count']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create and train XGBoost model
model = xgb.XGBRegressor(
    objective='count:poisson',  # Changed to count objective
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)

model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    #early_stopping_rounds=10,
    verbose=True
)

[0]	validation_0-poisson-nloglik:1.86495
[1]	validation_0-poisson-nloglik:1.82248
[2]	validation_0-poisson-nloglik:1.78312
[3]	validation_0-poisson-nloglik:1.74615
[4]	validation_0-poisson-nloglik:1.71138
[5]	validation_0-poisson-nloglik:1.68062
[6]	validation_0-poisson-nloglik:1.65436
[7]	validation_0-poisson-nloglik:1.63161
[8]	validation_0-poisson-nloglik:1.61117
[9]	validation_0-poisson-nloglik:1.59349
[10]	validation_0-poisson-nloglik:1.57713
[11]	validation_0-poisson-nloglik:1.56299
[12]	validation_0-poisson-nloglik:1.55055
[13]	validation_0-poisson-nloglik:1.53916
[14]	validation_0-poisson-nloglik:1.52960
[15]	validation_0-poisson-nloglik:1.52096
[16]	validation_0-poisson-nloglik:1.51317
[17]	validation_0-poisson-nloglik:1.50642
[18]	validation_0-poisson-nloglik:1.50054
[19]	validation_0-poisson-nloglik:1.49541
[20]	validation_0-poisson-nloglik:1.48987
[21]	validation_0-poisson-nloglik:1.48535
[22]	validation_0-poisson-nloglik:1.48080
[23]	validation_0-poisson-nloglik:1.47746
[2

In [31]:
# Make predictions
y_pred = model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

Mean Squared Error: 1.21
R-squared Score: 0.67


In [32]:
# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
print("\nFeature Importance:")
print(feature_importance.sort_values('importance', ascending=False))


Feature Importance:
                  feature  importance
0         sku_description    0.598753
3      price_per_item_usd    0.215168
4             order_month    0.079655
7  avg_quantity_per_order    0.076428
5       order_day_of_week    0.016652
6              shipper_id    0.013344
1              sku_vendor    0.000000
2            sku_category    0.000000


In [33]:
# Generate XGBFIR report for interaction analysis
target_path = os.path.join(data_folder, 'xgb_interactions.xlsx')

xgbfir.saveXgbFI(
    model,
    feature_names=X.columns.tolist(),
    OutputXlsxFile=target_path
)

# Print top 10 three-way interactions
interactions_df = pd.read_excel(target_path, sheet_name='Interaction Depth 2')
print("\nTop 10 Three-way Feature Interactions:")
print(interactions_df.nlargest(10, 'Gain'))


Top 10 Three-way Feature Interactions:
                                         Interaction          Gain  FScore  \
0  price_per_item_usd|sku_description|sku_descrip...  26303.354662     118   
1     order_month|price_per_item_usd|sku_description  10610.637580     117   
2  avg_quantity_per_order|sku_description|sku_des...   7217.076748      80   
3  avg_quantity_per_order|price_per_item_usd|sku_...   6698.371967     105   
4  avg_quantity_per_order|avg_quantity_per_order|...   5741.063929     512   
5        order_month|sku_description|sku_description   5159.837191      77   
6  avg_quantity_per_order|avg_quantity_per_order|...   5118.181117     179   
7  avg_quantity_per_order|order_month|sku_descrip...   4468.182539      93   
8  avg_quantity_per_order|order_day_of_week|sku_d...   2782.278177      77   
9  avg_quantity_per_order|order_month|price_per_i...   2617.044036      74   

      wFScore  Average wFScore  Average Gain  Expected Gain  Gain Rank  \
0   22.291131         0.188

In [15]:
# Function to make predictions for new SKUs
def predict_sales(new_data):
    processed_data = new_data.copy()
    
    # Handle categorical variables
    for col in categorical_columns:
        if col in processed_data:
            # If the category is unknown, assign a default value
            try:
                processed_data[col] = label_encoders[col].transform(processed_data[col])
            except ValueError:
                # Assign the most frequent category (mode) from training data
                default_category = label_encoders[col].transform([df[col].mode()[0]])[0]
                processed_data[col] = default_category
                print(f"Warning: Unknown category in {col}. Using default category.")
    
    return model.predict(processed_data)

In [16]:
# Example usage with actual categories from your data
# First, let's print some example categories from the training data
print("\nExample categories from training data:")
for col in categorical_columns:
    print(f"\n{col} examples:")
    original_categories = label_encoders[col].inverse_transform(range(min(5, len(label_encoders[col].classes_))))
    print(original_categories)


Example categories from training data:

sku_description examples:
['Awesome Product v0' 'Boing Booster v4' 'Coffee Mug' 'Desktop Album'
 'Doodle Dazzler v5']

sku_vendor examples:
['Aw' 'Bo' 'Co' 'De' 'Do']

sku_category examples:
['Aweso' 'Boing' 'Coffe' 'Deskt' 'Doodl']


In [17]:


# Now create sample data using actual categories from the training data
sample_data = pd.DataFrame({
    'sku_description': [label_encoders['sku_description'].inverse_transform([0])[0]],
    'sku_vendor': [label_encoders['sku_vendor'].inverse_transform([0])[0]],
    'sku_category': [label_encoders['sku_category'].inverse_transform([0])[0]],
    'price_per_item_usd': [38.0],
    'order_month': [1],
    'order_day_of_week': [1],
    'shipper_id': [1]
})

print("\nMaking prediction with sample data:")
print("Sample data categories:", sample_data[categorical_columns].values.tolist())
predicted_sales = predict_sales(sample_data)
print(f"Predicted sales for sample SKU: ${predicted_sales[0]:.2f}")


Making prediction with sample data:
Sample data categories: [['Awesome Product v0', 'Aw', 'Aweso']]
Predicted sales for sample SKU: $248.84
