In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pickle

In [4]:
# Read the delivery records from the CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="supply_chain_deliveries.csv")

In [5]:
# Preview the first five rows to check column names and value formats.
df.head(n=5)

Unnamed: 0,WorkDate,Customer,Location,BusinessType,OrderCount,NumberOfPieces,TotalRevenue
0,2020-01-02,Amazon,Chicago,Final Mile,38,190,2084.09
1,2020-01-02,Home Depot,Sacramento,Final Mile,34,136,6153.01
2,2020-01-02,Home Depot,Chicago,Final Mile,43,215,15691.72
3,2020-01-02,Home Depot,Detroit,Final Mile,41,164,6490.39
4,2020-01-02,Home Depot,Atlanta,Final Mile,44,220,10069.65


In [6]:
# Print the frame's schema: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126255 entries, 0 to 126254
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   WorkDate        126255 non-null  object 
 1   Customer        126255 non-null  object 
 2   Location        126255 non-null  object 
 3   BusinessType    126255 non-null  object 
 4   OrderCount      126255 non-null  int64  
 5   NumberOfPieces  126255 non-null  int64  
 6   TotalRevenue    126255 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 6.7+ MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,OrderCount,NumberOfPieces,TotalRevenue
count,126255.0,126255.0,126255.0
mean,27.487759,137.447871,2619.589379
std,23.390942,119.446285,3732.441225
min,1.0,3.0,26.02
25%,11.0,52.0,688.61
50%,20.0,100.0,1450.79
75%,37.0,185.0,2998.13
max,149.0,1015.0,64318.07


In [8]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

WorkDate          0
Customer          0
Location          0
BusinessType      0
OrderCount        0
NumberOfPieces    0
TotalRevenue      0
dtype: int64

In [41]:
# Modelling section: read the CSV again so the cells below start from the
# file contents rather than from whatever state `df` was left in above.
df = pd.read_csv(filepath_or_buffer="supply_chain_deliveries.csv")


In [42]:
# Build the feature matrix and the target without rebinding `df`.
# The previous form (`df = df.drop(columns=["WorkDate"])`) made this cell
# non-idempotent: running it a second time on the same kernel raised KeyError
# because "WorkDate" had already been removed from `df`.
# WorkDate is left out of the features; TotalRevenue is the regression target.
X = df.drop(columns=["WorkDate", "TotalRevenue"])
y = df["TotalRevenue"]

In [43]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [44]:
# Column groups used by the preprocessing step.
# Text-valued columns (handled by the categorical transformer):
categorical_cols = [
    "Customer",
    "Location",
    "BusinessType",
]
# Integer count columns (handled by the numeric transformer):
numeric_cols = [
    "OrderCount",
    "NumberOfPieces",
]

In [45]:
# Per-column preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones. handle_unknown="ignore" makes the encoder tolerate
# categories at predict time that were not present when it was fitted.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

In [46]:
# Candidate regressors, keyed by the label shown in the results table.
# The tree-based estimators get a fixed seed so repeated fits are identical.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree", DecisionTreeRegressor(random_state=42)),
        ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ]
)

In [47]:
# Fit every candidate on the training split and score it on the held-out split.
# One record per model is collected and turned into a table at the end.
results = []

for name, model in models.items():
    # `fit` returns the pipeline itself, so construction and fitting can be chained.
    # Note: `pipeline` is rebound on every iteration and ends up holding the
    # LAST model in `models`, not necessarily the best-scoring one.
    pipeline = make_pipeline(preprocessor, model).fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results.append(
        {
            "Model": name,
            "MAE": mean_absolute_error(y_test, y_pred),
            "R2 Score": r2_score(y_test, y_pred),
        }
    )

# Rank the candidates, best R2 first.
results_df = pd.DataFrame(results).sort_values("R2 Score", ascending=False)
print(results_df)


               Model          MAE  R2 Score
1      Decision Tree     4.432243  0.999446
2  Gradient Boosting   666.113182  0.909974
0  Linear Regression  1305.259098  0.659993


In [57]:
# Pick the row with the highest R2 score and pull out its details.
best_model_row = results_df.loc[results_df["R2 Score"].idxmax()]
best_model_name = best_model_row["Model"]
best_model_mae = best_model_row["MAE"]
best_model_r2 = best_model_row["R2 Score"]

# Build the pipeline for the WINNING model by name. The loop variable
# `pipeline` must not be used here: after the evaluation loop it holds the
# last model iterated (Gradient Boosting), so `best_model = pipeline` would
# report one model and persist a different one.
# Refitting on the same training split with the same fixed seeds reproduces
# the model whose metrics are printed below.
best_model = make_pipeline(preprocessor, models[best_model_name])
best_model.fit(X_train, y_train)

# NOTE(review): a near-perfect tree score can indicate identical rows on both
# sides of the random split (WorkDate is dropped before splitting) - confirm.

# Print results
print(f"Best Model: {best_model_name}")
print(f"Mean Absolute Error (MAE): {best_model_mae:.2f}")
print(f"R-squared Score (R²): {best_model_r2:.4f}")

Best Model: Decision Tree
Mean Absolute Error (MAE): 4.43
R-squared Score (R²): 0.9994


In [59]:
# Persist the selected pipeline (preprocessing + estimator) to disk.
# Pickle files should only ever be loaded from trusted sources.
with open("revenue_model.pkl", mode="wb") as model_file:
    pickle.dump(best_model, model_file)