In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import joblib

In [3]:
# Load the raw delivery records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="supply_chain_deliveries.csv")

In [4]:
# Peek at the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,WorkDate,Customer,Location,BusinessType,OrderCount,NumberOfPieces,TotalRevenue
0,2020-01-02,Amazon,Chicago,Final Mile,38,190,2084.09
1,2020-01-02,Home Depot,Sacramento,Final Mile,34,136,6153.01
2,2020-01-02,Home Depot,Chicago,Final Mile,43,215,15691.72
3,2020-01-02,Home Depot,Detroit,Final Mile,41,164,6490.39
4,2020-01-02,Home Depot,Atlanta,Final Mile,44,220,10069.65


In [5]:
# Print the column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126255 entries, 0 to 126254
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   WorkDate        126255 non-null  object 
 1   Customer        126255 non-null  object 
 2   Location        126255 non-null  object 
 3   BusinessType    126255 non-null  object 
 4   OrderCount      126255 non-null  int64  
 5   NumberOfPieces  126255 non-null  int64  
 6   TotalRevenue    126255 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 6.7+ MB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,OrderCount,NumberOfPieces,TotalRevenue
count,126255.0,126255.0,126255.0
mean,27.487759,137.447871,2619.589379
std,23.390942,119.446285,3732.441225
min,1.0,3.0,26.02
25%,11.0,52.0,688.61
50%,20.0,100.0,1450.79
75%,37.0,185.0,2998.13
max,149.0,1015.0,64318.07


In [7]:
# Count missing values per column; all zeros means no imputation is needed.
df.isna().sum()

WorkDate          0
Customer          0
Location          0
BusinessType      0
OrderCount        0
NumberOfPieces    0
TotalRevenue      0
dtype: int64

In [8]:
# Convert the WorkDate strings to datetime values so the .dt accessor can be used.
df['WorkDate'] = pd.to_datetime(df['WorkDate'])

# Feature engineering: weekday as an integer (Monday=0 ... Sunday=6).
df['DayOfWeek'] = df['WorkDate'].dt.weekday

In [9]:
# Model inputs (three categorical columns followed by three numeric ones)
# and the column the regressor is trained to predict.
features = [
    'Customer',
    'Location',
    'BusinessType',
    'OrderCount',
    'NumberOfPieces',
    'DayOfWeek',
]
target = 'TotalRevenue'

In [10]:
# Expand each categorical column into one indicator column per category.
# handle_unknown='ignore' makes categories unseen at fit time encode as all zeros
# instead of raising; sparse_output=False returns a dense array.
categorical_features = ['Customer', 'Location', 'BusinessType']
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cats = ohe.fit(df[categorical_features]).transform(df[categorical_features])
encoded_cat_df = pd.DataFrame(
    data=encoded_cats,
    columns=ohe.get_feature_names_out(categorical_features),
)

In [11]:
# Combine with numerical features
# Derive the numeric columns from `features` instead of repeating them, so the
# declared feature list is the single source of truth for the model inputs.
numerical_features = [col for col in features if col not in categorical_features]

# encoded_cat_df was built from a bare array and therefore has a fresh 0..n-1
# index; reset the numeric frame the same way so concat pairs rows by position.
numerical_df = df[numerical_features].reset_index(drop=True)
X = pd.concat([numerical_df, encoded_cat_df], axis=1)

# Give the target the same positional index as X so both stay row-aligned.
y = df[target].reset_index(drop=True)

# A misaligned concat would silently add NaN-padded rows; fail loudly instead.
assert len(X) == len(df) == len(y), "feature matrix and target are not row-aligned"

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Fit a small random forest (10 trees, fixed seed) on the training split.
# fit() returns the estimator itself, so the chained form binds the fitted model.
model = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, y_train)
model

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Score the held-out split with MAE (in revenue units) and R-squared.
# NOTE(review): the recorded run reports R-squared of 1.00; worth confirming the
# target is not a near-deterministic function of the input columns.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [15]:
# Report both metrics, one per line, rounded to two decimals.
print(
    f"Mean Absolute Error: {mae:.2f}",
    f"R-squared Score: {r2:.2f}",
    sep="\n",
)

Mean Absolute Error: 7.79
R-squared Score: 1.00


In [16]:
# Persist the fitted regressor together with the fitted encoder: inference must
# apply the same one-hot layout the model was trained on.
joblib.dump(value=model, filename="revenue_model.pkl")
joblib.dump(value=ohe, filename="encoder.pkl")

['encoder.pkl']