## Step 1: Import basic libraries

In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle

## Step 2: Loading the dataset and basic exploring

In [6]:
# Location of the raw orders CSV. This is an absolute path on the author's
# machine; adjust it before running the notebook anywhere else.
csv_path = r"D:\Desktop\data analytics projects\Amazon delivery time\amazon_delivery.csv"
df = pd.read_csv(csv_path)

In [12]:
# Preview the first six rows of the raw frame.
df.head(6)

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys
5,fxuu788413734,22,4.8,17.431668,78.408321,17.461668,78.438321,2022-03-11,21:20:00,21:30:00,Cloudy,Jam,motorcycle,Urban,130,Toys


In [14]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43739 entries, 0 to 43738
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order_ID         43739 non-null  object 
 1   Agent_Age        43739 non-null  int64  
 2   Agent_Rating     43685 non-null  float64
 3   Store_Latitude   43739 non-null  float64
 4   Store_Longitude  43739 non-null  float64
 5   Drop_Latitude    43739 non-null  float64
 6   Drop_Longitude   43739 non-null  float64
 7   Order_Date       43739 non-null  object 
 8   Order_Time       43739 non-null  object 
 9   Pickup_Time      43739 non-null  object 
 10  Weather          43648 non-null  object 
 11  Traffic          43739 non-null  object 
 12  Vehicle          43739 non-null  object 
 13  Area             43739 non-null  object 
 14  Delivery_Time    43739 non-null  int64  
 15  Category         43739 non-null  object 
dtypes: float64(5), int64(2), object(9)
memory usage: 5.3+ MB


In [16]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Delivery_Time
count,43739.0,43685.0,43739.0,43739.0,43739.0,43739.0,43739.0
mean,29.567137,4.63378,17.21096,70.661177,17.459031,70.821842,124.905645
std,5.815155,0.334716,7.764225,21.475005,7.34295,21.153148,51.915451
min,15.0,1.0,-30.902872,-88.366217,0.01,0.01,10.0
25%,25.0,4.5,12.933298,73.170283,12.985996,73.28,90.0
50%,30.0,4.7,18.55144,75.898497,18.633626,76.002574,125.0
75%,35.0,4.9,22.732225,78.045359,22.785049,78.104095,160.0
max,50.0,6.0,30.914057,88.433452,31.054057,88.563452,270.0


In [18]:
# (rows, columns) of the raw frame.
df.shape

(43739, 16)

In [20]:
# Count missing values per column in the raw frame.
df.isnull().sum()

Order_ID            0
Agent_Age           0
Agent_Rating       54
Store_Latitude      0
Store_Longitude     0
Drop_Latitude       0
Drop_Longitude      0
Order_Date          0
Order_Time          0
Pickup_Time         0
Weather            91
Traffic             0
Vehicle             0
Area                0
Delivery_Time       0
Category            0
dtype: int64

## Step 3: Basic Data Cleaning

In [23]:
# Work on an independent copy holding only the predictors used for modeling
# plus the target column, so later cleaning never touches the raw frame.
feature_cols = ['Agent_Rating', 'Agent_Age', 'Weather', 'Traffic',
                'Vehicle', 'Area', 'Category']
target_col = 'Delivery_Time'
simple_df = df.loc[:, feature_cols + [target_col]].copy()

In [25]:
# Preview the first six rows of the reduced frame.
simple_df.head(6)

Unnamed: 0,Agent_Rating,Agent_Age,Weather,Traffic,Vehicle,Area,Category,Delivery_Time
0,4.9,37,Sunny,High,motorcycle,Urban,Clothing,120
1,4.5,34,Stormy,Jam,scooter,Metropolitian,Electronics,165
2,4.4,23,Sandstorms,Low,motorcycle,Urban,Sports,130
3,4.7,38,Sunny,Medium,motorcycle,Metropolitian,Cosmetics,105
4,4.6,32,Cloudy,High,scooter,Metropolitian,Toys,150
5,4.8,22,Cloudy,Jam,motorcycle,Urban,Toys,130


In [27]:
# Count missing values per column before imputation.
simple_df.isnull().sum()

Agent_Rating     54
Agent_Age         0
Weather          91
Traffic           0
Vehicle           0
Area              0
Category          0
Delivery_Time     0
dtype: int64

In [31]:

# Impute missing agent ratings with the column median.
# The result is assigned back to the column on simple_df (the frame that feeds
# the model). Calling fillna on the raw df would leave simple_df untouched and
# would also write the numeric median into every other column with gaps,
# including the text Weather column. No guard is needed: fillna is a no-op
# when the column has no missing values.
rating_median = simple_df['Agent_Rating'].median()
simple_df['Agent_Rating'] = simple_df['Agent_Rating'].fillna(rating_median)

In [37]:
# Label missing weather readings as their own 'Unknown' category.
# Assign the filled column back instead of using inplace=True on a column
# selection: that form is chained assignment, which pandas 2.x warns about and
# which does not modify simple_df when Copy-on-Write is enabled.
simple_df['Weather'] = simple_df['Weather'].fillna('Unknown')



In [39]:
# Re-count missing values per column after the imputation cells above.
simple_df.isnull().sum()

Agent_Rating     0
Agent_Age        0
Weather          0
Traffic          0
Vehicle          0
Area             0
Category         0
Delivery_Time    0
dtype: int64

## Step 4: Convert categorical data using simple pandas get_dummies

In [42]:
# One-hot encode every categorical predictor; numeric columns and the target
# pass through unchanged.
categorical_cols = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']
clean_df = pd.get_dummies(simple_df, columns=categorical_cols)

## Step 5: Prepare data for modeling

In [45]:
# Separate the target (Delivery_Time) from the feature matrix.
y = clean_df['Delivery_Time']
X = clean_df.drop(columns='Delivery_Time')

## Step 6: Train-Test Split

In [48]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

## Step 7: Train Random Forest Model 

In [72]:
# Fit a random forest with default hyperparameters on the training split.
# The seed fixes the forest's internal randomness so results are repeatable.
model = RandomForestRegressor(random_state=30)
model.fit(X=X_train, y=y_train)

## Step 8: Evaluate the Mean Absolute Error of the Model

In [74]:
# Score the hold-out set. MAE is expressed in the same unit as the target column.
# NOTE(review): Delivery_Time ranges from 10 to 270 with a mean near 125, which
# reads as minutes rather than hours; confirm against the dataset description.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error: {mae:.2f} minutes")


Mean Absolute Error: 22.67 hours


## Step 9: Save the Model

In [55]:
# Persist the fitted estimator so it can be reloaded without retraining.
with open('simple_delivery_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [56]:
# Persist the exact feature-column order seen during training, so inference
# code can align one-hot encoded inputs to the same layout.
model_columns = X.columns.tolist()
with open('model_columns.pkl', mode='wb') as columns_file:
    pickle.dump(model_columns, columns_file)