In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data
# Read the raw delivery records, then replace the 'datetime' column with
# parsed timestamps (the column keeps its original position in the frame).
df = pd.read_csv('synthetic_delivery_data.csv')
df = df.assign(datetime=pd.to_datetime(df['datetime']))

# Preview the first rows under a heading.
print("Original data:", df.head(), sep="\n")

In [None]:

# Temporal Features
# Derive calendar components from the parsed 'datetime' column.
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
# dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Create time bins
# Use left-closed intervals [0, 6), [6, 12), [12, 18), [18, 24) so that every
# hour 0-23 lands in a bin. With pd.cut's default right-closed intervals,
# hour 0 falls outside (0, 6] and becomes NaN, and hours 6/12/18 are pushed
# into the preceding bin.
df['time_bin'] = pd.cut(
    df['hour'],
    bins=[0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    right=False,
)

print("\nTemporal features:")
print(df[['datetime', 'hour', 'day_of_week', 'is_weekend', 'time_bin']].head())

In [None]:
# Geospatial Features
# Build point geometries from the longitude/latitude columns, tagged as
# EPSG:4326 (geographic coordinates in degrees).
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs='EPSG:4326')

# Calculate distance from city center (San Francisco)
city_center = gpd.GeoDataFrame({'geometry': [gpd.points_from_xy([-122.4194], [37.7749])[0]]}, crs='EPSG:4326')

# Extract the single reprojected centre point. Passing the one-row
# GeoDataFrame itself to .distance() would align on index, so only the row
# with index 0 would get a value and every other row would be NaN. A scalar
# geometry is broadcast against all rows instead.
center_point = city_center.to_crs(crs='EPSG:3857').geometry.iloc[0]
# NOTE(review): EPSG:3857 (Web Mercator) stretches distances away from the
# equator, so these values overstate true ground distance at this latitude.
# Consider a local projected CRS if absolute metres matter.
gdf['distance_from_center'] = gdf.to_crs(crs='EPSG:3857').distance(center_point)

print("\nGeospatial features:")
print(gdf[['latitude', 'longitude', 'distance_from_center']].head())

In [None]:
# Weather and Traffic Features
# Each categorical column gets its own encoder so the fitted objects stay
# available for later use.
# NOTE(review): LabelEncoder assigns integer codes, not an ordinal ranking;
# confirm that is acceptable for 'traffic' if its levels are ordered.
le_weather = LabelEncoder()
gdf['weather_encoded'] = le_weather.fit_transform(gdf['weather'])

le_traffic = LabelEncoder()
gdf['traffic_encoded'] = le_traffic.fit_transform(gdf['traffic'])

# Show raw labels next to their integer codes.
print(
    "\nWeather and traffic features:",
    gdf[['weather', 'weather_encoded', 'traffic', 'traffic_encoded']].head(),
    sep="\n",
)


In [None]:
# Additional Features
# Reuse one datetime accessor for all calendar-derived columns.
timestamps = gdf['datetime'].dt

gdf['month'] = timestamps.month
gdf['day'] = timestamps.day
# Minutes elapsed since 00:00 on the same day (seconds are ignored).
gdf['time_since_midnight'] = timestamps.minute + 60 * timestamps.hour
gdf['day_of_year'] = timestamps.dayofyear

print(
    "\nAdditional features:",
    gdf[['month', 'day', 'time_since_midnight', 'day_of_year']].head(),
    sep="\n",
)

In [None]:

# Final Feature Set
# Columns used as model inputs, in the order they appear in X.
features = [
    'hour',
    'day_of_week',
    'is_weekend',
    'distance_from_center',
    'weather_encoded',
    'traffic_encoded',
    'month',
    'day',
    'time_since_midnight',
    'day_of_year',
]
# Column holding the value to predict.
target = 'duration_minutes'

# Split the engineered frame into the feature matrix and the target series.
X, y = gdf[features], gdf[target]

print("\nFinal feature set:", X.head(), sep="\n")
print("\nTarget variable:", y.head(), sep="\n")


In [None]:
# Feature Correlation Analysis
# Pairwise correlation between all selected feature columns.
correlation_matrix = X.corr()

# Draw the heatmap on an explicit figure/axes pair, write it to disk, and
# close the figure so it is not rendered inline.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
fig.savefig('feature_correlation_heatmap.png')
plt.close(fig)

print("\nFeature correlation heatmap saved as 'feature_correlation_heatmap.png'")

# Save Processed Data
# Persist the full engineered frame (all columns, not only `features`)
# without the index column.
gdf.to_csv('processed_delivery_data.csv', index=False)
print("\nProcessed data saved to 'processed_delivery_data.csv'")