In [2]:
# use Terminal to upgrade jupyter server if needed:
# pip install --upgrade jupyter_server 

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import random
from datetime import datetime, timedelta

### Test and display the location data

In [3]:
# Sampling area: a square box centred on the city, reaching city_radius
# degrees in each direction along both axes.
city_center = Point(-122.4194, 37.7749)  # San Francisco coordinates
city_radius = 0.1  # Roughly 11km

# Draw 1000 random longitudes, then 1000 random latitudes, inside that box
longs = np.random.uniform(
    low=city_center.x - city_radius,
    high=city_center.x + city_radius,
    size=1000,
)
lats = np.random.uniform(
    low=city_center.y - city_radius,
    high=city_center.y + city_radius,
    size=1000,
)

# Wrap every (longitude, latitude) pair in a shapely Point and collect the
# points in a GeoSeries, which is later used as the GeoDataFrame geometry
locations = gpd.GeoSeries(list(map(Point, zip(longs, lats))))

In [5]:
# Preview the GeoSeries of delivery points (the bare expression is rendered as the cell output)
locations

0      POINT (-122.37157 37.75201)
1      POINT (-122.45448 37.72195)
2      POINT (-122.38033 37.76449)
3      POINT (-122.38550 37.76048)
4      POINT (-122.37689 37.69876)
                  ...             
995    POINT (-122.36033 37.85258)
996    POINT (-122.42743 37.79911)
997    POINT (-122.48432 37.81366)
998    POINT (-122.41165 37.79200)
999    POINT (-122.36973 37.79125)
Length: 1000, dtype: geometry

### Test and display the time data

In [12]:
 # Generate delivery times
num_days = 30
num_deliveries = 1000
start_datetime = datetime.strptime('2024-01-01', '%Y-%m-%d')

# Each delivery gets a random day offset inside the window, a random hour
# from 8 to 20 (inclusive) and a random minute, all added to the start date.
datetimes = [
    start_datetime + timedelta(days=day_offset, hours=hour, minutes=minute)
    for day_offset, hour, minute in (
        (random.randint(0, num_days - 1), random.randint(8, 20), random.randint(0, 59))
        for _ in range(num_deliveries)
    )
]

# Delivery durations in minutes: normal draws (mean 30, std 10) cast to
# whole minutes, then limited to the range 5..120 (values below 5 become 5,
# values above 120 become 120).
durations = np.clip(
    np.random.normal(loc=30, scale=10, size=num_deliveries).astype(int),
    5,
    120,
)

In [17]:
# Show the first ten generated delivery timestamps as a quick sanity check
datetimes[:10]

[datetime.datetime(2024, 1, 15, 15, 8),
 datetime.datetime(2024, 1, 7, 8, 7),
 datetime.datetime(2024, 1, 18, 9, 2),
 datetime.datetime(2024, 1, 19, 16, 51),
 datetime.datetime(2024, 1, 17, 15, 51),
 datetime.datetime(2024, 1, 24, 10, 31),
 datetime.datetime(2024, 1, 19, 11, 56),
 datetime.datetime(2024, 1, 9, 9, 30),
 datetime.datetime(2024, 1, 9, 18, 34),
 datetime.datetime(2024, 1, 28, 8, 42)]

In [22]:
durations.shape # durations is a NumPy array; its shape shows how many durations were generated

(1000,)

In [23]:
# Weather label per delivery: 60% Clear, 30% Cloudy, 10% Rainy
weather_conditions = np.random.choice(
    a=['Clear', 'Cloudy', 'Rainy'],
    size=num_deliveries,
    p=[0.6, 0.3, 0.1],
)

# Traffic label per delivery: 50% Light, 30% Moderate, 20% Heavy
traffic_conditions = np.random.choice(
    a=['Light', 'Moderate', 'Heavy'],
    size=num_deliveries,
    p=[0.5, 0.3, 0.2],
)

In [27]:
# Show the first ten weather and traffic labels side by side
weather_conditions[:10], traffic_conditions[:10]

(array(['Clear', 'Clear', 'Cloudy', 'Clear', 'Clear', 'Clear', 'Clear',
        'Clear', 'Clear', 'Rainy'], dtype='<U6'),
 array(['Heavy', 'Light', 'Moderate', 'Light', 'Moderate', 'Light',
        'Light', 'Light', 'Light', 'Moderate'], dtype='<U8'))

In [28]:
# Assemble the per-delivery columns into one table; delivery_id is a
# 1-based running number.
df = pd.DataFrame(dict(
    delivery_id=range(1, num_deliveries + 1),
    datetime=datetimes,
    duration_minutes=durations,
    weather=weather_conditions,
    traffic=traffic_conditions,
    latitude=lats,
    longitude=longs,
))

In [29]:
# Inspect the first rows of the assembled table
df.head()

Unnamed: 0,delivery_id,datetime,duration_minutes,weather,traffic,latitude,longitude
0,1,2024-01-15 15:08:00,34,Clear,Heavy,37.752005,-122.371568
1,2,2024-01-07 08:07:00,25,Clear,Light,37.721948,-122.454479
2,3,2024-01-18 09:02:00,22,Cloudy,Moderate,37.764489,-122.380332
3,4,2024-01-19 16:51:00,8,Clear,Light,37.760484,-122.385498
4,5,2024-01-17 15:51:00,40,Clear,Moderate,37.698757,-122.376894


In [32]:
# Attach the point geometries to the table and tag them with the
# EPSG:4326 coordinate reference system, then preview the first rows.
gdf = gpd.GeoDataFrame(
    data=df,
    geometry=locations,
    crs='EPSG:4326',
)
gdf.head(n=5)

Unnamed: 0,delivery_id,datetime,duration_minutes,weather,traffic,latitude,longitude,geometry
0,1,2024-01-15 15:08:00,34,Clear,Heavy,37.752005,-122.371568,POINT (-122.37157 37.75201)
1,2,2024-01-07 08:07:00,25,Clear,Light,37.721948,-122.454479,POINT (-122.45448 37.72195)
2,3,2024-01-18 09:02:00,22,Cloudy,Moderate,37.764489,-122.380332,POINT (-122.38033 37.76449)
3,4,2024-01-19 16:51:00,8,Clear,Light,37.760484,-122.385498,POINT (-122.38550 37.76048)
4,5,2024-01-17 15:51:00,40,Clear,Moderate,37.698757,-122.376894,POINT (-122.37689 37.69876)


## Define a function to generate delivery data

In [33]:
def generate_delivery_data(num_deliveries=1000, start_date='2024-01-01', num_days=30, seed=None):
    """Generate a synthetic delivery dataset as a GeoDataFrame.

    Every row describes one delivery: a random location inside a square box
    around San Francisco, a random start time, a duration in minutes and
    categorical weather / traffic labels.

    Parameters
    ----------
    num_deliveries : int, default 1000
        Number of rows to generate. Must be non-negative.
    start_date : str, default '2024-01-01'
        First possible delivery date, formatted as ``'YYYY-MM-DD'``.
    num_days : int, default 30
        Size of the date window; deliveries fall on ``start_date`` through
        ``start_date + (num_days - 1) days``. Must be at least 1.
    seed : int or None, default None
        Seed for the random generator. Passing the same seed reproduces the
        same dataset; ``None`` gives a different dataset on every call.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns ``delivery_id``, ``datetime``, ``duration_minutes``,
        ``weather``, ``traffic``, ``latitude``, ``longitude`` and a point
        ``geometry`` column in EPSG:4326.

    Raises
    ------
    ValueError
        If ``num_deliveries`` is negative, ``num_days`` is smaller than 1, or
        ``start_date`` does not match ``'%Y-%m-%d'``.
    """
    if num_deliveries < 0:
        raise ValueError(f"num_deliveries must be non-negative, got {num_deliveries}")
    if num_days < 1:
        raise ValueError(f"num_days must be at least 1, got {num_days}")

    # A single local generator drives every draw, so one seed controls the
    # whole dataset and the global `random` / `np.random` states are untouched.
    rng = np.random.default_rng(seed)

    # Sampling area: a square box of +/- city_radius degrees around the centre
    center_lon, center_lat = -122.4194, 37.7749  # San Francisco coordinates
    city_radius = 0.1  # degrees, roughly 11km

    # Generate random delivery locations
    longs = rng.uniform(center_lon - city_radius, center_lon + city_radius, num_deliveries)
    lats = rng.uniform(center_lat - city_radius, center_lat + city_radius, num_deliveries)

    # Generate delivery times: a day inside the window, an hour from 8 to 20
    # inclusive and a minute from 0 to 59 (the upper bound of `integers` is
    # exclusive). `.tolist()` yields plain Python ints, which `timedelta`
    # requires.
    start_datetime = datetime.strptime(start_date, '%Y-%m-%d')
    day_offsets = rng.integers(0, num_days, size=num_deliveries).tolist()
    hours = rng.integers(8, 21, size=num_deliveries).tolist()
    minutes = rng.integers(0, 60, size=num_deliveries).tolist()
    datetimes = [
        start_datetime + timedelta(days=day, hours=hour, minutes=minute)
        for day, hour, minute in zip(day_offsets, hours, minutes)
    ]

    # Generate delivery durations (in minutes): normal(30, 10) truncated to
    # whole minutes, then clipped to the range 5..120.
    durations = np.clip(rng.normal(30, 10, num_deliveries).astype(int), 5, 120)

    # Create weather conditions
    weather_conditions = rng.choice(['Clear', 'Cloudy', 'Rainy'], size=num_deliveries, p=[0.6, 0.3, 0.1])

    # Create traffic conditions
    traffic_conditions = rng.choice(['Light', 'Moderate', 'Heavy'], size=num_deliveries, p=[0.5, 0.3, 0.2])

    # Create DataFrame
    df = pd.DataFrame({
        'delivery_id': range(1, num_deliveries + 1),
        'datetime': datetimes,
        'duration_minutes': durations,
        'weather': weather_conditions,
        'traffic': traffic_conditions,
        'latitude': lats,
        'longitude': longs
    })

    # Convert to GeoDataFrame; the point geometry is built in one vectorized
    # call from the coordinate columns.
    return gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
        crs='EPSG:4326',
    )

### Create a new local folder named 'data' to save the generated data

In [38]:
import os

# Ensure the 'data' output folder is present; nothing happens when it
# already exists.
os.makedirs(name='data', exist_ok=True)

In [39]:
# Generate the data
delivery_data = generate_delivery_data()

# Save to CSV (you can also save to GeoJSON or other formats if needed).
# The path is defined once so the confirmation message below always names
# the file that was actually written.
output_path = 'data/synthetic_delivery_data_010124.csv'
delivery_data.to_csv(output_path, index=False)

print(f"Synthetic delivery data generated and saved to '{output_path}'")
print(delivery_data.head())

Synthetic delivery data generated and saved to 'synthetic_delivery_data.csv'
   delivery_id            datetime  duration_minutes weather   traffic  \
0            1 2024-01-21 17:19:00                15   Clear  Moderate   
1            2 2024-01-20 13:58:00                22   Clear  Moderate   
2            3 2024-01-30 17:31:00                11   Clear     Light   
3            4 2024-01-11 14:22:00                33  Cloudy     Light   
4            5 2024-01-08 19:27:00                22   Clear     Light   

    latitude   longitude                     geometry  
0  37.798718 -122.516837  POINT (-122.51684 37.79872)  
1  37.682273 -122.386426  POINT (-122.38643 37.68227)  
2  37.677185 -122.491511  POINT (-122.49151 37.67719)  
3  37.845155 -122.500552  POINT (-122.50055 37.84516)  
4  37.717295 -122.513115  POINT (-122.51312 37.71730)  
