# Data Preprocessing Tools

### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from geopy.distance import geodesic

### Importing the dataset

In [2]:
df = pd.read_csv("input.csv")  # Replace with your actual CSV file path

### Data Preprocessing & Cleaning

In [3]:
# Remove spaces from all string values in the DataFrame
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [4]:
#Convert Date & Time Fields
df["Order_Date"] = pd.to_datetime(df["Order_Date"], format='%d-%m-%Y', errors='coerce')
df["Time_Orderd"] = pd.to_datetime(df["Time_Orderd"], format="%H:%M:%S", errors='coerce')
df["Time_Order_picked"] = pd.to_datetime(df["Time_Order_picked"], format="%H:%M:%S", errors='coerce')


In [5]:
# Clean the Weatherconditions column
df['Weatherconditions'] = df['Weatherconditions'].str.replace('conditions ', '').str.strip()

# Drop NaN values
df = df.dropna(subset=['Weatherconditions'])

### Calculate Order-to-Pickup Time (Pickup Delay)

In [6]:
df["Pickup_Delay"] = (df["Time_Order_picked"] - df["Time_Orderd"]).dt.total_seconds() / 60  # Convert to minutes


### Convert Categorical Data to Numerical

In [7]:
#def haversine_distance(Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude):
 #   return geodesic((Restaurant_latitude, Restaurant_longitude), (Delivery_location_latitude, Delivery_location_longitude)).km
# Function to Calculate Distance
def haversine_distance(row):
    start = (row["Restaurant_latitude"], row["Restaurant_longitude"])
    end = (row["Delivery_location_latitude"], row["Delivery_location_longitude"])
    return geodesic(start, end).km

# Apply Function to DataFrame
df["Distance_km"] = df.apply(haversine_distance, axis=1)

# Display Results
print(df.head())

       ID Delivery_person_ID Delivery_person_Age Delivery_person_Ratings  \
0  0x4607     INDORES13DEL02                  37                     4.9   
1  0xb379     BANGRES18DEL02                  34                     4.5   
2  0x5d6d     BANGRES19DEL01                  23                     4.4   
3  0x7a6a    COIMBRES13DEL02                  38                     4.7   
4  0x70a2     CHENRES12DEL01                  32                     4.6   

   Restaurant_latitude  Restaurant_longitude  Delivery_location_latitude  \
0            22.745049             75.892471                   22.765049   
1            12.913041             77.683237                   13.043041   
2            12.914264             77.678400                   12.924264   
3            11.003669             76.976494                   11.053669   
4            12.972793             80.249982                   13.012793   

   Delivery_location_longitude Order_Date         Time_Orderd  ...  \
0               

In [8]:
df[['Road_traffic_density', 'Type_of_vehicle','Festival', 'Weatherconditions','Pickup_Delay']]

Unnamed: 0,Road_traffic_density,Type_of_vehicle,Festival,Weatherconditions,Pickup_Delay
0,High,motorcycle,No,Sunny,11.933333
1,Jam,scooter,No,Stormy,6.200000
2,Low,motorcycle,No,Sandstorms,15.816667
3,Medium,motorcycle,No,Sunny,8.900000
4,High,scooter,No,Cloudy,11.333333
...,...,...,...,...,...
20970,Jam,scooter,No,Sunny,3.166667
20971,Low,motorcycle,No,Fog,6.333333
20972,High,motorcycle,No,Cloudy,7.450000
20973,Low,motorcycle,No,Stormy,-1427.883333


In [9]:
traffic_mapping = {'Low': 0, 'Medium': 1, 'High': 2, 'Jam': 3}
df['Road_traffic_density_encoded'] = df['Road_traffic_density'].map(traffic_mapping)


### Convert Categorical Data to Numerical

In [10]:
label_encoders = {}
categorical_columns = ["Festival", "City"]
    
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Store encoders for future use


In [11]:
df[['Road_traffic_density', 'Festival', 'Weatherconditions','Pickup_Delay']]

Unnamed: 0,Road_traffic_density,Festival,Weatherconditions,Pickup_Delay
0,High,1,Sunny,11.933333
1,Jam,1,Stormy,6.200000
2,Low,1,Sandstorms,15.816667
3,Medium,1,Sunny,8.900000
4,High,1,Cloudy,11.333333
...,...,...,...,...
20970,Jam,1,Sunny,3.166667
20971,Low,1,Fog,6.333333
20972,High,1,Cloudy,7.450000
20973,Low,1,Stormy,-1427.883333


### Handle Missing Data & Outliers

In [12]:
df.fillna(df.median(numeric_only=True), inplace=True)  # Fill missing numerical values with median
df.fillna("Unknown", inplace=True)  # Fill missing categorical values with "Unknown"


In [13]:
df[['Pickup_Delay']]

Unnamed: 0,Pickup_Delay
0,11.933333
1,6.200000
2,15.816667
3,8.900000
4,11.333333
...,...
20970,3.166667
20971,6.333333
20972,7.450000
20973,-1427.883333


In [14]:
#from scipy import stats
#df = df[(np.abs(stats.zscore(df.select_dtypes(include=[np.number]))) < 3).all(axis=1)]  # Remove outliers beyond 3 std deviations
from scipy import stats
# List of columns you want to check for outliers
selected_columns = ['Time_taken(min)', 'Distance_km']

# Remove rows where any selected column has a Z-score > 3 or < -3
df = df[(np.abs(stats.zscore(df[selected_columns])) < 3).all(axis=1)]


### Normalize & Scale Features

In [15]:
scaler = StandardScaler()
numeric_features = [ "Distance_km"]

df[numeric_features] = scaler.fit_transform(df[numeric_features])


In [16]:
print(df.head())

       ID Delivery_person_ID Delivery_person_Age Delivery_person_Ratings  \
0  0x4607     INDORES13DEL02                  37                     4.9   
1  0xb379     BANGRES18DEL02                  34                     4.5   
2  0x5d6d     BANGRES19DEL01                  23                     4.4   
3  0x7a6a    COIMBRES13DEL02                  38                     4.7   
4  0x70a2     CHENRES12DEL01                  32                     4.6   

   Restaurant_latitude  Restaurant_longitude  Delivery_location_latitude  \
0            22.745049             75.892471                   22.765049   
1            12.913041             77.683237                   13.043041   
2            12.914264             77.678400                   12.924264   
3            11.003669             76.976494                   11.053669   
4            12.972793             80.249982                   13.012793   

   Delivery_location_longitude Order_Date          Time_Orderd  ...  \
0              

In [17]:
df.to_csv("data.csv", index=False)