# Calculate Deadheading Miles and Emissions

___

In [None]:
import pandas as pd
import numpy as np
import math
from scipy.spatial.distance import cdist, euclidean

______

In [None]:
# Load the two ride extracts and combine them on the shared ride identifier.
rides_a = pd.read_csv('../data/Rides_DataA.csv')
rides_b = pd.read_csv('../data/Rides_DataB.csv')
df_rides = rides_a.merge(rides_b, on='RIDE_ID')

# Parse the start timestamps as timezone-aware (UTC) datetimes, then derive
# the calendar date, the seconds elapsed since midnight and the hour of day.
df_rides["started_on"] = pd.to_datetime(df_rides["started_on"], utc=True)
df_rides["started_on_date"] = pd.to_datetime(df_rides["started_on"].dt.date)
df_rides["started_on_time"] = (
    df_rides["started_on"].dt.hour * 3600
    + df_rides["started_on"].dt.minute * 60
    + df_rides["started_on"].dt.second
)
df_rides["started_on_hour"] = df_rides["started_on"].dt.hour

In [None]:
def geometric_median(X, eps=1e-5):
    """Estimate the geometric median of a set of points.

    Starts from the coordinate-wise mean and repeatedly re-weights the points
    by the inverse of their distance to the current estimate (a
    Weiszfeld-style iteration). When the estimate coincides with one or more
    input points, those points are left out of the weighting and a blended
    correction step is taken instead.

    Parameters
    ----------
    X : array-like of shape (n_points, n_dims)
        One point per row. Must support boolean row indexing (``X[mask]``),
        e.g. a NumPy array or a DataFrame.
    eps : float, optional
        Iteration stops once two successive estimates are closer than this
        Euclidean distance.

    Returns
    -------
    The estimated median, in whatever container ``np.mean(X, 0)`` /
    ``np.sum(..., 0)`` produce for the given ``X`` (a 1-D array for array
    input).

    Raises
    ------
    ValueError
        If ``X`` contains NaN or infinite coordinates. Such values cannot
        yield a meaningful median and can keep the convergence test from
        ever succeeding.
    """
    # Validate up front: with NaN in the iterate, `euclidean(y, y1) < eps`
    # is never true and the loop below would not terminate.
    if not np.isfinite(np.asarray(X, dtype=float)).all():
        raise ValueError("geometric_median requires finite coordinates")

    y = np.mean(X, 0)

    while True:
        # Distance from every point to the current estimate, shape (n, 1).
        D = cdist(X, [y])
        # Points lying exactly on the estimate would give a division by zero.
        nonzeros = (D != 0)[:, 0]

        Dinv = 1 / D[nonzeros]
        Dinvs = np.sum(Dinv)
        W = Dinv / Dinvs
        # Inverse-distance weighted average of the non-coincident points.
        T = np.sum(W * X[nonzeros], 0)

        num_zeros = len(X) - np.sum(nonzeros)
        if num_zeros == 0:
            y1 = T
        elif num_zeros == len(X):
            # Every point coincides with the estimate; nothing left to refine.
            return y
        else:
            # The estimate sits on some input points: blend the weighted
            # average with the current estimate instead of jumping to it.
            R = (T - y) * Dinvs
            r = np.linalg.norm(R)
            rinv = 0 if r == 0 else num_zeros/r
            y1 = max(0, 1-rinv)*T + min(1, rinv)*y

        if euclidean(y, y1) < eps:
            return y1

        y = y1

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2, earth_radius=6371):
    """Return the great-circle (haversine) distance between two points.

    Coordinates are given in decimal degrees. Each argument may be a scalar
    or an array-like; the arguments are broadcast against each other, so a
    single point can be compared against a whole column of points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    earth_radius : float, optional
        Sphere radius; the result is expressed in the same unit (the default
        6371 corresponds to kilometres).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) along the sphere. NaN inputs yield NaN outputs.
    """
    # Convert each argument on its own (rather than stacking all four into a
    # single array) so that scalars and arrays of different shapes broadcast.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )
    a = (np.sin((lat2 - lat1) / 2.0) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    # Rounding can push `a` marginally outside [0, 1] for near-antipodal
    # points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

In [None]:
def deadheading_distance(arr, distance_factor=1.4):
    """Total deadheading and in-ride distance for one group of rides.

    For every ride after the first row, the deadheading estimate combines:

    * ``gap``: the haversine distance from the previous row's drop-off to
      this row's pick-up, scaled by ``distance_factor``;
    * ``to_rider``: ``driving_distance_to_rider / 1000`` scaled by
      ``distance_factor`` (presumably metres converted to kilometres --
      TODO confirm), falling back to ``gap`` when that value is missing.

    The per-ride deadheading is ``to_rider + sqrt(gap**2 + to_rider**2)``.
    The first row has no previous drop-off, so its value is NaN and is left
    out of the total.

    NOTE(review): "previous ride" means the previous *row*; this assumes the
    rows of ``arr`` are already in chronological order -- confirm upstream.

    Parameters
    ----------
    arr : pandas.DataFrame
        Rides with columns ``start_location_lat``, ``start_location_long``,
        ``end_location_lat``, ``end_location_long`` and
        ``driving_distance_to_rider``. The frame is not modified.
    distance_factor : float, optional
        Distance adjustment factor applied to every straight-line distance.

    Returns
    -------
    list
        ``[total_deadheading, total_ride_distance]``.
    """
    # Previous row's drop-off location; NaN for the first row of the group.
    prev_end_lat = arr["end_location_lat"].shift()
    prev_end_long = arr["end_location_long"].shift()

    gap = np.asarray(
        haversine_distance(prev_end_lat, prev_end_long,
                           arr["start_location_lat"], arr["start_location_long"]),
        dtype=float,
    ) * distance_factor

    to_rider = (arr["driving_distance_to_rider"].to_numpy(dtype=float)
                / 1000 * distance_factor)
    # Fall back to the straight-line gap when no driving distance was recorded.
    to_rider = np.where(np.isnan(to_rider), gap, to_rider)
    from_previous_ride = np.sqrt(gap ** 2 + to_rider ** 2)

    ride_distance = np.asarray(
        haversine_distance(arr["start_location_lat"], arr["start_location_long"],
                           arr["end_location_lat"], arr["end_location_long"]),
        dtype=float,
    ) * distance_factor

    # nansum skips rides whose distance could not be computed (e.g. the first
    # ride of the group), matching a NaN-skipping column sum.
    return [np.nansum(to_rider + from_previous_ride), np.nansum(ride_distance)]

In [None]:
# Run the deadheading computation once per driver and calendar date, then
# turn the group keys back into ordinary columns.
df_rides_dh = (
    df_rides
    .groupby(["driver_id", "started_on_date"])
    .apply(deadheading_distance)
    .reset_index()
)

In [None]:
# Column 0 holds the two-element result of each group; spread it into two
# named columns and drop the original list column.
ser = pd.DataFrame(df_rides_dh[0].tolist())
ser.columns = ["ride_deadheading", "ride_mileage"]
df_rides_dh = df_rides_dh.join(ser).drop(columns=0)

In [None]:
# Write the deadheading total of every driver/date pair to disk.
df_rides_dh.loc[:, ["driver_id", "started_on_date", "ride_deadheading"]].to_csv(
    "actual_per_ride_deadheading.csv", index=False
)

In [None]:
# For every driver and date, find the row labels of the rides with the
# earliest and the latest start time.
df_rides_grouped = (
    df_rides
    .groupby(["driver_id", "started_on_date"])["started_on_time"]
    .agg(["idxmin", "idxmax"])
)

# Pick-up location of each day's first ride / drop-off of its last ride.
starting_trips = df_rides.loc[
    df_rides_grouped["idxmin"],
    ["driver_id", "started_on_date", "start_location_lat", "start_location_long"],
]
ending_trips = df_rides.loc[
    df_rides_grouped["idxmax"],
    ["driver_id", "started_on_date", "end_location_lat", "end_location_long"],
]

# Each driver's residence is estimated as the geometric median of the
# pick-up locations of their first rides of the day.
df_residences = (
    starting_trips
    .groupby("driver_id")[["start_location_lat", "start_location_long"]]
    .apply(geometric_median)
)
df_residences.columns = ["residence_lat", "residence_long"]

In [None]:
# One row per driver and date: first pick-up, last drop-off and the driver's
# estimated residence.
df_commute = starting_trips.merge(
    ending_trips, on=["driver_id", "started_on_date"], suffixes=["_start", "_end"]
)
df_commute = df_commute.merge(df_residences, on="driver_id")

# Residence -> first pick-up, scaled by the 1.4 distance adjustment factor.
df_commute["commute_distance_start"] = haversine_distance(
    df_commute["start_location_lat"],
    df_commute["start_location_long"],
    df_commute["residence_lat"],
    df_commute["residence_long"],
) * 1.4

# Last drop-off -> residence, scaled by the same factor.
df_commute["commute_distance_end"] = haversine_distance(
    df_commute["end_location_lat"],
    df_commute["end_location_long"],
    df_commute["residence_lat"],
    df_commute["residence_long"],
) * 1.4

df_commute = df_commute.reset_index()

In [None]:
# Attach the between-ride deadheading totals to the commute distances of the
# same driver and date.
df_deadheading = df_commute.merge(df_rides_dh, on=["driver_id", "started_on_date"])

In [None]:
# Build a per-driver vehicle table with a CO2 emission rate for each vehicle.
emissions_df = pd.read_csv("../data/epa_emissions_database.csv")
rides_b = pd.read_csv('../data/Rides_DataB.csv')

# The vehicle fields look like stringified bytes literals (b'...'). Remove
# the leading `b` only when it is followed by a quote, then strip the quotes.
# A plain .str.strip("b'\"") treats its argument as a character set and would
# also eat genuine leading/trailing "b" letters (e.g. "Saab" -> "Saa"),
# making those vehicles miss the EPA lookup below.
for col in ("make", "model", "year"):
    rides_b[col] = (
        rides_b[col]
        .str.replace(r"^b(?=['\"])", "", regex=True)
        .str.strip("'\"")
    )
rides_b["year"] = pd.to_numeric(rides_b["year"])

emissions_df = emissions_df.rename(columns={"Make": "make", "Model": "model", "Year": "year",
                                            "Tailpipe CO2 in Grams/Mile (FT1)": "emissions"})
# Several database rows can share a make/model/year; use their median rate.
emissions_df = emissions_df.groupby(["make", "model", "year"])["emissions"].median()
emissions_df = pd.DataFrame(emissions_df)
# Keep only non-negative rates.
emissions_df = emissions_df[emissions_df["emissions"] >= 0]

# Left join so that vehicles without a database match are kept (with NaN).
emissions_df_merged = pd.merge(rides_b, emissions_df, on=["make", "model", "year"], how = "left")
emissions_df_merged = emissions_df_merged[["driver_id", "make", "model", "year", "emissions"]]
# NOTE(review): a driver appearing with more than one distinct vehicle keeps
# one row per vehicle here, which would duplicate that driver's rows in any
# later merge on driver_id -- confirm whether that can occur in the data.
emissions_df_merged = emissions_df_merged.drop_duplicates()
# Vehicles with no match fall back to the median rate of the matched ones.
emissions_df_merged["emissions"] = emissions_df_merged["emissions"].fillna(emissions_df_merged["emissions"].median())

In [None]:
# Attach each driver's vehicle emission rate to their daily distances.
df_dh_emissions = df_deadheading.merge(emissions_df_merged, on=["driver_id"])

# Deadheading = commute to the first pick-up + commute home after the last
# drop-off + distance driven between rides.
df_dh_emissions["deadheading"] = (
    df_dh_emissions["commute_distance_start"]
    + df_dh_emissions["commute_distance_end"]
    + df_dh_emissions["ride_deadheading"]
)
# Total distance adds the distance driven with a rider on board.
df_dh_emissions["total_mileage"] = (
    df_dh_emissions["deadheading"] + df_dh_emissions["ride_mileage"]
)

In [None]:
# Outlier adjustment: keep rows between the 2.7th and 97.3rd percentile of
# between-ride deadheading, then apply the same cut to total deadheading.
# The second pair of quantiles is computed on the already-trimmed rows.
df_dh_emissions = (
    df_dh_emissions
    .loc[lambda d: d["ride_deadheading"].between(
        d["ride_deadheading"].quantile(.027),
        d["ride_deadheading"].quantile(.973))]
    .loc[lambda d: d["deadheading"].between(
        d["deadheading"].quantile(.027),
        d["deadheading"].quantile(.973))]
)

In [None]:
# Fleet-wide distance totals over the trimmed driver/date rows.
deadheading_mileage = df_dh_emissions.loc[:, "deadheading"].sum()
ride_deadheading_mileage = df_dh_emissions.loc[:, "ride_deadheading"].sum()
total_mileage = df_dh_emissions.loc[:, "total_mileage"].sum()

In [None]:
# Share of total distance that is deadheading (a fraction, not scaled to 100).
percentage_deadheading = deadheading_mileage / total_mileage

In [None]:
# Share of total distance that is between-ride deadheading (commutes excluded).
ride_percentage_deadheading = ride_deadheading_mileage / total_mileage

In [None]:
# Emissions = distance x per-vehicle emission rate x 0.621371.
# NOTE(review): 0.621371 is presumably the kilometres-to-miles factor, needed
# because the emission rate comes from a grams-per-mile column -- confirm.
deadheading_emissions = (
    df_dh_emissions["deadheading"]
    .mul(df_dh_emissions["emissions"])
    .mul(0.621371)
    .sum()
)
ride_deadheading_emissions = (
    df_dh_emissions["ride_deadheading"]
    .mul(df_dh_emissions["emissions"])
    .mul(0.621371)
    .sum()
)
total_emissions = (
    df_dh_emissions["total_mileage"]
    .mul(df_dh_emissions["emissions"])
    .mul(0.621371)
    .sum()
)

In [None]:
# Share of total emissions caused by deadheading. This reuses (and therefore
# overwrites) the name that earlier held the distance-based share.
percentage_deadheading = deadheading_emissions / total_emissions

In [None]:
# Share of total emissions caused by between-ride deadheading. This reuses
# (and therefore overwrites) the name that earlier held the distance share.
ride_percentage_deadheading = ride_deadheading_emissions / total_emissions

___

Note: These numbers differ by 1-2% from those presented in the dataset because of an adjustment for outliers. However, this marginal difference does not affect the conclusions or subsequent calculations in any way. 

___