# Convert raw inputs to standardized dataframes

This notebook converts the raw input data into the standard format used by carpoolsim:
1. Traffic network links
2. Traffic network nodes
3. Traffic TAZs (polygons)
4. Traffic demands 

In [1]:
import time
import copy
import os
import sys

import numpy as np
import pandas as pd
import geopandas as gpd

# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
pd.options.display.max_columns = None  # display all columns

In [2]:
# Update this path every time the project root folder is moved.
os.environ['project_root'] = '/Users/geekduck/Dropbox/gt_survey'

# Make the project root importable. The membership check keeps sys.path free
# of duplicate entries when this cell is re-run in the same kernel.
if os.environ['project_root'] not in sys.path:
    sys.path.append(os.environ['project_root'])

In [3]:
from carpoolsim.basic_settings import *

In [46]:
# Load the raw traffic-network layers (TAZ polygons, nodes, links). Each file
# path is read from the environment variable named in the tuple below.
tazs_raw, df_nodes_raw, df_links_raw = (
    gpd.read_file(os.environ[env_key])
    for env_key in ("taz", "network_nodes", "network_links")
)

In [49]:
from carpoolsim.dataclass.traffic_network import (
    TrafficNetworkLink,
    TrafficNetworkNode,
    TrafficAnalysisZone,
)

In [50]:
# Standardize the node layer: rename the raw columns to the carpoolsim field
# names, then wrap every row in a TrafficNetworkNode record.
node_name_map = {
    "N": "nid",
    "lat": "lat",
    "lon": "lon",
    "X": "x",
    "Y": "y",
    "geometry": "geometry"
}
df_nodes_raw = df_nodes_raw.rename(columns=node_name_map)

# Positional argument order: nid, lon, lat, x, y, geometry.
df_nodes_lst = [
    TrafficNetworkNode(
        node["nid"], node["lon"], node["lat"],
        node["x"], node["y"], node["geometry"],
    )
    for _, node in df_nodes_raw.iterrows()
]

In [51]:
# Standardize the link layer: rename the raw columns, cast both endpoint ids
# to strings, and derive the composite "a_b" key before building the records.
link_name_map = {
    "A": "a",
    "B": "b",
    "NAME": "name",
    "DISTANCE": "distance",
    "FACTYPE": "factype",
    "geometry": "geometry",
    "SPEED_LIMI": "speed_limit"
}
df_links_raw = (
    df_links_raw
    .rename(columns=link_name_map)
    .astype({"a": str, "b": str})
)
# "<a>_<b>" joins the two endpoint ids of each link.
df_links_raw["a_b"] = df_links_raw["a"].str.cat(df_links_raw["b"], sep="_")

# Positional argument order: a, b, a_b, name, distance, factype, speed_limit, geometry.
df_links_lst = [
    TrafficNetworkLink(
        link["a"], link["b"], link["a_b"], link["name"],
        link["distance"], link["factype"], link["speed_limit"],
        link["geometry"],
    )
    for _, link in df_links_raw.iterrows()
]

In [52]:
# Standardize the TAZ layer: OBJECTID becomes the zone id and COUNTY the
# group id, then every row is wrapped in a TrafficAnalysisZone record.
taz_name_map = {
    "OBJECTID": "taz_id",
    "COUNTY": "group_id",
    "geometry": "geometry",
}
tazs_raw = tazs_raw.rename(columns=taz_name_map)

tazs_lst = [
    TrafficAnalysisZone(zone["taz_id"], zone["group_id"], zone["geometry"])
    for _, zone in tazs_raw.iterrows()
]

In [53]:
# Turn the record lists back into GeoDataFrames. Nodes and TAZs are tagged as
# EPSG:4326; links are tagged with CRS, a name defined outside this cell
# (presumably by the carpoolsim.basic_settings star import — verify).
df_nodes = gpd.GeoDataFrame(df_nodes_lst, crs="EPSG:4326")
df_links = gpd.GeoDataFrame(df_links_lst, crs=CRS)
tazs = gpd.GeoDataFrame(tazs_lst, crs="EPSG:4326")

In [54]:
# Reproject the links to EPSG:4326.
df_links = df_links.to_crs("EPSG:4326")

In [55]:
# Store cleaned results to shapefiles under <data_inputs>/cleaned.
# The output folder is created first so the writes do not fail when it is
# missing (for example on a fresh checkout).
cleaned_dir = os.path.join(os.environ['data_inputs'], "cleaned")
os.makedirs(cleaned_dir, exist_ok=True)

tazs.to_file(os.path.join(cleaned_dir, "tazs.shp"))
df_nodes.to_file(os.path.join(cleaned_dir, "nodes.shp"))
df_links.to_file(os.path.join(cleaned_dir, "links.shp"))

  df_links.to_file(


### Prepare park-and-ride (PNR) stations

In [20]:
from carpoolsim.dataclass.parking_lots import (
    ParkAndRideStation
)
from carpoolsim.dataclass.travel_demands import (
    TripDemand
)

In [21]:
# Read the raw parking-lot layer; its path comes from the 'parking_lots' environment variable.
pnr_lots = gpd.read_file(os.environ['parking_lots'])

In [22]:
# Preview the first two raw records to check the available columns.
pnr_lots.head(2)

Unnamed: 0,OBJECTID,NAME,CITY,SPACES,GlobalID,StreetNumb,PostalCode,StreetName,Publish,geometry
0,76,MARTA - INDIAN CREEK STATION,Stone Mountain,2350,{46FF823F-E6E7-4597-B5B7-3BEC503B47A8},3901,30083,Durham Park Road,Yes,POINT (-84.22903 33.76839)
1,78,Hewatt Road park & ride,Snellville,125,{CCDA4DB1-A240-4013-A8F9-23741EAD99FF},2191,30039,Hewatt Rd,Yes,POINT (-84.05962 33.83902)


In [23]:
# Rename the raw parking-lot columns to the station field names used below.
pnr_name_map = {
    "OBJECTID": "station_id",
    "NAME": "name",
    "SPACES": "capacity",
    "geometry": "geometry"
}
pnr_lots = pnr_lots.rename(columns=pnr_name_map)

# Expose the point coordinates as plain columns (x -> lon, y -> lat).
pnr_lots['lon'], pnr_lots['lat'] = pnr_lots.geometry.x, pnr_lots.geometry.y

In [24]:
# Wrap every parking lot in a ParkAndRideStation record.
# Positional argument order: station_id, name, lon, lat, capacity, geometry.
pnr_lst = [
    ParkAndRideStation(
        lot["station_id"], lot["name"],
        lot["lon"], lot["lat"], lot["capacity"],
        lot["geometry"],
    )
    for _, lot in pnr_lots.iterrows()
]

In [25]:
# Build the station GeoDataFrame and tag it as EPSG:4326 at construction,
# the same way the node and TAZ frames are built, so the frame carries the
# CRS it is later written with.
pnrs = gpd.GeoDataFrame(pnr_lst, crs="EPSG:4326")

In [27]:
# Write the cleaned park-and-ride stations as EPSG:4326.
# The CRS is set on the frame instead of being passed to to_file(): the
# `crs=` keyword of to_file is specific to the fiona engine and is not
# accepted by the pyogrio engine. allow_override=True keeps the previous
# effect of forcing the written CRS to EPSG:4326.
pnrs.set_crs("EPSG:4326", allow_override=True).to_file(
    os.path.join(os.environ['data_inputs'], "cleaned", "pnrs.shp")
)

## Prepare traffic demands
Each trip record needs the following information:
- spatial information
- temporal information

In [28]:
# Read the survey responses; the first CSV column is used as the row index.
gt_survey = pd.read_csv(os.environ['trip_demands'], index_col=0)

In [29]:
# Copy the row index into a regular column so it is available as the trip id.
gt_survey["trip_id"] = gt_survey.index

In [30]:
# a special input: raw questionnaire columns (Q2, Q10, ZIPCODE, ...);
# show two random respondents to inspect the layout
gt_survey.sample(2)

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration..in.seconds.,Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,Q66,Q73,Q73_2_TEXT,Q73_5_TEXT,Q2,Q68,Q67,Q3,Q3_6_TEXT,Q5,Q45,ZIPCODE,Q56_1,Q56_2,Q56_3,Q56_4,Q56_5,Q10,Q11,Q12_1,Q12_2,Q12_3,Q12_4,Q12_5,Q12_6,Q12_7,Q12_8,Q12_9,Q12_10,Q12_11,Q12_12,Q12_13,Q12_18,Q12_14,Q12_15,Q12_16,Q12_17,Q12_17_TEXT,Q13_1,Q13_2,Q13_3,Q13_1_TEXT,Q13_2_TEXT,Q13_3_TEXT,Q14_1,Q14_2,Q14_3,Q14_1_TEXT,Q14_2_TEXT,Q14_3_TEXT,Q15,Q16,Q16_3_TEXT,Q69,Q69_6_TEXT,Q17,Q18,Q19,Q55,Q20_1,Q20_2,Q20_3,Q20_4,Q20_5,Q20_6,Q20_7,Q20_8,Q20_9,Q20_10,Q20_11,Q20_13,Q20_14,Q20_16,Q20_17,Q20_18,Q20_19,Q20_20,Q20_21,Q20_22,Q20_22_TEXT,Q25,Q25_14_TEXT,Q64,Q26,Q26_16_TEXT,Q27,Q28,Q29,Q30_1,Q30_2,Q30_3,Q30_4,Q30_5,Q30_6,Q30_7,Q30_8,Q30_12,Q30_10,Q30_11,Q32_1,Q32_2,Q32_3,Q32_4,Q32_5,Q34_1,Q34_2,Q34_3,Q34_4,Q34_5,Q34_6,Q34_7,Q34_8,Q34_9,Q34_10,Q34_11,Q34_12,Q34_13,Q34_14,Q34_15,Q34_16,Q47,Q59,Q51,Q49,Q49_5_TEXT,Q48,Q50,Q61,Q62,Q65,Q35,Q38,trip_id
148,10/31/2022 15:46,10/31/2022 15:57,IP Address,100,683,True,10/31/2022 15:57,R_3RjQUGjHCx7r4bP,email,EN,No,,,,I live off campus and work or attend class at ...,,,Staff member,,20-39 minutes,11 to 20 miles,30364.0,MARTA bus,MARTA rail,MARTA rail,MARTA rail,MARTA rail,12:00pm,6:30pm,,,,,,,,,,,,,,,,,,,,,,,,,,Route #,Route #,Route #,82.0,82.0,82.0,College Park,Midtown station,,"Rideshare (Uber/Lyft, etc.)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","Yes, and I have used this program","Yes, and I have used this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","Yes, and I have used this program","No, I am not aware of this program","No, I am not aware of this program","Yes, and I have used this program","Yes, and I have used this program",,No,,,,,,,,,,Yes,148
1637,11/9/2022 15:14,11/9/2022 15:15,IP Address,100,50,True,11/9/2022 15:15,R_1JCHicBxPeZw8Gr,email,EN,No,,,,I live on campus in campus housing (residence ...,,Eighth Street,Undergraduate student,,,,30332.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Yes,1637


In [31]:
# Filter out rows without a ZIP code, then store ZIP codes as integers.
# The cast is chained onto the filtered frame (astype returns a new frame)
# instead of assigning a column on a slice, which avoids pandas'
# SettingWithCopyWarning.
gt_survey = (
    gt_survey[gt_survey['ZIPCODE'].notna()]
    .astype({"ZIPCODE": int})
)

In [32]:
# List the distinct answers given to question Q2.
gt_survey.Q2.unique()

array(['I live off campus and work or attend class at the main Georgia Tech campus in Midtown Atlanta',
       'I live on campus in campus housing (residence halls)',
       'I live on campus in Greek housing'], dtype=object)

In [33]:
# Q10: home to work time
# Q11: work to home time
# List the distinct Q10 answers to see which time formats must be parsed.
gt_survey.Q10.unique()

array(['7:30am', '8:00am', '8:30am', '11:00am', '9:00am', '10:00am',
       '7:00am', '9:30am', '8:00pm', '9:00pm', '9:30pm', '7:00pm',
       '11:30am', '6:00am', nan, '2:00pm', '6:30am', '12:00pm', '10:30am',
       '12:30pm', '12:00am - 5:30am', '3:30pm', '6:30pm', '1:00pm',
       '1:30pm', '4:00pm', '7:30pm', '5:00pm', '3:00pm', '10:30pm',
       '8:30pm', '2:30pm'], dtype=object)

In [34]:
# Keep only respondents who reported a departure time (Q10). The mask is
# computed once and reused, and .copy() makes the result an independent frame
# so that later column assignments on gt_survey do not trigger
# SettingWithCopyWarning.
filt = gt_survey.Q10.notna()
gt_survey = gt_survey[filt].copy()
print(gt_survey.shape)

(1545, 143)


In [35]:
# The "12:00am - 5:30am" answer is a time range, not a clock time; map it
# to 5:30am so every Q10 value is a single parseable time.
gt_survey['Q10'] = gt_survey['Q10'].str.replace(
    "12:00am - 5:30am", "5:30am", regex=False
)

In [36]:
# Prefix each clock string with the pseudo date 2023-01-01 so it parses
# as a full timestamp (12-hour clock with am/pm marker).
depart_time = "2023-01-01 " + gt_survey["Q10"]

gt_survey["depart_time"] = pd.to_datetime(depart_time, format="%Y-%m-%d %I:%M%p")

In [37]:
def extract_newmins(df_row):
    """Return the row's departure time as minutes since 2023-01-01 00:00.

    ``df_row["depart_time"]`` must be a timestamp; the pseudo-date midnight is
    subtracted from it and the difference is converted from seconds to minutes.
    """
    midnight = pd.to_datetime("2023-01-01")
    elapsed = df_row["depart_time"] - midnight
    return elapsed.total_seconds() / 60

# Departure time as minutes after midnight of the pseudo date. Computed
# column-wise in one step: the earlier intermediate timedelta assignment was
# immediately overwritten, and the row-by-row apply produced the same floats.
gt_survey['newmin'] = (
    gt_survey['depart_time'] - pd.to_datetime("2023-01-01")
).dt.total_seconds() / 60
gt_survey.sample(2)

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration..in.seconds.,Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,Q66,Q73,Q73_2_TEXT,Q73_5_TEXT,Q2,Q68,Q67,Q3,Q3_6_TEXT,Q5,Q45,ZIPCODE,Q56_1,Q56_2,Q56_3,Q56_4,Q56_5,Q10,Q11,Q12_1,Q12_2,Q12_3,Q12_4,Q12_5,Q12_6,Q12_7,Q12_8,Q12_9,Q12_10,Q12_11,Q12_12,Q12_13,Q12_18,Q12_14,Q12_15,Q12_16,Q12_17,Q12_17_TEXT,Q13_1,Q13_2,Q13_3,Q13_1_TEXT,Q13_2_TEXT,Q13_3_TEXT,Q14_1,Q14_2,Q14_3,Q14_1_TEXT,Q14_2_TEXT,Q14_3_TEXT,Q15,Q16,Q16_3_TEXT,Q69,Q69_6_TEXT,Q17,Q18,Q19,Q55,Q20_1,Q20_2,Q20_3,Q20_4,Q20_5,Q20_6,Q20_7,Q20_8,Q20_9,Q20_10,Q20_11,Q20_13,Q20_14,Q20_16,Q20_17,Q20_18,Q20_19,Q20_20,Q20_21,Q20_22,Q20_22_TEXT,Q25,Q25_14_TEXT,Q64,Q26,Q26_16_TEXT,Q27,Q28,Q29,Q30_1,Q30_2,Q30_3,Q30_4,Q30_5,Q30_6,Q30_7,Q30_8,Q30_12,Q30_10,Q30_11,Q32_1,Q32_2,Q32_3,Q32_4,Q32_5,Q34_1,Q34_2,Q34_3,Q34_4,Q34_5,Q34_6,Q34_7,Q34_8,Q34_9,Q34_10,Q34_11,Q34_12,Q34_13,Q34_14,Q34_15,Q34_16,Q47,Q59,Q51,Q49,Q49_5_TEXT,Q48,Q50,Q61,Q62,Q65,Q35,Q38,trip_id,depart_time,newmin
1677,11/9/2022 15:38,11/9/2022 15:42,IP Address,100,248,True,11/9/2022 15:42,R_2fjsVcdnbAkJOP6,email,EN,No,,,,I live off campus and work or attend class at ...,,,Graduate/postdoc student,,10-19 minutes,fewer than 4 miles,30318,"Drive alone (car, truck, motorcycle, moped)",Walk,"Drive alone (car, truck, motorcycle, moped)",Walk,"Drive alone (car, truck, motorcycle, moped)",10:00am,5:00pm,Don't have anyone to ride/carpool with,,,Enjoy the ride/prefer to drive,,,,,,,,,,,,Safety concerns,,,,,,,,,,,,,,,,,,,,,No,,,,,,,,,,,,,"Increased financial incentives (e.g., transit ...",,Need to save money,,,,,,,,,,Walk,,,,,,,,,,,,,,,,,,,,,,,,"No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program",less than $50,No,,,,,,,,,,Yes,1677,2023-01-01 10:00:00,600.0
1807,11/3/2022 9:18,11/3/2022 9:19,IP Address,67,87,False,11/10/2022 8:18,R_b2fN9w6gRWhK9fr,email,EN,No,,,,I live off campus and work or attend class at ...,,,Graduate/postdoc student,,20-39 minutes,4 to 10 miles,30326,Other,,,,,9:30am,6:30pm,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1807,2023-01-01 09:30:00,570.0


In [38]:
# package for querying zip code
import pgeocode

# Locator for the "us" country code; reused by the per-row lookup later on.
locator = pgeocode.Nominatim("us")
# Smoke test: look up one ZIP code and print its (latitude, longitude).
location = locator.query_postal_code(30310)
print((location.latitude, location.longitude))

(33.7278, -84.4232)


In [39]:
# extract longitude/latitude for all positions!
def extract_lon_lat(df_row):
    """Look up the row's ZIPCODE and return its coordinates.

    Returns a Series with the entries "longitude" and "latitude", in that
    order, read from the result of ``locator.query_postal_code``.
    """
    match = locator.query_postal_code(df_row["ZIPCODE"])
    coords = {"longitude": match.longitude, "latitude": match.latitude}
    return pd.Series(coords)

# NOTE(review): extract_lon_lat labels its values "longitude"/"latitude" while
# the target columns are "ori_lon"/"ori_lat"; the assignment appears to match
# them by position (the displayed ori_lon values are negative longitudes), so
# the order of the two lists must stay in sync.
gt_survey[["ori_lon", "ori_lat"]] = gt_survey.apply(extract_lon_lat, axis=1)
display(gt_survey.sample(2))

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration..in.seconds.,Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,Q66,Q73,Q73_2_TEXT,Q73_5_TEXT,Q2,Q68,Q67,Q3,Q3_6_TEXT,Q5,Q45,ZIPCODE,Q56_1,Q56_2,Q56_3,Q56_4,Q56_5,Q10,Q11,Q12_1,Q12_2,Q12_3,Q12_4,Q12_5,Q12_6,Q12_7,Q12_8,Q12_9,Q12_10,Q12_11,Q12_12,Q12_13,Q12_18,Q12_14,Q12_15,Q12_16,Q12_17,Q12_17_TEXT,Q13_1,Q13_2,Q13_3,Q13_1_TEXT,Q13_2_TEXT,Q13_3_TEXT,Q14_1,Q14_2,Q14_3,Q14_1_TEXT,Q14_2_TEXT,Q14_3_TEXT,Q15,Q16,Q16_3_TEXT,Q69,Q69_6_TEXT,Q17,Q18,Q19,Q55,Q20_1,Q20_2,Q20_3,Q20_4,Q20_5,Q20_6,Q20_7,Q20_8,Q20_9,Q20_10,Q20_11,Q20_13,Q20_14,Q20_16,Q20_17,Q20_18,Q20_19,Q20_20,Q20_21,Q20_22,Q20_22_TEXT,Q25,Q25_14_TEXT,Q64,Q26,Q26_16_TEXT,Q27,Q28,Q29,Q30_1,Q30_2,Q30_3,Q30_4,Q30_5,Q30_6,Q30_7,Q30_8,Q30_12,Q30_10,Q30_11,Q32_1,Q32_2,Q32_3,Q32_4,Q32_5,Q34_1,Q34_2,Q34_3,Q34_4,Q34_5,Q34_6,Q34_7,Q34_8,Q34_9,Q34_10,Q34_11,Q34_12,Q34_13,Q34_14,Q34_15,Q34_16,Q47,Q59,Q51,Q49,Q49_5_TEXT,Q48,Q50,Q61,Q62,Q65,Q35,Q38,trip_id,depart_time,newmin,ori_lon,ori_lat
1348,10/31/2022 15:25,10/31/2022 15:30,IP Address,96,332,False,11/7/2022 14:25,R_2vjqEgePOYicFek,email,EN,No,,,,I live off campus and work or attend class at ...,,,Staff member,,20-39 minutes,11 to 20 miles,30080,Telecommute (work from home),Telecommute (work from home),Telecommute (work from home),"Drive alone (car, truck, motorcycle, moped)","Drive alone (car, truck, motorcycle, moped)",8:30am,4:30pm,,,,Enjoy the ride/prefer to drive,,,,,,Transit options are not available where I live,Transit costs too much,Transit takes too long,,,,,Anything else takes too much time,,,,,,,,,,,,,,,,,,,,No,,,,,,,,,,,,,,,,,,,,,,Nothing would alter my current driving habits,,,,,,,,,,,,,,,,,,,,,,,,,,,"No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, and I have used this program","Yes, and I have used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program",$50 to $100,No,,,,,,,,,,Yes,1348,2023-01-01 08:30:00,510.0,-84.5023,33.8796
108,10/31/2022 15:28,10/31/2022 15:32,IP Address,100,249,True,10/31/2022 15:32,R_T5urCrDBDuN6tvH,email,EN,No,,,,I live off campus and work or attend class at ...,,,Staff member,,20-39 minutes,11 to 20 miles,30238,"Drive alone (car, truck, motorcycle, moped)","Drive alone (car, truck, motorcycle, moped)","Drive alone (car, truck, motorcycle, moped)","Drive alone (car, truck, motorcycle, moped)","Drive alone (car, truck, motorcycle, moped)",7:30am,4:30pm,,Don't like to depend on others for carpooling,,Enjoy the ride/prefer to drive,Irregular work schedule,,,Need my car at work for personal business,Need to leave quickly in an emergency,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,,,,,,Availability of emergency ride home,"Expanded regional transit options (MARTA, ligh...",,Express shuttle to popular destinations,,,,,,,,,,,,,,,,MARTA rail,,,,,,,,,,,,,,,,,,,,,,,,"Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","No, I am not aware of this program","No, I am not aware of this program","No, I am not aware of this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","No, I am not aware of this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program","Yes, but I have NOT used this program",$100 to $150,No,,,,,,,,,,No,108,2023-01-01 07:30:00,450.0,-84.3797,33.4944


In [40]:
# add Georgia Tech as the final destination of every trip (one fixed lon/lat pair)
gt_survey["dest_lon"] = -84.397971
gt_survey["dest_lat"] = 33.775766

In [41]:
# Rename the survey columns to the trip field names; the result is stored as
# a separate frame, `trips`, so gt_survey keeps its original column names.
trip_name_map = {
    "ori_lon": "orig_lon", "ori_lat": "orig_lat",
    "dest_lon": "dest_lon", "dest_lat": "dest_lat",
    "newmin": "new_min",
}

trips = gt_survey.rename(columns=trip_name_map)

In [42]:
# Attach a point geometry built from each trip's origin lon/lat and tag the
# frame as EPSG:4326.
trips = gpd.GeoDataFrame(
    trips,
    geometry=gpd.points_from_xy(trips["orig_lon"], trips["orig_lat"]),
    crs="EPSG:4326",
)

In [43]:
# Wrap every trip in a TripDemand record.
# Positional argument order: trip_id, orig_lon, orig_lat, dest_lon, dest_lat, new_min, geometry.
trip_lst = [
    TripDemand(
        trip["trip_id"],
        trip["orig_lon"], trip["orig_lat"],
        trip["dest_lon"], trip["dest_lat"],
        trip["new_min"], trip["geometry"],
    )
    for _, trip in trips.iterrows()
]

In [44]:
# Rebuild the trips GeoDataFrame from the records and tag it as EPSG:4326 at
# construction (the same CRS the origin points were created with), so the
# frame carries its CRS instead of relying on it being supplied at write time.
trips = gpd.GeoDataFrame(trip_lst, crs="EPSG:4326")
print(trips.shape)

(1545, 7)


In [45]:
# Write the cleaned trips as EPSG:4326.
# The CRS is set on the frame instead of being passed to to_file(): the
# `crs=` keyword of to_file is specific to the fiona engine and is not
# accepted by the pyogrio engine. allow_override=True keeps the previous
# effect of forcing the written CRS to EPSG:4326.
trips.set_crs("EPSG:4326", allow_override=True).to_file(
    os.path.join(
        os.environ['data_inputs'],
        "cleaned",
        "trips.shp"
    )
)