## Exploratory Data Analysis

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import numpy as np

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option("display.max_columns", None)

In [2]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded automatically and edits to project source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2    

In [3]:
# Set this environment variable every time you move the project root folder.
os.environ['project_root'] = '/Users/geekduck/Dropbox/gt_survey'

# Put the project root on the import path. Skip the append when the entry is
# already present so that re-running this cell does not accumulate duplicate
# sys.path entries.
if os.environ['project_root'] not in sys.path:
    sys.path.append(os.environ['project_root'])

In [4]:
from carpoolsim.basic_settings import *

## Load data
- trip inputs
- taz data

In [20]:
# Read the cleaned trips shapefile from the data_inputs folder, then show
# its dimensions and two random rows as a sanity check.
trips_path = os.path.join(os.environ['data_inputs'], "cleaned", "trips.shp")
gt_survey = gpd.read_file(trips_path)
print(gt_survey.shape)
gt_survey.sample(2)

(1545, 7)


Unnamed: 0,trip_id,orig_lon,orig_lat,dest_lon,dest_lat,new_min,geometry
331,486,-84.3859,34.0408,-84.397971,33.775766,600.0,POINT (-84.38590 34.04080)
1523,2437,-84.2947,33.4124,-84.397971,33.775766,960.0,POINT (-84.29470 33.41240)


In [21]:
# Read the cleaned TAZ polygons shapefile from the data_inputs folder, then
# show its dimensions and two random rows as a sanity check.
tazs_path = os.path.join(os.environ['data_inputs'], "cleaned", "tazs.shp")
tazs = gpd.read_file(tazs_path)

print(tazs.shape)
tazs.sample(2)

(5873, 3)


Unnamed: 0,taz_id,group_id,geometry
1469,2209,Cobb,"POLYGON ((-84.53929 33.98188, -84.53950 33.981..."
3496,2942,Gwinnett,"POLYGON ((-83.98644 34.02149, -83.98643 34.020..."


In [23]:
from shapely import Point

# assign TAZ given long/lat
def get_taz_given_lon_lat(
    df_row, lon_col, lat_col, tazs=tazs
):
    """Look up the TAZ whose polygon contains a row's lon/lat point.

    Parameters
    ----------
    df_row : mapping-like (e.g. a DataFrame row)
        Record holding the coordinates under ``lon_col`` and ``lat_col``.
    lon_col, lat_col : str
        Keys of the longitude and latitude values in ``df_row``.
    tazs : DataFrame-like
        Table with ``geometry``, ``taz_id`` and ``group_id`` columns. The
        default is bound to the module-level ``tazs`` at definition time.

    Returns
    -------
    pd.Series
        Indexed by ``["taz_id", "group_id"]``. Holds the values of the first
        TAZ (in row order) whose geometry contains the point, or ``-1`` and
        ``""`` when no geometry contains it.

    Notes
    -----
    Both branches return a Series with the same index on purpose:
    ``DataFrame.apply(axis=1)`` infers the shape of its result from the first
    row, so mixing a tuple with a Series makes the outcome depend on whether
    the first row happens to match a TAZ.
    """
    point = Point(df_row[lon_col], df_row[lat_col])
    taz_columns = ["taz_id", "group_id"]

    # Stop at the first containing polygon instead of testing every TAZ.
    for position, taz_geom in enumerate(tazs["geometry"]):
        if taz_geom.contains(point):
            return tazs[taz_columns].iloc[position]

    # No TAZ contains the point: sentinel values, same layout as a match.
    return pd.Series([-1, ""], index=taz_columns)

# Tag every trip with the TAZ id and group of the polygon containing its
# origin point.
gt_survey[["orig_taz", "O_region"]] = gt_survey.apply(
    lambda trip: get_taz_given_lon_lat(
        trip, lon_col="orig_lon", lat_col="orig_lat"
    ),
    axis=1,
)

# Same lookup for the destination point.
gt_survey[["dest_taz", "D_region"]] = gt_survey.apply(
    lambda trip: get_taz_given_lon_lat(
        trip, lon_col="dest_lon", lat_col="dest_lat"
    ),
    axis=1,
)

In [24]:
# Spot-check three random trips, including the newly added TAZ columns.
gt_survey.sample(n=3)

Unnamed: 0,trip_id,orig_lon,orig_lat,dest_lon,dest_lat,new_min,geometry,orig_taz,O_region,dest_taz,D_region
621,941,-84.295,33.7699,-84.397971,33.775766,630.0,POINT (-84.29500 33.76990),1628,DeKalb,484,Fulton
831,1287,-84.3958,33.9335,-84.397971,33.775766,510.0,POINT (-84.39580 33.93350),228,Fulton,484,Fulton
798,1227,-84.3958,33.9335,-84.397971,33.775766,720.0,POINT (-84.39580 33.93350),228,Fulton,484,Fulton


In [25]:
# Keep only trips whose origin was matched to a TAZ (-1 marks "no match");
# print the shape before and after to show how many rows were dropped.
print(gt_survey.shape)
filt = gt_survey["orig_taz"].ne(-1)
gt_survey = gt_survey[filt]
print(gt_survey.shape)

(1545, 11)
(1519, 11)


In [26]:
# List the distinct origin and destination regions present in the trips.
print(gt_survey["O_region"].unique())
print(gt_survey["D_region"].unique())

['Fulton' 'DeKalb' 'Carroll' 'Cobb' 'Fayette' 'Gwinnett' 'Douglas'
 'Paulding' 'Clayton' 'Cherokee' 'Forsyth' 'Rockdale' 'Hall' 'Henry'
 'Coweta' 'Newton' 'Barrow' 'Walton']
['Fulton']


In [27]:
# store results to data_outputs
output_dir = os.path.join(os.environ['data_outputs'], "step1_gt_survey")

# Create the target folder up front: the write below is not expected to
# create missing parent directories, so a fresh checkout would fail here.
os.makedirs(output_dir, exist_ok=True)

gt_survey.to_file(os.path.join(output_dir, "trips.shp"))