# Data Preparation and Analysis Draft
## Flow change by hour, by day, by month, by year, by route



In [1]:
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import streamlit as st
from scipy import stats

In [2]:
df_flow = (pd
 .read_csv('raw_data/d12_hr_MLHVflow_201902.csv', parse_dates=['timestamp'])
 .assign(timestamp=lambda df_: df_['timestamp'].dt.tz_localize(None))
 .assign(year=lambda df_: df_['timestamp'].dt.year, 
         month=lambda df_: df_['timestamp'].dt.month, 
         day=lambda df_: df_['timestamp'].dt.day, 
         hour=lambda df_: df_['timestamp'].dt.hour)
)

In [32]:
df_flow['route'].unique()

array([133, 405,  57, 605,  22,  55,  91,   5,  73, 241, 261,  74, 142,
         1])

In [69]:
# a function to clean the raw flow df
# use route number as input
# convert to pivot table with index of station_id and abs_pm, column of hour
# drop the outliers (z-score > 3)
# values are mean of flow throughout 28 days in 2019.02
def route_df(route_num):
    route_raw = df_flow[(df_flow.route==route_num)]
    pivot_raw = pd.pivot_table(route_raw, values='total_flow', index=['station_id', 'abs_pm'], columns=['hour'], aggfunc=np.mean).sort_values('abs_pm')
    route_raw_new = pivot_raw[(np.abs(stats.zscore(pivot_raw)) < 3)].dropna().reset_index()
    route_flow_df = route_raw[route_raw['station_id'].isin(route_raw_new['station_id'])]
    route_flow_df = route_flow_df.assign(day_of_week=lambda x: (x['timestamp'].dt.dayofweek))
    route_flow_df['weekday'] = route_flow_df['day_of_week'].apply(lambda x: 'weekday' if x < 6 else 'weekends')
    return route_flow_df
route_74 = route_df(74)

In [70]:
route_74.head(5)

Unnamed: 0,timestamp,station_id,district,route,direction,lane_type,station_length,samples,pct_observed,total_flow,...,state_pm,abs_pm,lat,lng,year,month,day,hour,day_of_week,weekday
1426,2019-02-01,1218570,12,74,W,ML,0.506,0,0,496.0,...,0.241,0.241,33.503721,-117.654477,2019,2,1,0,4,weekday
1427,2019-02-01,1218576,12,74,E,ML,0.253,0,0,496.0,...,1.277,1.277,33.512477,-117.640482,2019,2,1,0,4,weekday
1436,2019-02-01,1219087,12,74,E,ML,0.35,0,0,496.0,...,8.0,7.996,33.539611,-117.54873,2019,2,1,0,4,weekday
1437,2019-02-01,1219095,12,74,W,ML,0.35,0,0,496.0,...,8.0,7.996,33.539611,-117.548729,2019,2,1,0,4,weekday
1443,2019-02-01,1219115,12,74,E,ML,0.506,0,0,496.0,...,0.241,0.241,33.503721,-117.654478,2019,2,1,0,4,weekday


## Plot flow for each route by day and time

- x=hour; y=day


In [81]:
# plot both direction flow by hour
def weekday_pivot_df(route_df):
    direction_a = route_df['direction'].unique()[0]
    direction_b = route_df['direction'].unique()[1]
    route_df_direction_a_pivot = pd.pivot_table(route_df[route_df['direction']==direction_a], values='total_flow', index=['hour'], columns=['weekday'], aggfunc=np.mean).reset_index()
    route_df_direction_b_pivot = pd.pivot_table(route_df[route_df['direction']==direction_b], values='total_flow', index=['hour'], columns=['weekday'], aggfunc=np.mean).reset_index()
    route_df_pivot = route_df_direction_a_pivot.add(route_df_direction_b_pivot, fill_value=0)
    route_df_pivot['hour'] = route_df_pivot['hour']/2
    route_df_pivot = route_df_pivot.round(0)
    return route_df_pivot
route_74_pivot = weekday_pivot_df(route_74)
route_74_pivot

weekday,hour,weekday.1,weekends
0,0.0,890.0,1505.0
1,1.0,555.0,931.0
2,2.0,431.0,660.0
3,3.0,439.0,421.0
4,4.0,1010.0,406.0
5,5.0,2446.0,731.0
6,6.0,3675.0,1307.0
7,7.0,4160.0,1880.0
8,8.0,4128.0,2591.0
9,9.0,4062.0,3416.0


In [82]:
fig = px.line(route_74_pivot, x='hour', y=route_74_pivot.columns, range_x=(0, 23))
fig.show()