In [1]:
import pandas as pd

In [7]:
# Hourly pickup counts for January 2015; LIMIT 10 keeps this a quick sanity
# check of the table rather than a full extract.
query = """
SELECT 
    DATETIME_TRUNC(pickup_datetime, hour) as pickup_hour,
    count(*) as cnt
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015` 
WHERE EXTRACT(MONTH from pickup_datetime) = 1
GROUP BY pickup_hour
ORDER BY pickup_hour
LIMIT 10
"""

# The aggregation runs inside BigQuery; only the small result set is
# downloaded into a DataFrame.
df = pd.read_gbq(
    query=query,
    dialect='standard',
    project_id='geultto',
)

### 기본 전처리
- SQL(BigQuery)로 처리할 수 있는 전처리는 SQL에서 수행 => 로컬 파이썬에서 처리하는 것보다 성능이 좋음
- SQL로 하기 힘든 전처리만 Python에서 수행

In [8]:
# Display the hourly trip counts returned by the query (it ends with LIMIT 10).
df

Unnamed: 0,pickup_hour,cnt
0,2015-01-01 00:00:00,28312
1,2015-01-01 01:00:00,31707
2,2015-01-01 02:00:00,28068
3,2015-01-01 03:00:00,24288
4,2015-01-01 04:00:00,17081
5,2015-01-01 05:00:00,9112
6,2015-01-01 06:00:00,6637
7,2015-01-01 07:00:00,6011
8,2015-01-01 08:00:00,5586
9,2015-01-01 09:00:00,6977


In [10]:
%%time
feature_query = """
WITH base_data AS 
(
  SELECT nyc_taxi.*, gis.* except (zip_code_geom)
  FROM (
    SELECT *
    FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015`
    WHERE 
        EXTRACT(MONTH from pickup_datetime) = 1
        and pickup_latitude  <= 90 and pickup_latitude >= -90
    ) as nyc_taxi
  JOIN (
    SELECT zip_code, state_code, state_name, city, county, zip_code_geom
    FROM `bigquery-public-data.geo_us_boundaries.zip_codes`
    WHERE state_code='NY'
    ) as gis 
  ON st_contains(zip_code_geom, st_geogpoint(pickup_longitude,	pickup_latitude))
)

SELECT 
    zip_code,
    DATETIME_TRUNC(pickup_datetime, hour) as pickup_hour,
    EXTRACT(MONTH FROM pickup_datetime) AS month,
    EXTRACT(DAY FROM pickup_datetime) AS day,
    CAST(format_datetime('%u', pickup_datetime) AS INT64) -1 AS weekday,
    EXTRACT(HOUR FROM pickup_datetime) AS hour,
    count(*) as cnt
FROM base_data 
GROUP BY zip_code, pickup_hour, month, day, weekday, hour
ORDER BY pickup_hour


"""

feature_df = pd.read_gbq(query=feature_query, dialect='standard', project_id='geultto')

CPU times: user 3.48 s, sys: 97 ms, total: 3.57 s
Wall time: 10.8 s


In [11]:
# Preview only the first rows; the full table has one row per (zip code, hour)
# and is too large to dump into the notebook output.
feature_df.head(10)

Unnamed: 0,zip_code,pickup_hour,month,day,weekday,hour,cnt
0,10038,2015-01-01 00:00:00,1,1,3,0,158
1,10111,2015-01-01 00:00:00,1,1,3,0,28
2,10028,2015-01-01 00:00:00,1,1,3,0,835
3,11205,2015-01-01 00:00:00,1,1,3,0,52
4,10044,2015-01-01 00:00:00,1,1,3,0,9
5,10023,2015-01-01 00:00:00,1,1,3,0,971
6,10011,2015-01-01 00:00:00,1,1,3,0,1615
7,10461,2015-01-01 00:00:00,1,1,3,0,2
8,11206,2015-01-01 00:00:00,1,1,3,0,100
9,10022,2015-01-01 00:00:00,1,1,3,0,1392


### Linear Regression
- Baseline 모델
    - 이후 작업할 모델 중 성능이 가장 낮을 것으로 예상되는 모델로, 다른 모델들의 성능을 비교하는 기준

Unnamed: 0,pickup_hour,cnt
0,2015-01-01 00:00:00,28312
1,2015-01-01 01:00:00,31707
2,2015-01-01 02:00:00,28068
3,2015-01-01 03:00:00,24288
4,2015-01-01 04:00:00,17081
5,2015-01-01 05:00:00,9112
6,2015-01-01 06:00:00,6637
7,2015-01-01 07:00:00,6011
8,2015-01-01 08:00:00,5586
9,2015-01-01 09:00:00,6977
