In [3]:
# don't forget about day
# if month year exists but couldn't find

import pandas as pd
import psycopg2
import datetime
from psycopg2 import sql
from sqlalchemy import create_engine, text
from dateutil import relativedelta
from loguru import logger

PICKUP_DATETIME_COL = "tpep_pickup_datetime"
TOTAL_AMOUNT_COL = "total_amount"

year = 2020
month = 1
date = datetime.datetime(year, month, 10)
start_date = date.replace(day=1)
end_date = start_date + relativedelta.relativedelta(months=1)
formatted_month_year = date.strftime("%Y-%m")

df = pd.read_parquet(f'yellow_tripdata_{year}-01.parquet', engine='pyarrow')
df = df[
        (df[PICKUP_DATETIME_COL] >= start_date) & (df[PICKUP_DATETIME_COL] < end_date)
    ]

df["formatted_pickup_date"] = df[PICKUP_DATETIME_COL].dt.strftime("%Y-%m-%d")
df["hour"] = df[PICKUP_DATETIME_COL].dt.hour
df["day"] = df[PICKUP_DATETIME_COL].dt.day
df["processed_time"] = datetime.datetime.now()


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6404796 entries, 0 to 6405007
Data columns (total 23 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [5]:
zone_df = pd.read_csv('taxi+_zone_lookup.csv')
zone_df

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [6]:
zone_df['Zone'].value_counts()

Governor's Island/Ellis Island/Liberty Island    3
Corona                                           2
Newark Airport                                   1
Ocean Hill                                       1
Parkchester                                      1
                                                ..
Fordham South                                    1
Forest Hills                                     1
Forest Park/Highland Park                        1
Fort Greene                                      1
NV                                               1
Name: Zone, Length: 261, dtype: int64

In [7]:
display(zone_df[zone_df['Zone'] == "Governor's Island/Ellis Island/Liberty Island"])
display(zone_df[zone_df['Zone'] == "Corona"])
# 261 unique()

Unnamed: 0,LocationID,Borough,Zone,service_zone
102,103,Manhattan,Governor's Island/Ellis Island/Liberty Island,Yellow Zone
103,104,Manhattan,Governor's Island/Ellis Island/Liberty Island,Yellow Zone
104,105,Manhattan,Governor's Island/Ellis Island/Liberty Island,Yellow Zone


Unnamed: 0,LocationID,Borough,Zone,service_zone
55,56,Queens,Corona,Boro Zone
56,57,Queens,Corona,Boro Zone


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6404796 entries, 0 to 6405007
Data columns (total 23 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [94]:
agg_df = (df
          .groupby(['day',"hour", "PULocationID","DOLocationID" ])
          .agg(trip_count=(PICKUP_DATETIME_COL, 'count'), avg_amount=(TOTAL_AMOUNT_COL, 'mean'))).reset_index()
agg_df

Unnamed: 0,day,hour,PULocationID,DOLocationID,trip_count,avg_amount
0,1,0,3,147,1,25.000000
1,1,0,4,4,4,5.550000
2,1,0,4,7,1,33.360000
3,1,0,4,13,1,21.950000
4,1,0,4,37,1,26.750000
...,...,...,...,...,...,...
1717694,31,23,264,262,2,20.775000
1717695,31,23,264,264,52,19.965962
1717696,31,23,264,265,1,276.960000
1717697,31,23,265,264,1,200.300000


In [93]:
# TODO: Continue from here, Refer to example below of how to crafting a proper long-array metrics 
# e.g. for hour in range(24) for day in range(31)

# outcome: a nested array where it has 31 arrays, frontend just have to access the respective index 
# (say user pick 15/01), the backend will return data from all the dates of Jan, and you do this agg_data[15-1]
# for day in range(31), if there is no respective day, we just append an empty
# if there is, we make sure the array has 24 elements, if there is missing hour, we put zero

# lambda row: [row['trip_count'][row['hour'].index(h)] if h in row['hour'] else 0 for h in range(24)]

# Aggregation
agg_df = (df
          .groupby(['day',"hour", "PULocationID","DOLocationID" ])
          .agg(trip_count=(PICKUP_DATETIME_COL, 'count'), avg_amount=(TOTAL_AMOUNT_COL, 'mean'))).reset_index()



grouped_agg_df = agg_df.groupby(['PULocationID', 'DOLocationID']).agg({'day':list, 'hour': list, 'trip_count': list, 'avg_amount': list}).reset_index()


def build_long_array_metrics(row, metric_col):
    monthly_arr = []
    metrics_length = len(row[metric_col])
    i = 0
    for day in range(1, 32):
        day_arr = []
        if day in row['day']:
            for hour in range(24):
                trip_count = 0
                if i < metrics_length and row['hour'][i] == hour:
                    trip_count = row['trip_count'][i]
                    i += 1
                day_arr.append(str(trip_count))
        else:
            day_arr = ['0' for i in range(24)]
        monthly_arr.append("{" + ",".join(day_arr) + "}")
    if i != metrics_length:
        print('hailat')
    return "{" + ",".join(monthly_arr)+ "}"


raw_data = {
    'month_year': [formatted_month_year]*len(grouped_agg_df),
    'PULocationID': grouped_agg_df['PULocationID'],
    'DOLocationID':grouped_agg_df['DOLocationID'],
    'trip_count_array': grouped_agg_df.apply(lambda row: build_long_array_metrics(row, 'trip_count'), axis=1).tolist(),
    'avg_amount_array': grouped_agg_df.apply(lambda row: build_long_array_metrics(row, 'trip_count'), axis=1).tolist()

}
result_df = pd.DataFrame(raw_data)
result_df

Unnamed: 0,month_year,PULocationID,DOLocationID,trip_count_array,avg_amount_array
0,2020-01,1,1,"{{0,0,0,0,1,0,4,1,1,2,1,1,2,1,3,3,2,1,2,1,2,0,...","{{0,0,0,0,1,0,4,1,1,2,1,1,2,1,3,3,2,1,2,1,2,0,..."
1,2020-01,1,50,"{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...","{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,..."
2,2020-01,1,68,"{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
3,2020-01,1,138,"{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
4,2020-01,1,140,"{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
...,...,...,...,...,...
31272,2020-01,265,259,"{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
31273,2020-01,265,261,"{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
31274,2020-01,265,263,"{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
31275,2020-01,265,264,"{{2,1,4,3,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...","{{2,1,4,3,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,..."


In [90]:
result_df.iloc[0]['trip_count_array']

'{{0,0,0,0,1,0,4,1,1,2,1,1,2,1,3,3,2,1,2,1,2,0,0,0},{0,1,0,1,0,0,3,0,2,1,1,1,3,2,5,2,4,9,3,0,0,0,0,0},{1,0,2,0,0,1,1,2,1,0,2,0,1,3,3,3,3,0,1,3,1,0,0,0},{0,0,1,1,0,1,1,2,0,0,1,0,0,2,3,6,3,1,1,0,0,0,0,0},{1,0,0,1,0,0,0,3,1,0,3,1,3,1,1,4,0,4,3,2,0,0,0,0},{0,0,0,0,0,0,0,2,0,0,1,1,0,4,1,1,2,1,1,0,0,0,0,0},{0,0,1,0,0,1,2,1,0,0,0,3,0,0,1,1,1,5,0,0,0,0,0,1},{0,0,1,2,0,0,0,1,2,1,0,1,2,1,1,0,3,1,0,1,0,0,0,0},{0,0,0,0,0,0,0,1,0,2,0,0,0,0,2,0,1,2,0,0,9,0,0,1},{0,0,0,0,0,0,1,0,0,0,1,1,1,1,2,4,3,1,0,0,0,0,0,0},{0,0,0,0,0,1,2,0,1,0,0,0,0,1,1,4,0,3,0,1,0,0,0,0},{0,0,0,0,0,0,0,1,2,1,1,2,4,1,1,2,0,5,0,1,0,0,0,2},{0,0,0,0,0,0,2,1,2,0,0,0,0,0,0,1,1,2,1,1,0,0,0,0},{0,0,1,0,1,1,2,0,0,1,1,1,0,1,2,2,6,2,5,0,0,0,0,0},{0,0,1,0,0,0,1,1,2,1,1,1,1,1,10,9,3,1,2,0,0,1,0,0},{0,0,0,0,0,2,1,0,1,2,1,1,2,0,0,3,2,2,2,2,1,0,0,0},{0,0,0,0,1,0,1,1,2,1,1,2,2,0,4,6,3,3,1,4,0,0,0,1},{0,0,0,0,0,1,2,1,2,0,1,1,2,0,4,0,0,1,1,0,0,0,0,0},{0,0,0,2,0,0,1,3,0,1,1,0,1,1,1,1,6,3,2,1,0,0,0,0},{0,0,0,0,0,1,1,1,1,3,1,0,2,0,0,1,1,0,1,0,2,0,0,

In [95]:
datetime.datetime("2021-08-12")

TypeError: an integer is required (got type str)

In [96]:
date = datetime.datetime(2020, 2, 12)
date.strftime("%Y-%m-%d")

'2020-02-12'

In [89]:
a = json'[["a"]]'
list(a)

['[', '[', 'a', ']', ']']

In [141]:
import pandas as pd

# Sample DataFrame
data = {'formatted_pickup_date': [f'2022-01-{i}' for i in range(1, 9)],
        'LocationID': [1] * 8,
        'hour': [i % 24 for i in range(1, 9)],  # Adjust hour values to be in the range of 0-23
        'trip_count': [5, 8, 12, 15, 18, 22, 25, 18]}

_df = pd.DataFrame(data)
display(_df)
# Assuming formatted_pickup_date is already a datetime object
_df['formatted_pickup_date'] = pd.to_datetime(df['formatted_pickup_date'])
_df['month'] = _df['formatted_pickup_date'].dt.to_period('M')
# _df['day'] = _df['formatted_pickup_date'].dt.to_period('d')


# Group by month and LocationID, and aggregate trip_count into a list
grouped_df = _df.groupby(['month', 'LocationID']).agg({'hour': list, 'trip_count': list}).reset_index()
display(grouped_df)
# Create a DataFrame with each row representing monthly aggregated data
result_df = pd.DataFrame({
    'month': grouped_df['month'],
    'LocationID': grouped_df['LocationID'],
    'trip_count_array': [grouped_df.apply(lambda row: [row['trip_count'][row['hour'].index(h)] if h in row['hour'] else 0 for h in range(24)], axis=1).tolist()]
})

# Display the resulting DataFrame
display(result_df)
display(result_df.iloc[0]['trip_count_array'])


Unnamed: 0,formatted_pickup_date,LocationID,hour,trip_count
0,2022-01-1,1,1,5
1,2022-01-2,1,2,8
2,2022-01-3,1,3,12
3,2022-01-4,1,4,15
4,2022-01-5,1,5,18
5,2022-01-6,1,6,22
6,2022-01-7,1,7,25
7,2022-01-8,1,8,18


Unnamed: 0,month,LocationID,hour,trip_count
0,2020-01,1,"[1, 2, 3, 4, 5, 6, 7]","[5, 8, 12, 15, 18, 22, 25]"


Unnamed: 0,month,LocationID,trip_count_array
0,2020-01,1,"[[0, 5, 8, 12, 15, 18, 22, 25, 0, 0, 0, 0, 0, ..."


[[0, 5, 8, 12, 15, 18, 22, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]