In [29]:
import pandas as pd
import pickle
from datetime import datetime
import holidays

path = 'C:/Users/Administrator/practicumProject2/'

# Merge taxi data and poi-taxi zone table into a data frame indexed by (poi, time window)

Fisrtly, load the POI-Zone table and taxi data.

In [30]:
# load poi_zone table
with open(path+'poi_zone.pickle', 'rb') as file:
    poi_zone = pickle.load(file)

print(poi_zone.dtypes)

poi_id         int32
latitude     float64
zone_id        int32
longitude    float64
dtype: object


Load each year's taxi data, and merge them into a data frame

In [31]:
# load the data of 2022
with open(path + 'zone_time_taxi2022.pickle', 'rb') as file:
    zone_time_taxi2022 = pickle.load(file)

# load the data of 2023
with open(path + 'zone_time_taxi2023.pickle', 'rb') as file:
    zone_time_taxi2023 = pickle.load(file)

# merge all the yearly data
zone_time_taxi = pd.concat([zone_time_taxi2022, zone_time_taxi2023], axis=0)


print('The shape of zone_time_taxi2022:', zone_time_taxi2022.shape, '\n')
print('The shape of zone_time_taxi2023:', zone_time_taxi2023.shape, '\n')
print('The shape of zone_time_taxi:', zone_time_taxi.shape, '\n')

The shape of zone_time_taxi2022: (578160, 4) 

The shape of zone_time_taxi2023: (190080, 4) 

The shape of zone_time_taxi: (768240, 4) 



Attach POI to taxi data and clean the data. Thus we obtain the taxi-busyness table indexed by (POI, time window) 

In [32]:
# sum the pickup and dropoff
zone_time_taxi['taxi_all'] = zone_time_taxi['numPu'] + zone_time_taxi['numDo']

# merge taxi data and POI-Zone table by column 'zone_id'.
poi_time_taxi = pd.merge(poi_zone, zone_time_taxi, on = 'zone_id', how = 'left')

# take out the useful columns
poi_time_taxi = poi_time_taxi[['poi_id', 'timeWindow', 'taxi_all']]

# drop the rows containing missing value
poi_time_taxi = poi_time_taxi.dropna()

# set the type of column 'numAll' as 'int'
poi_time_taxi['taxi_all'] = poi_time_taxi['taxi_all'].astype('int')  


print('The data shape is', poi_time_taxi.shape, '\n')

print('The information of data:')
print(poi_time_taxi.dtypes)

The data shape is (2642280, 3) 

The information of data:
poi_id                 int32
timeWindow    datetime64[ns]
taxi_all               int32
dtype: object


# Merge subway data and poi-subway station table into a data frame indexed by (poi, time window)

Fisrtly, load the POI-Station table and subway data.

In [33]:
# load the poi_station table
poi_station = pd.read_excel(path + 'poi_station.xlsx', usecols=['poi_id', 'station_id'])

# load the station_time_subway data
with open(path + 'station_time_subway.pickle', 'rb') as file:
    station_time_subway = pickle.load(file)


print('The data shape is', station_time_subway.shape, '\n')

print('The information of data:')
print(station_time_subway.dtypes)

The data shape is (325366, 3) 

The information of data:
station_id            object
timeWindow    datetime64[ns]
riderNum               int32
dtype: object


Attach POI to subway data and clean the data. Thus we obtain the subway-busyness table indexed by (POI, time window)

In [34]:
# attach POI to subway data
poi_time_subway = pd.merge(poi_station, station_time_subway, on = 'station_id', how = 'left')

print('The data shape is', poi_time_taxi.shape, '\n')

print('The information of data:')
print(poi_time_subway.dtypes)

The data shape is (2642280, 3) 

The information of data:
poi_id                 int64
station_id            object
timeWindow    datetime64[ns]
riderNum               int32
dtype: object


In [35]:
# drop the rows containing NA
poi_time_subway = poi_time_subway.dropna()

poi_time_subway['poi_id'] = poi_time_subway['poi_id'].astype('int')

# save the usefule colums
poi_time_subway = poi_time_subway[['poi_id', 'timeWindow', 'riderNum']]

print('The data shape is', poi_time_subway.shape, '\n')

print('The information of data:')
print(poi_time_subway.dtypes)

The data shape is (2367998, 3) 

The information of data:
poi_id                 int32
timeWindow    datetime64[ns]
riderNum               int32
dtype: object


# Merge the taxi data and subway data

In [36]:
# drop the duplicated rows
print('The row number before dropping the duplicates:', poi_time_taxi.shape)
poi_time_taxi = poi_time_taxi.drop_duplicates()
print('The row number after dropping the duplicates:', poi_time_taxi.shape, '\n')
poi_time_taxi = poi_time_taxi.reset_index(drop=True)


# drop the duplicated rows
print('The row number before dropping the duplicates:', poi_time_subway.shape)
poi_time_subway = poi_time_subway.drop_duplicates()
print('The row number after dropping the duplicates:', poi_time_subway.shape, '\n')
poi_time_subway = poi_time_subway.reset_index(drop=True)

# merge the taxi trip data and bike count data
temp = pd.merge(poi_time_taxi, poi_time_subway, on=['poi_id', 'timeWindow'], how='inner')
print('The merged data shape before dropping duplicates:', temp.shape)
temp = temp.drop_duplicates()
print('The merged data shape after dropping duplicates:', temp.shape, '\n')
temp = temp.reset_index(drop=True)



print('The merged data columns:')
print(temp.dtypes)

The row number before dropping the duplicates: (2642280, 3)
The row number after dropping the duplicates: (2636486, 3) 

The row number before dropping the duplicates: (2367998, 3)
The row number after dropping the duplicates: (2367880, 3) 

The merged data shape before dropping duplicates: (2388834, 4)
The merged data shape after dropping duplicates: (2388834, 4) 

The merged data columns:
poi_id                 int32
timeWindow    datetime64[ns]
taxi_all               int32
riderNum               int32
dtype: object


# Meerge traffic data with weather data

In [37]:
# load weather data
time_weather = pd.read_excel(path + 'weather.xlsx')

print('The columns in weather data:')
print(time_weather.dtypes)

The columns in weather data:
datetime         datetime64[ns]
temperature             float64
precipitation           float64
weatherCode               int64
windSpeed               float64
dtype: object


In [38]:
# convert the type of column 'weatherCode' to 'int'
time_weather['weatherCode'] = time_weather['weatherCode'].astype('int')

# rename the time column
time_weather.rename(columns={'datetime': 'timeWindow'}, inplace=True)

print('The columns in weather data:')
print(time_weather.dtypes)

The columns in weather data:
timeWindow       datetime64[ns]
temperature             float64
precipitation           float64
weatherCode               int32
windSpeed               float64
dtype: object


In [39]:
print(temp.dtypes)

poi_id                 int32
timeWindow    datetime64[ns]
taxi_all               int32
riderNum               int32
dtype: object


In [40]:
# merge weather data with traffic data
temp = pd.merge(temp, time_weather, on=['timeWindow'], how ='left')

# drop the rows containing NA and duplicated rows
temp = temp.dropna()
temp = temp.drop_duplicates()


# reset index of the data framee
temp = temp.reset_index(drop=True)

print('The shape:', temp.shape, '\n')
print('The columns:')
print(temp.dtypes)

The shape: (2388834, 8) 

The columns:
poi_id                    int32
timeWindow       datetime64[ns]
taxi_all                  int32
riderNum                  int32
temperature             float64
precipitation           float64
weatherCode               int32
windSpeed               float64
dtype: object


# Create calendar-related features

We will construct 'weekday' and 'holiday', and label the time window

In [41]:
# construct weekday column
temp['weekday'] = temp['timeWindow'].dt.strftime('%A')

# construct holiday column
ny_holidays = holidays.US(state='NY')
temp['holiday'] = temp['timeWindow'].apply(lambda x: x in ny_holidays)

# lael the time window
temp['timeWindow'] = temp['timeWindow'].dt.strftime('%H')


print(temp.dtypes, '\n')

print(temp.head)

poi_id             int32
timeWindow        object
taxi_all           int32
riderNum           int32
temperature      float64
precipitation    float64
weatherCode        int32
windSpeed        float64
weekday           object
holiday             bool
dtype: object 

<bound method NDFrame.head of          poi_id timeWindow  taxi_all  riderNum  temperature  precipitation  \
0             1         00        18        62        -10.7            0.0   
1             1         01        13        14        -11.6            0.0   
2             1         02         8        19        -12.5            0.0   
3             1         03         1         7        -12.0            0.0   
4             1         04         6        13        -12.7            0.0   
...         ...        ...       ...       ...          ...            ...   
2388829     229         19         0        37         14.5            5.6   
2388830     229         20         0        39         14.5            4.1   
23

In [42]:
# save the data frame to disck
with open(path + 'trafficWeatherData.pickle', 'wb') as file:
    pickle.dump(temp, file)

