## Notebook Setup

In [3]:
import json
import os
from datetime import time as dt_time
from random import choice

import backoff
import geopandas as gpd
import keplergl as kp
import pandas as pd
from pandarallel import pandarallel

# Start pandarallel with 10 worker processes and a progress bar; this is what
# makes `.parallel_apply` available on pandas objects later in the notebook.
pandarallel.initialize(nb_workers=10, progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Google Maps API key used by the geocoding section of the notebook.
# It is read from the GOOG_API_KEY environment variable so the secret never
# has to be saved inside the notebook. The "API_KEY" placeholder is kept as
# the fallback: export the variable (or replace the placeholder locally)
# before running the geocoding cells.
GOOG_API_KEY = os.environ.get("GOOG_API_KEY", "API_KEY")

In [None]:
# Load only the columns needed for geocoding and for the cuisine/borough
# analysis. Malformed rows are reported as warnings instead of aborting the load.
# NOTE: this zip is not the same archive that the download link provides.
resta_insp_df = pd.read_csv(
    "data/raw/DOHMH_New_York_City_Restaurant_Inspection_Results.zip",
    compression="zip",
    usecols=[
        "CAMIS",
        "BORO",
        "CUISINE DESCRIPTION",
        "BUILDING",
        "STREET",
        "ZIPCODE",
    ],
    dtype={
        "CAMIS": int,
        "BORO": str,
        "CUISINE DESCRIPTION": str,
        "BUILDING": str,
        "STREET": str,
        "ZIPCODE": str,
    },
    skip_blank_lines=True,
    on_bad_lines="warn",
)

resta_insp_df.head()

## DATA PREPROCESSING

### Geocoding using the paid Google Maps API
#### You can skip this section since I have uploaded the preprocessed data to GitHub

I used the paid Google API to geocode the addresses, so I couldn't test this code multiple times.
The output of this preprocessing step is not shown.

In [None]:
# Keep a single row per CAMIS value (presumably the restaurant's unique id),
# so that each restaurant is geocoded only once.
uniq_addr_df = resta_insp_df.drop_duplicates(subset="CAMIS").copy()

# Build one free-text address per row. The fixed city/state/country suffix is
# extra context to help the geocoder.
uniq_addr_df["full_addr"] = (
    uniq_addr_df["BUILDING"]
    + " "
    + uniq_addr_df["STREET"]
    + ", New York, NY "
    + uniq_addr_df["ZIPCODE"]
    + ", USA"
)

uniq_addr_df.head()

In [None]:
# Retry the geocoding call with exponential backoff whenever it raises,
# printing a message on every retry and once more when giving up.
@backoff.on_exception(
    backoff.expo,
    exception=Exception,
    max_tries=10,
    on_backoff=lambda details: print(
        f"Backing off {details['wait']}s after {details['tries']} tries"
    ),
    on_giveup=lambda e: print(f"Give up: {e}"),
)
def get_lat_lon(addr):
    """Geocode a single address and return its location and matched address.

    The lookup goes through ``gpd.tools.geocode`` using the "googlev3"
    provider and the notebook-level ``GOOG_API_KEY``.

    Args:
        addr: Address string to geocode.

    Returns:
        A ``(geometry, address)`` tuple taken from row 0 of the geocoder
        response.
    """
    # The original plan was to pick randomly among free geocoding providers,
    # which is why `choice` is used; the paid "googlev3" provider turned out
    # to be the easiest to use, so it is the only active entry.
    providers = [
        "googlev3"
        # "arcgis",
        # "banfrance",
        # "databc",
        # "ignfrance",
        # "nominatim",
        #  "openmapquest",
        # "photon",
    ]
    geocoded = gpd.tools.geocode(
        addr,
        provider=choice(providers),
        user_agent="nyc_taxi_rest",
        api_key=GOOG_API_KEY,
    )
    point = geocoded.loc[0, "geometry"]
    matched_address = geocoded.loc[0, "address"]
    return point, matched_address

In [None]:
# Geocode every address in parallel and unpack each (geometry, address)
# tuple returned by get_lat_lon into two new columns.
uniq_addr_df[["point", "goog_addr"]] = (
    uniq_addr_df["full_addr"].parallel_apply(get_lat_lon).tolist()
)

uniq_addr_df.head()

In [None]:
# Drop rows with a missing goog_addr (presumably addresses the geocoder
# could not resolve).
uniq_addr_df.dropna(subset=["goog_addr"], inplace=True)

# Pull latitude (y) and longitude (x) out of each point. Mapping each
# coordinate separately avoids constructing a throwaway pd.Series per row,
# which is what made the previous `apply(lambda x: pd.Series([...]))` slow.
uniq_addr_df["lat"] = uniq_addr_df["point"].map(lambda pt: pt.y)
uniq_addr_df["lon"] = uniq_addr_df["point"].map(lambda pt: pt.x)

# The geometry objects are no longer needed once lat/lon are stored as columns.
uniq_addr_df.drop(columns=["point"], inplace=True)

uniq_addr_df.head()

In [None]:
# Persist the geocoded addresses as a zipped CSV; the next section reads this
# file back, so the paid geocoding step does not have to be re-run.
uniq_addr_df.to_csv("data/proc/uniq_addr.csv.zip", compression="zip", index=False)


### Geocoded Restaurant Addresses
#### START FROM HERE to use preprocessed data

In [4]:
# Load the preprocessed (already geocoded) restaurant addresses.
proc_df = pd.read_csv("data/proc/uniq_addr.csv.zip", compression="zip")

proc_df.head()

Unnamed: 0,CAMIS,BUILDING,STREET,ZIPCODE,full_addr,goog_addr,lat,lon,BORO,CUISINE DESCRIPTION
0,40511702,635,SECOND AVENUE,10016,"635 SECOND AVENUE, New York, NY 10016, USA","635 2nd Ave, New York, NY 10016, USA",40.745295,-73.975777,MANHATTAN,Italian
1,50046354,2507,BROADWAY,11106,"2507 BROADWAY, New York, NY 11106, USA","2507 Broadway, New York, NY 10025, USA",40.793126,-73.973306,QUEENS,Italian
2,50061389,11C,HOLDEN BLVD,10314,"11C HOLDEN BLVD, New York, NY 10314, USA","11 Holden Blvd, Staten Island, NY 10314, USA",40.604667,-74.121197,STATEN ISLAND,Chinese
3,41516263,8015,5 AVENUE,11209,"8015 5 AVENUE, New York, NY 11209, USA","8015 5th Ave, Brooklyn, NY 11209, USA",40.625275,-74.024111,BROOKLYN,American
4,50015855,4339,MAIN ST,11355,"4339 MAIN ST, New York, NY 11355, USA","43-39 Main St, Flushing, NY 11355, USA",40.75163,-73.825819,QUEENS,Pakistani


In [5]:
# Turn the stored lon/lat columns into point geometries declared as EPSG:4326.
geo_res_df = gpd.GeoDataFrame(
    proc_df,
    geometry=gpd.points_from_xy(x=proc_df["lon"], y=proc_df["lat"], crs="epsg:4326"),
)

# The CRS matters a lot when calculating distances: reproject to EPSG:32118,
# a projection chosen for New York whose units are meters.
geo_res_df = geo_res_df.to_crs(epsg=32118)
geo_res_df.head()

Unnamed: 0,CAMIS,BUILDING,STREET,ZIPCODE,full_addr,goog_addr,lat,lon,BORO,CUISINE DESCRIPTION,geometry
0,40511702,635,SECOND AVENUE,10016,"635 SECOND AVENUE, New York, NY 10016, USA","635 2nd Ave, New York, NY 10016, USA",40.745295,-73.975777,MANHATTAN,Italian,POINT (302045.808 64254.577)
1,50046354,2507,BROADWAY,11106,"2507 BROADWAY, New York, NY 11106, USA","2507 Broadway, New York, NY 10025, USA",40.793126,-73.973306,QUEENS,Italian,POINT (302252.935 69566.211)
2,50061389,11C,HOLDEN BLVD,10314,"11C HOLDEN BLVD, New York, NY 10314, USA","11 Holden Blvd, Staten Island, NY 10314, USA",40.604667,-74.121197,STATEN ISLAND,Chinese,POINT (289742.322 48644.999)
3,41516263,8015,5 AVENUE,11209,"8015 5 AVENUE, New York, NY 11209, USA","8015 5th Ave, Brooklyn, NY 11209, USA",40.625275,-74.024111,BROOKLYN,American,POINT (297959.983 50926.648)
4,50015855,4339,MAIN ST,11355,"4339 MAIN ST, New York, NY 11355, USA","43-39 Main St, Flushing, NY 11355, USA",40.75163,-73.825819,QUEENS,Pakistani,POINT (314709.586 64972.457)


### Taxi Data Preprocessing

In [6]:
# Load only the taxi columns used in the analysis and parse the drop-off
# timestamp while reading.
taxi_df = pd.read_csv(
    "data/raw/data.zip",
    compression="zip",
    usecols=[
        "tpep_dropoff_datetime",
        "passenger_count",
        "tip_amount",
        "trip_distance",
        "dropoff_longitude",
        "dropoff_latitude",
    ],
    parse_dates=["tpep_dropoff_datetime"],
)

# Express the drop-off timestamps in New York local time.
# NOTE(review): tz_convert needs timezone-aware values, so this assumes the
# raw timestamps carry a UTC offset — confirm against the raw file.
taxi_df["tpep_dropoff_datetime"] = taxi_df["tpep_dropoff_datetime"].dt.tz_convert(
    "America/New_York"
)
print(taxi_df.dtypes)
taxi_df.head()


tpep_dropoff_datetime    datetime64[ns, America/New_York]
passenger_count                                     int64
trip_distance                                     float64
dropoff_longitude                                 float64
dropoff_latitude                                  float64
tip_amount                                        float64
dtype: object


Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,dropoff_longitude,dropoff_latitude,tip_amount
0,2015-01-15 14:23:42-05:00,1,1.59,-73.974785,40.750618,3.25
1,2015-01-15 14:32:00-05:00,1,2.38,-73.983978,40.757889,4.38
2,2015-01-15 14:21:00-05:00,5,2.83,-73.955124,40.786858,0.0
3,2015-01-15 14:28:18-05:00,5,8.33,-73.952713,40.785782,8.08
4,2015-01-15 14:20:36-05:00,1,2.37,-73.98085,40.786083,0.0


In [7]:
def meal_label(dts):
    """Classify a timestamp as "lunch", "dinner" or "other".

    Lunch runs from 11:30am to 2pm and dinner from 5pm to 9pm; both windows
    include their start and end times.

    Args:
        dts: Object whose ``.time()`` method returns the time of day
            (e.g. a ``datetime`` or pandas ``Timestamp``).

    Returns:
        "lunch" or "dinner" when the time of day falls inside the matching
        window, otherwise "other".
    """
    time_of_day = dts.time()
    meal_windows = (
        ("lunch", dt_time(11, 30), dt_time(14, 0)),
        ("dinner", dt_time(17, 0), dt_time(21, 0)),
    )
    for label, start, end in meal_windows:
        if start <= time_of_day <= end:
            return label
    return "other"


In [8]:
# Tag every trip with the meal window its drop-off time falls into.
taxi_df["meal"] = taxi_df["tpep_dropoff_datetime"].apply(meal_label)

# Keep only lunch and dinner drop-offs; everything labelled "other" is discarded.
taxi_df = taxi_df.loc[taxi_df["meal"].ne("other")]
taxi_df.head()

Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,dropoff_longitude,dropoff_latitude,tip_amount,meal
242,2015-01-15 12:08:39-05:00,1,8.8,-73.870796,40.773926,11.2,lunch
248,2015-01-15 12:12:21-05:00,1,17.2,-73.778236,40.644943,10.0,lunch
338,2015-01-15 11:31:39-05:00,1,1.3,-73.964592,40.770149,3.95,lunch
340,2015-01-15 12:41:40-05:00,1,17.5,-73.776321,40.645363,11.6,lunch
341,2015-01-15 11:30:30-05:00,1,2.0,-74.005852,40.750015,2.55,lunch


## Link the taxi and restaurant data spatially

In [9]:
# Build drop-off points from the longitude/latitude columns (EPSG:4326) and
# reproject them to EPSG:32118, the same CRS used for the restaurant points.
geo_taxi_df = gpd.GeoDataFrame(
    taxi_df,
    geometry=gpd.points_from_xy(
        x=taxi_df["dropoff_longitude"], y=taxi_df["dropoff_latitude"]
    ),
    crs="EPSG:4326",
).to_crs(epsg=32118)
geo_taxi_df.head()

Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,dropoff_longitude,dropoff_latitude,tip_amount,meal,geometry
242,2015-01-15 12:08:39-05:00,1,8.8,-73.870796,40.773926,11.2,lunch,POINT (310907.594 67441.776)
248,2015-01-15 12:12:21-05:00,1,17.2,-73.778236,40.644943,10.0,lunch,POINT (318757.891 53134.211)
338,2015-01-15 11:31:39-05:00,1,1.3,-73.964592,40.770149,3.95,lunch,POINT (302989.373 67014.953)
340,2015-01-15 12:41:40-05:00,1,17.5,-73.776321,40.645363,11.6,lunch,POINT (318919.750 53181.220)
341,2015-01-15 11:30:30-05:00,1,2.0,-74.005852,40.750015,2.55,lunch,POINT (299505.808 64778.512)


In [10]:
# Match each taxi drop-off to its nearest restaurant, considering only
# restaurants at most 50 CRS units away (meters in EPSG:32118). The matched
# distance is stored in the `dist` column.
comb_df = gpd.sjoin_nearest(
    geo_taxi_df,
    geo_res_df,
    max_distance=50,
    distance_col="dist",
)
comb_df.head()

Unnamed: 0,tpep_dropoff_datetime,passenger_count,trip_distance,dropoff_longitude,dropoff_latitude,tip_amount,meal,geometry,index_right,CAMIS,BUILDING,STREET,ZIPCODE,full_addr,goog_addr,lat,lon,BORO,CUISINE DESCRIPTION,dist
338,2015-01-15 11:31:39-05:00,1,1.3,-73.964592,40.770149,3.95,lunch,POINT (302989.373 67014.953),17814,50002506,725,PARK AVE,10021,"725 PARK AVE, New York, NY 10021, USA","725 Park Ave, New York, NY 10021, USA",40.769851,-73.964323,MANHATTAN,Asian,40.150588
12957,2015-01-15 18:53:32-05:00,1,0.3,-73.964729,40.769615,0.0,dinner,POINT (302977.802 66955.642),17814,50002506,725,PARK AVE,10021,"725 PARK AVE, New York, NY 10021, USA","725 Park Ave, New York, NY 10021, USA",40.769851,-73.964323,MANHATTAN,Asian,43.110574
21280,2015-01-15 17:50:12-05:00,1,1.48,-73.964539,40.770241,1.7,dinner,POINT (302993.878 67025.122),17814,50002506,725,PARK AVE,10021,"725 PARK AVE, New York, NY 10021, USA","725 Park Ave, New York, NY 10021, USA",40.769851,-73.964323,MANHATTAN,Asian,46.959409
23497,2015-01-15 13:34:20-05:00,1,1.18,-73.964439,40.770107,0.0,lunch,POINT (303002.257 67010.299),17814,50002506,725,PARK AVE,10021,"725 PARK AVE, New York, NY 10021, USA","725 Park Ave, New York, NY 10021, USA",40.769851,-73.964323,MANHATTAN,Asian,30.1145
25969,2015-01-15 17:26:16-05:00,4,1.42,-73.964462,40.770012,1.0,dinner,POINT (303000.329 66999.708),17814,50002506,725,PARK AVE,10021,"725 PARK AVE, New York, NY 10021, USA","725 Park Ave, New York, NY 10021, USA",40.769851,-73.964323,MANHATTAN,Asian,21.385366


## Create An Exploratory Map

I used Kepler.gl and configured it so that each taxi drop-off is drawn as an arc to the closest restaurant within a 50-meter radius.
I added the kepler_map.html file so you can view the map without having to run the notebook.

In [20]:
# Convert the drop-off timestamps to plain strings before handing the frame
# to Kepler (presumably so they serialize cleanly — TODO confirm).
comb_df['tpep_dropoff_datetime'] = comb_df['tpep_dropoff_datetime'].astype(str)

df_map = kp.KeplerGl(height=850, width=650)
df_map.add_data(data=comb_df, name="NYC Taxi Restaurant Inspections")

# Load the saved map configuration; the context manager closes the file
# handle, which the previous bare `json.load(open(...))` left open.
with open("data/settings/kepler_config.json", "r") as config_file:
    df_map.config = json.load(config_file)
df_map

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': 'rurmdio', 'type': …

In [22]:
# Save the map to an HTML file (kepler_map.html) so it can be viewed without running the notebook.
# df_map.save_to_html(file_name="kepler_map.html")

Map saved to kepler_map.html!


In [None]:
# only needed when saving kepler config

# with open('data/settings/kepler_config.json', 'w') as f:
#     f.write(json.dumps(df_map.config, indent=2))

## Answers to Exploratory Data Analysis Questions
(a)

In [39]:
# How far do people travel based on different types of cuisine (“CUISINE DESCRIPTION”)?
# Total trip distance per cuisine, plus each cuisine's share of the overall
# total (`perc` is a 0-1 fraction here).
comb_res = comb_df.groupby(["CUISINE DESCRIPTION"])[["trip_distance"]].sum()
comb_res["perc"] = comb_res["trip_distance"].div(comb_res["trip_distance"].sum())
comb_res.sort_values(by="perc", ascending=False).head(10)

Unnamed: 0_level_0,trip_distance,perc
CUISINE DESCRIPTION,Unnamed: 1_level_1,Unnamed: 2_level_1
American,26270.52,0.344554
CafÃ©/Coffee/Tea,7172.25,0.094068
Italian,4058.74,0.053233
Other,3483.35,0.045686
Japanese,2529.18,0.033172
Chinese,2352.86,0.030859
Pizza,2346.21,0.030772
French,2074.09,0.027203
Mexican,1858.21,0.024372
Sandwiches/Salads/Mixed Buffet,1404.51,0.018421


In [26]:
# How does this differ based on the borough where the restaurant is located (“BORO”, one of 5 large NYC neighborhoods)?
# Total trip distance per (borough, cuisine) pair.
comb_res = comb_df.groupby(["BORO", "CUISINE DESCRIPTION"])[["trip_distance"]].sum()

# Each cuisine's percentage of its own borough's total distance.
comb_res["perc"] = comb_res.groupby(level=0, group_keys=False).apply(
    lambda grp: grp.mul(100).div(grp.sum())
)

# Ten largest cuisines by share within every borough.
(
    comb_res.reset_index()
    .groupby("BORO")
    .apply(lambda grp: grp.nlargest(10, ["perc", "trip_distance"]))
    .reset_index(drop=True)
)

Unnamed: 0,BORO,CUISINE DESCRIPTION,trip_distance,perc
0,BRONX,American,87.5,16.026228
1,BRONX,Spanish,71.36,13.070076
2,BRONX,Chinese,67.56,12.37408
3,BRONX,Pizza,58.95,10.797099
4,BRONX,Caribbean,56.95,10.430785
5,BRONX,"Latin (Cuban, Dominican, Puerto Rican, South &...",37.86,6.93432
6,BRONX,Pizza/Italian,28.47,5.214477
7,BRONX,Other,24.46,4.480018
8,BRONX,Donuts,19.9,3.644822
9,BRONX,Mexican,16.54,3.029415


In [42]:
# How does this differ by meal time?
# Total trip distance per (meal, cuisine) pair.
comb_res = comb_df.groupby(["meal", "CUISINE DESCRIPTION"])[["trip_distance"]].sum()

# Each cuisine's percentage of its own meal window's total distance.
comb_res["perc"] = comb_res.groupby(level=0, group_keys=False).apply(
    lambda grp: grp.mul(100).div(grp.sum())
)

# Fifteen largest cuisines by share within every meal window.
(
    comb_res.reset_index()
    .groupby("meal")
    .apply(lambda grp: grp.nlargest(15, ["perc", "trip_distance"]))
    .reset_index(drop=True)
)

Unnamed: 0,meal,CUISINE DESCRIPTION,trip_distance,perc
0,dinner,American,15073.67,32.673569
1,dinner,CafÃ©/Coffee/Tea,4333.06,9.392307
2,dinner,Italian,2504.03,5.427716
3,dinner,Other,2124.53,4.605115
4,dinner,Chinese,1764.74,3.825237
5,dinner,Pizza,1578.02,3.420504
6,dinner,Japanese,1471.8,3.190262
7,dinner,Mexican,1263.53,2.738818
8,dinner,French,1237.48,2.682352
9,dinner,Donuts,895.82,1.941772


#### (a) Conclusion
Based on the assumption that all taxi trips between lunchtime (11:30am - 2pm) and dinnertime (5pm - 9pm) were made to restaurants, 
it appears that American cuisine is the most popular type of food people travel for in taxis. This holds true across boroughs and meal times, with coffee/tea a distant second in most cases.

### (b)

In [27]:
# What is the average tip amount for each cuisine type?
# `nlargest` already returns its rows ordered from largest to smallest, so
# the separate `sort_values` that used to precede it was redundant.
comb_res = (
    comb_df.groupby(["CUISINE DESCRIPTION"])
    .agg({"tip_amount": "mean"})
    .nlargest(30, "tip_amount")
)
comb_res


Unnamed: 0_level_0,tip_amount
CUISINE DESCRIPTION,Unnamed: 1_level_1
"Bottled beverages, including water, sodas, juices, etc.",3.570865
Bangladeshi,2.96
Portuguese,2.901
Scandinavian,2.883333
Soul Food,2.8175
Creole,2.81
Eastern European,2.808293
Polish,2.612
Russian,2.564667
Caribbean,2.409811


In [43]:
# How does this differ by borough?
# Mean tip per (borough, cuisine), then the 7 highest-tipping cuisines in
# each borough. The final reset_index removes the extra BORO level that the
# grouped nlargest adds in front of the existing index.
(
    comb_df.groupby(["BORO", "CUISINE DESCRIPTION"])[["tip_amount"]]
    .mean()
    .groupby("BORO")["tip_amount"]
    .nlargest(7)
    .reset_index(level=0, drop=True)
)

BORO           CUISINE DESCRIPTION                                    
BRONX          Steak                                                       6.000000
               American                                                    4.321111
               Mexican                                                     3.750000
               Caribbean                                                   3.730000
               Pizza/Italian                                               3.400000
               Pizza                                                       2.595000
               Chinese                                                     1.540000
BROOKLYN       Eastern European                                            8.040000
               Russian                                                     6.950000
               Peruvian                                                    4.650000
               English                                                     3.880000
     

In [44]:
# How does this differ by meal time?
# Mean tip per (meal, cuisine), then the 15 highest-tipping cuisines for each
# meal window. The final reset_index removes the extra meal level that the
# grouped nlargest adds in front of the existing index.
(
    comb_df.groupby(["meal", "CUISINE DESCRIPTION"])[["tip_amount"]]
    .mean()
    .groupby("meal")["tip_amount"]
    .nlargest(15)
    .reset_index(level=0, drop=True)
)

meal    CUISINE DESCRIPTION                                             
dinner  Portuguese                                                          3.416667
        Eastern European                                                    3.271875
        Bangladeshi                                                         3.246667
        Creole                                                              2.810000
        Polish                                                              2.594615
        Caribbean                                                           2.532619
        Russian                                                             2.415000
        Not Listed/Not Applicable                                           2.272222
        Armenian                                                            2.030769
        Brazilian                                                           2.005714
        Latin (Cuban, Dominican, Puerto Rican, South & Central American)    1

In [46]:
# By number of passengers in the taxi?
# Mean tip per (passenger_count, cuisine), then the 5 highest-tipping
# cuisines for each passenger count. The final reset_index removes the extra
# passenger_count level that the grouped nlargest adds in front of the index.
(
    comb_df.groupby(["passenger_count", "CUISINE DESCRIPTION"])[["tip_amount"]]
    .mean()
    .groupby("passenger_count")["tip_amount"]
    .nlargest(5)
    .reset_index(level=0, drop=True)
)

passenger_count  CUISINE DESCRIPTION                                    
0                Italian                                                    2.850000
                 Other                                                      2.850000
                 French                                                     2.350000
                 Donuts                                                     2.000000
                 Pizza                                                      1.950000
1                Creole                                                     5.620000
                 Soul Food                                                  3.907273
                 Bottled beverages, including water, sodas, juices, etc.    3.817692
                 Ethiopian                                                  3.238571
                 Russian                                                    3.007000
2                Hotdogs                                                    6

#### (b) Conclusion
Based on the data, it appears that people tend to tip more for "Bottled beverages..." than for other cuisine types. The borough of Queens has a higher average tip for its top 7 cuisine types than the other boroughs. Additionally, on average, people tend to tip more for lunch than for dinner across their top 15 cuisine types. Lastly, there seems to be a positive correlation between the number of passengers and the average tip, with larger groups tending to tip more. Some trips record a passenger count of zero; these need further investigation.

## Predictive Model Setup
This section outlines how to set up a model that predicts the restaurant cuisine type a taxi rider will visit, based on information present in the two datasets.



* Infrastructure & Deployment: I strongly recommend using a Docker container to facilitate seamless and efficient scaling and deployment across multiple cloud services such as GCP, AWS, Azure, and GitHub. It's worth noting that this repository is already Dockerized, making it easier to set up the necessary environment. Additionally, we can use MLflow to streamline the machine learning lifecycle and ensure better version control. By adopting these tools, we can optimize our budget, reduce the chances of making mistakes, and minimize the time spent on debugging environment setups.
* Data: We can start by analyzing passenger counts and time of day/meal using the current datasets. Additionally, we could explore other datasets such as the types of businesses at the starting point of the ride, the average income for the starting and endpoint of the ride, weather data, the average nutritional value by cuisine type in the area, and the average item price by cuisine type in the area. While some of these datasets are readily available, others may require verification.
* Model:  As the datasets are not expected to be too large, we can use standard machine learning techniques like XGBoost. To ensure transparency and interpretability, we can use explainable machine learning methods that help us understand the factors leading to the model's predictions.