In [1]:
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd

# Fetch the latest version of the Restaurant Week 2018 dataset through
# kagglehub and keep the returned local location of the dataset files.
DATASET_HANDLE = "popoandrew/restaurant-week-2018"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/popoandrew/restaurant-week-2018?dataset_version_number=2...


100%|██████████| 104k/104k [00:00<00:00, 11.9MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/popoandrew/restaurant-week-2018/versions/2





In [2]:
# Load the raw Restaurant Week table from the directory reported by the
# download cell. Building the location from `path` (instead of repeating an
# absolute cache path) keeps the notebook runnable on other machines, under
# other users, and when a newer dataset version is downloaded.
df = pd.read_csv(Path(path) / "restaurant_week_2018_final.csv")

In [3]:
# Show every column available in the raw table, then keep only the features
# of interest.
print(df.columns)
columns = [
    'name',
    'average_review', 'review_count',
    'star_1', 'star_2', 'star_3', 'star_4', 'star_5',
    'price_range',
    'street_address', 'latitude', 'longitude', 'postal_code',
    'restaurant_type',
]
# Work on an independent copy so later edits to newdf never touch df.
newdf = df[columns].copy()


Index(['name', 'street_address', 'google_map', 'review_count', 'phone',
       'website', 'restaurant_type', 'average_review', 'food_review',
       'service_review', 'ambience_review', 'value_review', 'price_range',
       'star_1', 'star_2', 'star_3', 'star_4', 'star_5', 'description',
       'restaurant_main_type', 'latitude', 'longitude', 'postal_code'],
      dtype='object')


In [5]:
# Report how many entries are missing in each column (printed before the
# manual patch below is applied).
missing_per_column = newdf.isna().sum()
print(missing_per_column)

# Manual patch: the row labelled 346 gets postal code 11209.
# NOTE(review): the row label is hard-coded; confirm it still identifies the
# intended restaurant if the source CSV is ever re-ordered or updated.
newdf.loc[346, "postal_code"] = 11209


# Store postal codes as 64-bit integers.
postal_as_int = newdf["postal_code"].astype(np.int64)
newdf["postal_code"] = postal_as_int

name               0
average_review     0
review_count       0
star_1             0
star_2             0
star_3             0
star_4             0
star_5             0
price_range        0
street_address     0
latitude           0
longitude          0
postal_code        0
restaurant_type    0
dtype: int64


In [6]:
# Pull the per-star columns, highest rating first, and give them
# display-friendly labels.
star_columns = ['star_5', 'star_4', 'star_3', 'star_2', 'star_1']
rate_count_df = newdf.loc[:, star_columns]
rate_count_df.columns = ['Rating 5', 'Rating 4', 'Rating 3', 'Rating 2', 'Rating 1']

# Transposing first makes to_dict() produce one {label: value} dict per
# original row, keyed by that row's index label.
rate_count_dict = rate_count_df.T.to_dict()
keys = rate_count_dict.keys()

# Stringify each row's dict and collect the strings in a one-column frame.
dict_list = [str(rate_count_dict[key]) for key in keys]
dict_list_df = pd.DataFrame(dict_list)
dict_list_df

Unnamed: 0,0
0,"{'Rating 5': 62, 'Rating 4': 24, 'Rating 3': 5..."
1,"{'Rating 5': 72, 'Rating 4': 19, 'Rating 3': 6..."
2,"{'Rating 5': 77, 'Rating 4': 15, 'Rating 3': 5..."
3,"{'Rating 5': 74, 'Rating 4': 15, 'Rating 3': 6..."
4,"{'Rating 5': 60, 'Rating 4': 27, 'Rating 3': 9..."
...,...
343,"{'Rating 5': 73, 'Rating 4': 19, 'Rating 3': 3..."
344,"{'Rating 5': 62, 'Rating 4': 19, 'Rating 3': 1..."
345,"{'Rating 5': 43, 'Rating 4': 22, 'Rating 3': 1..."
346,"{'Rating 5': 62, 'Rating 4': 33, 'Rating 3': 5..."


In [15]:
# Place the stringified ratings beside the selected features, then label every
# column explicitly; the last (previously unnamed) column becomes
# 'Detailed Ratings'.
combined = pd.concat([newdf, dict_list_df], axis=1)
combined.columns = [
    'name', 'average_review', 'review_count',
    'star_1', 'star_2', 'star_3', 'star_4', 'star_5',
    'price_range', 'street_address', 'latitude', 'longitude',
    'postal_code', 'restaurant_type',
    'Detailed Ratings',
]
#Reorder columns to match other data
# NOTE(review): this selection also drops the star, latitude/longitude,
# postal_code and 'Detailed Ratings' columns; confirm that is intended.
final_df = combined[[
    'name', 'average_review', 'review_count',
    'price_range', 'street_address',
    'restaurant_type',
]]

In [16]:
# Display the cleaned table to check the final column selection.
final_df

Unnamed: 0,name,average_review,review_count,price_range,street_address,restaurant_type
0,Smoke Jazz and Supper Club,4.421550,2155,$31 to $50,"2751 Broadway New York, NY 10025",Contemporary American
1,Tavern on the Green,4.670290,7029,$31 to $50,"1 Tavern on the Green New York, NY 10023",American
2,ABC Kitchen,4.760310,6031,$31 to $50,"35 East 18th Street New York, NY 10003",Contemporary American
3,Catch New York,4.548300,4830,$31 to $50,"21 Ninth Avenue New York, NY 10014",Seafood
4,Becco,4.418139,18139,$30 and under,"355 West 46th Street New York, NY 10010",Italian
...,...,...,...,...,...,...
343,ATRIO Wine Bar & Restaurant,4.621500,215,$31 to $50,"102 North End Avenue New York, NY 10282",Contemporary American
344,Bobby Van's Steakhouse - Broad Street,4.476800,768,$31 to $50,"25 Broad Street New York, NY 10004",Steakhouse
345,Brasserie Seoul,3.820000,20,$30 and under,"300 Schermerhorn St. Brooklyn, NY 11217",Contemporary French
346,Greenhouse Cafe,4.686000,86,$30 and under,"7717 3rd Avenue Brooklyn, NY 11209-3001",Continental


In [17]:
# Write the cleaned table to a CSV file in the current working directory.
# NOTE(review): to_csv writes the row index as an extra leading column by
# default; pass index=False if downstream readers should not receive it —
# confirm against whatever consumes this file before changing.
final_df.to_csv('nyc_restaurant_week_2018_cleaned.csv')