In [None]:
import numpy as np
import os
import pandas as pd

In [None]:
# Row cap for the CSV load; None means the whole file is read.
nRowsRead = None
df = pd.read_csv(
    "/kaggle/input/tripadvisor-european-restaurants/tripadvisor_european_restaurants.csv",
    sep=",",
    nrows=nRowsRead,
)
# Report the size of what was loaded.
nRow = df.shape[0]
nCol = df.shape[1]
print(f"There are {nRow} rows and {nCol} columns")

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes, memory use).
df.info()

In [29]:
import json

def parse_and_flatten_open_hours(row):
    """Flatten a JSON opening-hours string into a list of per-range records.

    The input is expected to be a JSON object mapping a day name to a list of
    "HH:MM-HH:MM" strings. Each range becomes one dict with the keys
    'day', 'open' and 'close', where the times are fractional hours
    (e.g. "14:30" -> 14.5).

    Args:
        row: Raw cell value; anything that is not a non-blank string yields [].

    Returns:
        A list of {'day', 'open', 'close'} dicts in input order. If the JSON
        cannot be decoded or any entry fails to parse, an empty list is
        returned for the whole row (best-effort, all-or-nothing).
    """
    # Only non-blank strings can carry a schedule.
    if not (isinstance(row, str) and row.strip()):
        return []

    def clock_to_hours(clock):
        # "HH:MM" -> hours as a float
        hours, minutes = (int(part) for part in clock.split(':'))
        return hours + minutes / 60

    def parse_span(span):
        # "HH:MM-HH:MM" -> (opening, closing) in fractional hours
        opening, closing = span.split('-')
        return clock_to_hours(opening), clock_to_hours(closing)

    try:
        schedule = json.loads(row)
        return [
            {'day': day, 'open': opening, 'close': closing}
            for day, spans in schedule.items()
            for opening, closing in map(parse_span, spans)
        ]
    except Exception:
        # Any malformed piece discards the whole row's schedule.
        return []

def transform_to_bool(row):
    """Translate a "Y"/"N" flag value into a boolean.

    NOTE(review): the original body was cut off after ``if row`` (a syntax
    error that stopped the whole cell from compiling). It has been completed
    as a "Y" -> True / "N" -> False converter, which is the mapping this
    notebook applies to its flag columns - confirm this is the intended
    behaviour.

    Args:
        row: A single cell value.

    Returns:
        True for "Y", False for "N", and None for any other value
        (including missing values and non-strings).
    """
    # Non-strings (NaN, None, pd.NA, ...) carry no flag.
    if not isinstance(row, str):
        return None
    return {"Y": True, "N": False}.get(row)

In [None]:
def _split_comma_list(value):
    """Split a ", "-separated string into a list; missing values become []."""
    return value.split(", ") if pd.notnull(value) else []


# Remove columns that are not carried into the cleaned frame.
# `drop` returns a new frame, so `df` itself is left untouched.
dropped = df.drop(
    columns=[
        "keywords",
        "original_location",
        "special_diets",
        "working_shifts_per_week",
        "claimed",
        "popularity_generic",
        "open_hours_per_week",
        "default_language",
    ]
)

# Keep only rows that have a value in every one of these fields.
dropped = dropped.dropna(
    subset=[
        "original_open_hours",
        "latitude",
        "longitude",
        "price_range",
        "meals",
        "cuisines",
        "avg_rating",
        "food",
    ]
)

# Combine the coordinates into a single [latitude, longitude] list per row.
dropped["location"] = dropped[["latitude", "longitude"]].values.tolist()

# Turn the comma-separated text columns into lists of strings.
for list_column in ["features", "meals", "cuisines", "top_tags", "awards"]:
    dropped[list_column] = dropped[list_column].apply(_split_comma_list)

# Expand the JSON opening hours into a list of {'day', 'open', 'close'} records.
dropped["original_open_hours"] = dropped["original_open_hours"].apply(
    parse_and_flatten_open_hours
)

# Pull the two numbers out of the price range text; rows that do not match
# the pattern get NaN here and are removed by the final dropna below.
dropped[["price_min", "price_max"]] = (
    dropped["price_range"]
    .str.extract(r"€?(\d+(?:\.\d+)?)\-€?(\d+(?:\.\d+)?)")
    .astype(float)
)

# "Y"/"N" flags become booleans; any other value maps to NaN.
yes_no_to_bool = {"Y": True, "N": False}
dropped["vegetarian_friendly"] = dropped["vegetarian_friendly"].map(yes_no_to_bool)
dropped["gluten_free"] = dropped["gluten_free"].map(yes_no_to_bool)

# Missing review counts are treated as zero before the integer cast.
dropped["total_reviews_count"] = dropped["total_reviews_count"].fillna(0).astype(int)
# Nullable integer dtype so missing values survive the cast.
dropped["open_days_per_week"] = dropped["open_days_per_week"].astype("Int64")

# Reassign instead of inplace=True so the step reads as a plain transformation.
dropped = dropped.rename(columns={"original_open_hours": "open_hours"})
dropped = dropped.drop(
    columns=["latitude", "longitude", "price_range", "price_level", "vegan_options"]
)
dropped = dropped.dropna(subset=["price_min"])

# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
dropped.info()
print(dropped["country"].value_counts())
dropped.head(5)

In [None]:
# Export the cleaned frame as JSON Lines (one record per line). Passing the
# path lets pandas open, write and close the file itself; the original used an
# f-string with no placeholders and a manual open()/write() for the same result.
dropped.to_json("/kaggle/working/restaurants.jsonl", orient="records", lines=True)