# Documentation

## Interesting metrics

- Average, Min, Max operating hours
- Earliest starting time
- Latest ending time
- Bucket into breakfast, lunch, dinner, supper
- Day of week with the most closures
- Group based on name similarity
- Group based on when they operate
- Missing ending time
- Open on weekends

## Use Cases

- What's open right now 
- Search based on name of restaurant (exact and fuzzy)
- Filter based on day of week, weekends, operating time

## Improvements

- Error handling when parsing CSV file
- Write metrics to DuckDB and use it for querying
- Timezone handling
- Group similar inputs together to compensate for inconsistent data entry

## Scalable Solution

- Upload CSV files to S3
- Run a Flink job to watch for new files in the S3 directory
- Incremental update to Elasticsearch / Druid 
- Superset on Elasticsearch / Druid

## Database Access Pattern

- Store days of week as a bitstring, one bit per day, indicating open (1) or closed (0)


# Setup

In [None]:
# !pip install "modin[all]"
# !pip install pandas==1.4.2
# !pip install ipyfilechooser
# !pip install duckdb==0.3.4
# !pip install pyarrow
# !pip install swifter
# difflib is part of the Python standard library (no pip install needed)
# !pip install fuzzywuzzy
# !pip install python-Levenshtein
# !pip install panel
# !pip install plotly
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Import

In [None]:
import modin.pandas as pd
import datetime
import itertools
from ipyfilechooser import FileChooser
import duckdb
import pyarrow as pa
from pyarrow import csv
import pyarrow.dataset as ds
import swifter
from collections import Counter
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import panel as pn
import plotly.express as px
import plotly.graph_objects as go
# Load Panel's 'tabulator' extension together with the Font Awesome stylesheet.
pn.extension('tabulator', css_files=[pn.io.resources.CSS_URLS['font-awesome']])

# Widgets

## CSV

In [None]:
# File picker for the input CSV: starts in the working directory and lists
# only entries matching *.csv
csv_chooser = FileChooser(
    '.',
    title='<b>Upload CSV</b>',
    filter_pattern="*.csv",
)
display(csv_chooser)

## DB

In [None]:
# File picker for the optional database file, starting in the working directory
db_chooser = FileChooser('.')
db_chooser.title = '<b>Upload DB (optional)</b>'
# Match database files (the commented-out fallback name is 'duck.db');
# a "*.csv" filter here would hide every database file from the picker.
db_chooser.filter_pattern = ["*.db", "*.duckdb"]
display(db_chooser)

# Input Resolution

In [None]:
# Use the chooser's selection when there is one, otherwise the default path
DEFAULT_FILE_PATH = './data/input/dinning_places_open_hrs.csv'
input_file_path = csv_chooser.selected or DEFAULT_FILE_PATH
# db_file_path = db_chooser.selected or 'duck.db'
# db_conn = duckdb.connect(db_file_path)
# Skip the first row (presumably the header) and name the two columns explicitly
read_options = csv.ReadOptions(
    skip_rows=1,
    column_names=["dining_place_name", "opening_time"],
)
input_csv = csv.read_csv(input_file_path, read_options=read_options)
df = input_csv.to_pandas()
# Create table based on raw Panda Dataframe
# db_conn.execute('CREATE TABLE IF NOT EXISTS raw_dinning_places AS SELECT * FROM df')

# Functions

In [None]:
# Lower-case weekday abbreviations; the list position is the weekday index
# (mon=0 ... sun=6), which lines up with datetime's weekday() numbering.
dow_map = [
    'mon', 'tue', 'wed', 'thu',
    'fri', 'sat', 'sun',
]

def matchDiningPlace(name, search_text):
    """Tell whether a dining place name matches any of the search keywords.

    Args:
        name: Dining place name to test.
        search_text: Iterable of keyword strings.

    Returns:
        True as soon as a keyword is shorter than four characters (too short
        to fuzzy-match, so it is treated as "no filter") or scores above 40
        with ``fuzz.token_sort_ratio``; False when no keyword matches,
        including when ``search_text`` is empty.
    """
    for keyword in search_text:
        # An empty keyword has length 0, so this also covers the blank search box.
        if len(keyword) < 4:
            return True
        # Score against the individual keyword, not the whole keyword list.
        score = fuzz.token_sort_ratio(name, keyword)
        if score > 40:
            return True
    return False

def findDiningPlace(names, df):
    """Return the rows of ``df`` whose name is the closest match to any query.

    Args:
        names: Iterable of query strings.
        df: Frame with a ``dining_place_name`` column.

    Returns:
        The subset of ``df`` whose ``dining_place_name`` equals the top
        ``process.extractOne`` result for at least one query.
    """
    best_matches = []
    for query in names:
        best_name, *_ = process.extractOne(query, df['dining_place_name'].values)
        best_matches.append(best_name)
    keep = df['dining_place_name'].isin(best_matches)
    return df[keep]

def parse_timing(timing):
    """Convert a time range such as ``"11:30 am - 9 pm"`` to ``"1130-2100"``.

    Each side of the ``-`` is read as ``<hour>[:<minute>] [am|pm]`` and
    emitted as zero-padded 24-hour ``HHMM``. A side with no content becomes
    an empty string, so a missing end time yields ``"HHMM-"``.

    Args:
        timing: Free-text time range; falsy values give ``""``.

    Returns:
        The parsed sides joined with ``-``. If a side cannot be parsed, the
        error is printed and only the sides parsed before it are returned.
    """
    if not timing:
        return ""
    parsed_timing = []
    try:
        for piece in timing.strip().split('-'):
            tokens = piece.split()
            if not tokens:
                # Keep the placeholder so "11 am -" still produces "1100-".
                parsed_timing.append('')
                continue
            clock_parts = tokens[0].split(":")
            hour = clock_parts[0]
            minute = clock_parts[1] if len(clock_parts) > 1 else "00"
            # Lower-casing accepts "PM", "pm" and mixed-case spellings alike.
            meridiem = tokens[1].lower() if len(tokens) > 1 else ''
            if 'pm' in meridiem and int(hour) != 12:
                hour = str(int(hour) + 12)
            elif 'am' in meridiem and int(hour) == 12:
                # 12 am is midnight, i.e. hour 00 on the 24-hour clock.
                hour = "0"
            parsed_timing.append(hour.zfill(2) + minute if hour else '')
    except (AttributeError, ValueError) as err:
        # Best effort: report the problem and keep whatever was parsed so far.
        print("Error parsing timing", err)
    return '-'.join(parsed_timing)
            

def process_section(section, opening_time):
    """Write one block's operating time into the per-weekday list.

    ``section`` is the comma-split form of a block such as
    ``"Mon-Thu, Sun 11:30 am - 9 pm"``: every entry starts with a day or a
    ``Day-Day`` range, and the last entry also carries the time text that
    applies to all entries of the block.

    Args:
        section: List of entry strings for one block.
        opening_time: List indexed like ``dow_map``; updated in place.

    Returns:
        The same ``opening_time`` list. On a parse error the error is
        printed and the list is returned with whatever was already written.
    """
    try:
        # Everything after the first space of the last entry is the time
        # text; an entry with no space carries no time at all.
        _, _, operating_time = section[-1].strip().partition(" ")
        if not operating_time:
            return opening_time
        week_length = len(dow_map)
        for entry in section:
            days = entry.strip().split(" ")[0].split("-")
            start_idx = dow_map.index(days[0].lower())
            end_idx = dow_map.index(days[-1].lower())
            if end_idx < start_idx:
                # Range wraps past the end of the week, e.g. "Sat-Mon".
                end_idx += week_length
            for position in range(start_idx, end_idx + 1):
                opening_time[position % week_length] = operating_time
        return opening_time
    except (ValueError, IndexError, AttributeError) as err:
        # Best effort: an unknown day name or malformed entry must not
        # abort the whole row.
        print("Error processing section", err)
        return opening_time
    
    
def parse_time(row):
    """Expand a raw opening-hours string into seven per-weekday time strings.

    The string is split on ``/`` into blocks, each block on ``,`` into
    entries, and every block is applied to the same seven-slot list by
    ``process_section``.

    Args:
        row: Raw opening-hours text for one dining place.

    Returns:
        A list of seven strings; a slot stays ``''`` when no block set it.
    """
    operating_time = [''] * 7
    for block in row.strip().split("/"):
        # process_section fills operating_time in place.
        process_section(block.strip().split(","), operating_time)
    return operating_time

def parse_start_end_time(period):
    """Split ``"start-end"`` into ``[start, end]``.

    The end is ``''`` when ``period`` contains no ``-``; anything after a
    second ``-`` is ignored.
    """
    # Padding with '' guarantees a second element to slice off.
    padded = period.split('-') + ['']
    return padded[:2]

def update_operating_time(df, dow_map):
    """Add ``<day>_start_time`` and ``<day>_end_time`` columns for every day.

    Args:
        df: Frame with a ``parsed_timing`` column of comma-separated
            per-day ranges.
        dow_map: Day names; the position of a name selects the matching
            comma-separated field.

    Returns:
        The same frame with the new columns added.
    """
    suffixes = ("start_time", "end_time")
    for position, day_name in enumerate(dow_map):
        for slot, suffix in enumerate(suffixes):
            def pick(record, position=position, slot=slot):
                day_timing = record['parsed_timing'].split(',')[position]
                return parse_start_end_time(day_timing)[slot]

            df[f"{day_name}_{suffix}"] = df.swifter.apply(pick, axis=1)
    return df

def _summarize_operating_time(parsed_timing):
    """Summarise one row's comma-separated per-day ranges.

    Returns a tuple ``(most_common_operating_time, duration_in_hrs,
    breakfast, lunch, dinner, supper)``. The duration is NaN and all flags
    are False when there is no open day or the range has no valid start
    and end time.
    """
    unknown = (float('nan'), False, False, False, False)
    # Closed days are empty strings; they must not win the frequency count.
    open_periods = [period for period in parsed_timing.split(',') if period]
    if not open_periods:
        return ('',) + unknown
    most_common = Counter(open_periods).most_common(1)[0][0]
    start_time, _, end_time = most_common.partition('-')
    try:
        start_datetime = datetime.datetime.strptime(start_time, '%H%M')
        end_datetime = datetime.datetime.strptime(end_time, '%H%M')
    except ValueError:
        # Missing or malformed start/end time: nothing reliable to compute.
        return (most_common,) + unknown
    start_hhmm = start_datetime.hour * 100 + start_datetime.minute
    end_hhmm = end_datetime.hour * 100 + end_datetime.minute
    if end_hhmm < start_hhmm:
        # Closing time falls on the next day.
        end_datetime += datetime.timedelta(days=1)
        end_hhmm += 2400
    duration_in_hrs = (end_datetime - start_datetime).total_seconds() / 3600
    breakfast = 600 <= start_hhmm < 1000 and end_hhmm >= 1100
    lunch = start_hhmm < 1300 and end_hhmm > 1400
    dinner = start_hhmm < 1900 and end_hhmm > 2100
    # An overnight closing time is >= 2400 here, so it always counts as supper.
    supper = end_hhmm >= 2200
    return (most_common, duration_in_hrs, breakfast, lunch, dinner, supper)


def update_operating_time_metrics(df):
    """Add operating-time metric columns derived from ``parsed_timing``.

    For every row the most frequent non-empty daily range is taken as the
    representative operating time, and these columns are written:
    ``most_common_operating_time``, ``duration_in_hrs``, ``breakfast``,
    ``lunch``, ``dinner`` and ``supper``.

    Args:
        df: Frame with a ``parsed_timing`` column of comma-separated
            ``HHMM-HHMM`` ranges, one per day.

    Returns:
        The same frame, modified in place, so callers may rebind it.
    """
    columns = (
        'most_common_operating_time',
        'duration_in_hrs',
        'breakfast',
        'lunch',
        'dinner',
        'supper',
    )
    summaries = [_summarize_operating_time(timing) for timing in df['parsed_timing']]
    for position, column in enumerate(columns):
        df[column] = [summary[position] for summary in summaries]
    return df
        
def filter_row(row, search_text, is_open_now, filtered_day, start_time, end_time):
    """Decide whether one dining place passes the dashboard filters.

    Args:
        row: Mapping with ``parsed_timing`` (comma-separated ``HHMM-HHMM``
            per weekday) and ``dining_place_name``.
        search_text: Comma-separated keywords for the name match.
        is_open_now: When true, the day and time filters are replaced by
            the current weekday and the current time.
        filtered_day: Weekday indexes the place must be open on (all of them).
        start_time: ``HHMM`` the place must already be open at, or ``''``.
        end_time: ``HHMM`` the place must still be open at, or ``''``.

    Returns:
        True when the place is open on every selected day, covers the
        requested time window on each of them, and its name matches.
    """
    if is_open_now:
        now = datetime.datetime.today()
        filtered_day = [now.weekday()]
        # "Open now" means opened at or before now AND not yet closed.
        start_time = end_time = now.strftime('%H%M')
    if filtered_day:
        daily_timing = row['parsed_timing'].split(',')
        for day_idx in filtered_day:
            opening_time = daily_timing[day_idx]
            if opening_time == '':
                # Closed on one of the requested days.
                return False
            parts = opening_time.split('-')
            start = parts[0]
            end = parts[1] if len(parts) > 1 else ''
            # Zero-padded HHMM strings compare correctly as text.
            if start_time != '' and start != '' and start_time < start:
                return False
            if end_time != '' and end != '' and end_time > end:
                # A closing time earlier than the opening time is on the next
                # day, so a requested time at or after opening is still inside
                # the window. Hours carried over from the previous day's
                # overnight opening are not considered.
                closes_next_day = start != '' and end < start
                if not (closes_next_day and end_time >= start):
                    return False
    return matchDiningPlace(row['dining_place_name'], search_text.split(','))

def draw_dining_hours_chart(df):
    """Build a pie chart of how many places serve each meal period.

    Args:
        df: Frame with boolean ``breakfast``, ``lunch``, ``dinner`` and
            ``supper`` columns.

    Returns:
        A ``go.FigureWidget`` wrapping the pie chart. Only the meal counts
        are charted; duration and closed-day statistics are not part of
        the figure.
    """
    meal_columns = [
        ('Breakfast', 'breakfast'),
        ('Lunch', 'lunch'),
        ('Dinner', 'dinner'),
        ('Supper', 'supper'),
    ]
    # Summing a boolean column counts its True rows (True counts as 1).
    stats = {
        'name': [label for label, _ in meal_columns],
        'count': [int(df[column].sum()) for _, column in meal_columns],
    }
    # Chart the aggregated counts, not the per-place frame, which has no
    # 'name' or 'count' column.
    fig = px.pie(stats, values='count', names='name', title='Dinning Hours')
    return go.FigureWidget(fig)
    
def render_table():
    """Display the table of dining places that pass the current filters.

    Reads the state of the dashboard widgets (``is_open_now``,
    ``restaurant_search``, ``day_selector``, ``start_time_picker``,
    ``end_time_picker``), filters the module-level ``df`` row by row with
    ``filter_row`` and displays the matches in a paginated Tabulator.
    """
    open_now = is_open_now.value
    search_text = restaurant_search.value
    filtered_day = [dow_map.index(label.lower()) for label in day_selector.value]
    # An unset time picker means "no constraint", encoded as ''.
    filtered_start_time = start_time_picker.value.strftime("%H%M") if start_time_picker.value else ""
    filtered_end_time = end_time_picker.value.strftime("%H%M") if end_time_picker.value else ""
    filtered_rows = df.apply(
        lambda record: filter_row(
            record, search_text, open_now, filtered_day, filtered_start_time, filtered_end_time
        ),
        axis=1,
    )
    filtered_df = df[filtered_rows]
    df_widget = pn.widgets.Tabulator(
        filtered_df[['dining_place_name', 'opening_time']],
        show_index=False,
        pagination='remote',
        page_size=20,
    )
    # The summary chart is currently disabled:
    # summary_widget = draw_dining_hours_chart(filtered_df)
    display(df_widget)
    # display(summary_widget)
    
def get_closed_days(df, dow_map):
    """Count, per day, the rows with neither a start nor an end time.

    Args:
        df: Frame with ``<day>_start_time`` and ``<day>_end_time`` string
            columns for every day in ``dow_map``.
        dow_map: Iterable of day names used as column prefixes.

    Returns:
        Dict mapping each day name to the number of rows where both the
        start and the end time are empty strings.
    """
    closed_counts = {}
    for day in dow_map:
        no_start = df[f"{day}_start_time"] == ""
        no_end = df[f"{day}_end_time"] == ""
        closed_counts[day] = int((no_start & no_end).sum())
    return closed_counts
        

# Processing

In [None]:
%%capture 
# Attempt 1: Parse opening_time to duration for each day of 
df['parsed_timing'] = df.swifter.apply(lambda x: ','.join([parse_timing(i) for i in parse_time(x['opening_time'])]), axis=1)
df['opening_days'] = df.swifter.apply(lambda x: ''.join(['0' if i == "" else '1' for i in x['parsed_timing'].split(',')]), axis=1)
# Generate start time and end time for each day of week
df = update_operating_time(df, dow_map)
df = update_operating_time_metrics(df)

In [None]:
# Write the processed frame to CSV, leaving out the index column
OUTPUT_FILE_PATH = './data/output/dinning_places_open_hrs_processed.csv'
df.to_csv(OUTPUT_FILE_PATH, index=False)

# Dashboard

In [None]:
import ipywidgets as widgets
import ipydatetime

# Initial, unfiltered table shown inside a bordered output area
table_columns = ['dining_place_name', 'opening_time']
df_widget = pn.widgets.Tabulator(
    df[table_columns],
    show_index=False,
    pagination='remote',
    page_size=20,
)
output = widgets.Output(layout={"border": "1px solid green"})
output.append_display_data(df_widget)

# Filter controls; every day of the week starts out selected
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
day_selector = widgets.SelectMultiple(
    description='Day of Week',
    options=weekday_labels,
    value=list(weekday_labels),
    disabled=False,
)
start_time_picker = ipydatetime.TimePicker()
end_time_picker = ipydatetime.TimePicker()
is_open_now = widgets.Checkbox(
    description="Show me what's open now",
    value=False,
    indent=False,
    disabled=False,
)
restaurant_search = widgets.Text(
    description='Search:',
    placeholder='Separate your search keywords with ,',
    value='',
    disabled=False,
)

# Top row: search box and "open now" toggle; second row: day list and time pickers
search_box = widgets.HBox([restaurant_search, is_open_now])
start_column = widgets.VBox([widgets.Label('Start Time'), start_time_picker])
end_column = widgets.VBox([widgets.Label('End Time'), end_time_picker])
filters_box = widgets.HBox([day_selector, start_column, end_column])
layout = widgets.VBox([search_box, filters_box])

def response(change):
    """Widget observer: redraw the filtered table in the output area.

    Args:
        change: Change notification passed by the observed widget; unused.
    """
    # Drop the previous table before rendering the new one into the same area.
    output.clear_output()
    with output:
        render_table()

# Redraw the table whenever the value of any filter control changes
for control in (
    restaurant_search,
    is_open_now,
    day_selector,
    start_time_picker,
    end_time_picker,
):
    control.observe(response, names="value")
display(layout)

In [None]:
# Show the output area that holds the (re)rendered table
display(output)