In [36]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import plotly
import plotly.express as px
from IPython.display import IFrame

In [37]:
restaurants = pd.read_csv('D:/2020-XTern-DS.csv') #Read in data

In [38]:
no_data = restaurants['Rating'].isin(['-', 'NEW', 'Opening Soon']) | restaurants['Reviews'].isin(['-', 'NEW', 'Opening Soon']) #split up data between new and est. restaurants
new_set = restaurants[restaurants['Rating'].isin(['-', 'NEW', 'Opening Soon'])]
main_set = restaurants[~no_data]

In [39]:
main_set = main_set.sort_values(by='Rating', ascending=False) #Main set with all restaurants

In [40]:
#Quick glance at the data set showed that main cuisines were Indian, Chinese, and others, so sort the data as such.
indian = main_set[main_set['Cuisines'].str.contains('Indian')]
chinese = main_set[main_set['Cuisines'].str.contains('Chinese')]
other = main_set[~main_set['Cuisines'].str.contains('Indian')]
other = other[~main_set['Cuisines'].str.contains('Chinese')]


Boolean Series key will be reindexed to match DataFrame index.



In [41]:
#clears the data of new restaurants (double checks)
indian = indian[~no_data]
chinese = chinese[~no_data]
other = other[~no_data]


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [42]:
#Converts certain columns into a more usable format (i.e. string to int or float)
def cooktime(input):
    return int(input[:2])

indian.loc[:,'Cook_Time'] = indian['Cook_Time'].apply(cooktime)
chinese.loc[:,'Cook_Time'] = chinese['Cook_Time'].apply(cooktime)
other.loc[:,'Cook_Time'] = other['Cook_Time'].apply(cooktime)
new_set.loc[:,'Cook_Time'] = new_set['Cook_Time'].apply(cooktime)
main_set.loc[:,'Cook_Time'] = main_set['Cook_Time'].apply(cooktime)


def money_to_float(input):
    return float(input[1:])
indian.loc[:,'Minimum_Order'] = indian['Minimum_Order'].apply(money_to_float)
chinese.loc[:,'Minimum_Order'] = chinese['Minimum_Order'].apply(money_to_float)
other.loc[:,'Minimum_Order'] = other['Minimum_Order'].apply(money_to_float)

indian.loc[:,'Reviews'] = indian['Reviews'].apply(lambda x: int(x))
chinese.loc[:,'Reviews'] = chinese['Reviews'].apply(lambda x: int(x))
other.loc[:,'Reviews'] = other['Reviews'].apply(lambda x: int(x))
main_set.loc[:,'Reviews'] = main_set['Reviews'].apply(lambda x: int(x))

In [43]:
indian['Minimum_Order'].mean()
chinese['Minimum_Order'].mean()
#No significant difference in minimum order amount

55.64671814671814

In [44]:
#Main points of analysis: graphed it by location, set the color to how long cook time was
#and set the size to the number of votes/ratings, 
indian_graph = px.scatter(indian, title="Indian Restaurants",x="Latitude", y="Longitude", color="Cook_Time", size="Reviews", hover_data=["Rating"])
chinese_graph = px.scatter(chinese, title="Chinese Restaurants", x="Latitude", y="Longitude", color="Cook_Time", size="Reviews", hover_data=["Rating"])
other_graph = px.scatter(other, title="All Other Restaurants", x="Latitude", y="Longitude", color="Cook_Time", size="Reviews", hover_data=["Rating"])
new_set_graph = px.scatter(new_set, title="Newer Restaurants", x="Latitude", y="Longitude", color="Cook_Time", hover_data=["Cuisines"])
main_set_graph = px.scatter(main_set, title="All Restaurants", x="Latitude", y="Longitude", color="Cook_Time", size="Reviews", hover_data=["Rating"])
plotly.offline.plot(indian_graph, filename='indian.html', auto_open=False)
plotly.offline.plot(chinese_graph, filename='chinese.html', auto_open=False)
plotly.offline.plot(other_graph, filename='other.html', auto_open=False)
plotly.offline.plot(new_set_graph, filename='new_set.html', auto_open=False)
plotly.offline.plot(main_set_graph, filename='main_set.html', auto_open=False)


'main_set.html'

---
# Conclusions
From the graphs shown in the HTML files and the the way they are organized, the best places for consumers to order from are data points that are colored more darkly, and are preferably larger (and more noticeable), as those would imply the restaurant is reliable and efficient. From that criteria, we can derive the following:
NOTE: Key for file names
chinese.html - chinese restaurant
indian.html - indian restaurant
1. People tend to order from places that sell Chinese food the most, and although cook times wildly varied, there seemed to be a preference for that cuisine. This was followed by "other" cuisine, meaning neither chinese nor indian, and then indian, and was measured by the amount of reviews left.
2. Indian food takes the longest to prepare and cook, evident by the number of data points that are colored to the end of the spectrum that shows a higher cooking time. On the flip side, "other" cuisine seemed to take the least amount of time to prepare.
3. Geographical location does not seem to have an effect on how many people ordered from a place, although there are certain locations on graphs that do seem to correlate with a spike in popularity. For example, in the graph for chinese restaurants, a latitude of ~39.75 and Longitude of -84.4-(-84.6) seemed to be a hotspot for relatively popular restaurants. However, a vast majority of the areas in others show almost no correlation.
4. There is (surprisingly) not a very strong correlation between the time to cook, number of reviews, and the average rating (shown in the hover data). If there is a very high cooking time, then there is always an above average amount of reviews, and tends to be on the lower side (below 4.5). Those that are darker colored (meaning lower cooking time), and have a large amount of reviews, on the other hand, tend to have a mix of ratings, however, ranging from below 4, all the way to above 4.5.