In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.core.frame import DataFrame

In [None]:
# Load the bar reviews first, then the bar business listings; both CSV files
# are resolved relative to the notebook's working directory.
bars_review, bars_business = (
    pd.read_csv(csv_name)
    for csv_name in ('bars_review.csv', 'bars_business.csv')
)

In [None]:
# Get the distribution of reviews ratings, ordered from the lowest to the
# highest star value so the bars (and their labels below) line up.
star_review = bars_review['stars'].value_counts().sort_index()

# Plot on an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 4))
# seaborn >= 0.12 accepts x/y as keyword arguments only; passing them
# positionally raises a TypeError there (and a FutureWarning on 0.11).
sns.barplot(x=star_review.index, y=star_review.values, alpha=0.8, ax=ax)
ax.set_title("Star Rating Distribution of Bars Business Review")
ax.set_ylabel('Number of Reviews', fontsize=12)
ax.set_xlabel('Star Ratings ', fontsize=12)

# Adding the text labels: write each count just above its bar. The bars are
# drawn in the same order as star_review, so zipping them is safe.
for rect, label in zip(ax.patches, star_review.values):
    ax.text(
        rect.get_x() + rect.get_width() / 2,  # horizontal centre of the bar
        rect.get_height() + 5,                # small offset above the bar top (data units)
        label,
        ha='center',
        va='bottom',
    )

fig.savefig('star_rating_distribution.png')
plt.show()

In [None]:
# Get the distribution of business ratings, ordered from the lowest to the
# highest star value so the bars (and their labels below) line up.
star2 = bars_business['stars'].value_counts().sort_index()

# Plot on an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 4))
# seaborn >= 0.12 accepts x/y as keyword arguments only; passing them
# positionally raises a TypeError there (and a FutureWarning on 0.11).
sns.barplot(x=star2.index, y=star2.values, alpha=0.8, ax=ax)
ax.set_title("Star Rating Distribution of All Bars", fontsize=20)
ax.set_ylabel('Number of Businesses', fontsize=18)
ax.set_xlabel('Star Ratings ', fontsize=18)

# Adding the text labels: write each count just above its bar. The bars are
# drawn in the same order as star2, so zipping them is safe.
for rect, label in zip(ax.patches, star2.values):
    ax.text(
        rect.get_x() + rect.get_width() / 2,  # horizontal centre of the bar
        rect.get_height() + 5,                # small offset above the bar top (data units)
        label,
        ha='center',
        va='bottom',
    )

fig.savefig('star_rating_distribution_bars.png')
plt.show()

In [None]:
# Count whole business categories
# NOTE(review): `business` is not loaded by any cell shown in this notebook
# (only `bars_review` / `bars_business` are). Confirm where it comes from and
# load it explicitly so this cell survives Restart Kernel -> Run All.
business.index = range(len(business))  # positional 0..n-1 row labels

# Tally every comma-separated category name. Non-string entries (e.g. the NaN
# that read_csv produces for a missing value) are skipped: NaN is truthy, so a
# plain truthiness check would let it through and crash on `.split`.
# Empty names (empty cell, doubled or trailing comma) are skipped as well.
whole_categories_count = Counter(
    name
    for cats in business['categories']
    if isinstance(cats, str)
    for name in (part.strip() for part in cats.split(','))
    if name
)

# Sort the categories: 20 most frequent, highest count first
# (equal counts keep first-encountered order).
whole_sorted_cat = whole_categories_count.most_common(20)

# Save top 20 categories into DataFrame
whole_cat_df = DataFrame(whole_sorted_cat, columns=['cat_name', 'cat_count'])

# Plot top 20 categories
fig, ax = plt.subplots(figsize=(8, 8))
whole_cat_df['cat_count'].plot.bar(ax=ax, align='center', alpha=0.5)
ax.set_xticks(range(len(whole_cat_df)))
ax.set_xticklabels(whole_cat_df['cat_name'])
# Size the tick labels on this axes only; plt.rc(...) would change the global
# default and silently restyle figures drawn by other cells.
ax.tick_params(axis='x', labelsize=16)
ax.set_xlabel('All Business Categories', fontsize=18)
ax.set_ylabel('Categories Count', fontsize=18)
ax.set_title('Top Categories Count of All Business', fontsize=20)
plt.show()

In [None]:
# Count restaurant business categories
# NOTE(review): `business` is not loaded by any cell shown in this notebook
# (only `bars_review` / `bars_business` are). Confirm where it comes from and
# load it explicitly so this cell survives Restart Kernel -> Run All.
business.index = range(len(business))  # positional labels, so `restaurant` holds usable .loc keys

restaurant = []               # row labels of businesses whose categories mention "Restaurants"
categories_count = Counter()  # category name -> number of restaurant rows listing it
for i, cats in enumerate(business['categories']):
    # Skip missing values (NaN from read_csv is a float and would crash the
    # substring test) and rows that do not mention "Restaurants" at all.
    # The substring test also covers the " Restaurants" spelling.
    if not isinstance(cats, str) or 'Restaurants' not in cats:
        continue
    restaurant.append(i)
    categories_count.update(
        name for name in (part.strip() for part in cats.split(',')) if name
    )

# Sort the categories, highest count first. "Restaurants" itself is the
# selection criterion rather than an informative category, so it is left out
# by name; the next 19 categories are kept.
sorted_cat = [
    (name, count)
    for name, count in categories_count.most_common()
    if name != 'Restaurants'
][:19]

# Save the top categories into DataFrame
cat_df = DataFrame(sorted_cat, columns=['cat_name', 'cat_count'])

# Plot the top categories
fig, ax = plt.subplots(figsize=(8, 8))
cat_df['cat_count'].plot.bar(ax=ax, align='center', alpha=0.5)
ax.set_xticks(range(len(cat_df)))
ax.set_xticklabels(cat_df['cat_name'])
# Size the tick labels on this axes only; plt.rc(...) would change the global
# default and silently restyle figures drawn by other cells.
ax.tick_params(axis='x', labelsize=16)
ax.set_xlabel('All Restaurants Categories', fontsize=18)
ax.set_ylabel('Categories Count', fontsize=18)
ax.set_title('Top Categories Count of All Restaurants', fontsize=20)
plt.show()

In [None]:
# Count and plot specific words frequency in different ratings
words_by_stars = pd.read_csv('words_by_stars.csv')
words_by_stars_2gram = pd.read_csv('words_by_stars_2gram.csv')

# The five star ratings; also used as the x positions of the bars.
star_levels = np.arange(1, 6)


def freq_by_star(freq_rows):
    """Sum the 'freq' column per 'star' value.

    Returns a Series indexed by the ratings 1..5, in that order. Ratings with
    no rows get 0, and rows whose 'star' lies outside 1..5 are ignored (a
    list-index accumulator would put star 0 into the 5-star slot via index -1).
    """
    return freq_rows.groupby('star')['freq'].sum().reindex(star_levels, fill_value=0)


def plot_word_frequency(freq_table, totals, word, label, filename):
    """Bar-plot how often `word` occurs at each star rating, relative to `totals`.

    freq_table: frame with 'words', 'star' and 'freq' columns.
    totals:     five per-rating totals (ratings 1..5) used as denominators.
    word:       exact value to look up in the 'words' column.
    label:      human-readable form of the word; each space-separated part is
                capitalised to build the figure title.
    filename:   path the figure is saved to.
    """
    rows = freq_table[freq_table['words'] == word]
    # Align on the star rating instead of on row position: a word missing at
    # some rating, or rows stored out of order, must not shift or break the bars.
    relative_freq = freq_by_star(rows) / np.asarray(totals, dtype=float)
    title = ' '.join(part[:1].upper() + part[1:] for part in label.split(' '))

    fig, ax = plt.subplots(figsize=(16.0, 10.0))
    ax.bar(star_levels, relative_freq.values)
    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.1e'))
    ax.set_xlabel('star', fontsize=28)
    ax.set_ylabel('frequency', fontsize=28)
    # Style this axes directly; plt.rc(...) after plotting changes the global
    # default, which is order-dependent and leaks into other figures.
    ax.tick_params(axis='both', labelsize=24)
    ax.set_title(title, fontsize=34)
    fig.savefig(filename)
    plt.show()


# Total frequency of all 1-gram terms per star rating (list position 0 <-> 1 star)
stars_review_total = freq_by_star(words_by_stars).tolist()

# Total frequency of all 2-gram terms per star rating
stars_2gram_review_total = freq_by_star(words_by_stars_2gram).tolist()

# Example of plotting words 'table'
# ('tabl' is presumably the stemmed token stored in the CSV — confirm)
plot_word_frequency(words_by_stars, stars_review_total, 'tabl', 'table', 'table.png')

# Example of plotting words 'not clean'
plot_word_frequency(
    words_by_stars_2gram, stars_2gram_review_total, 'not clean', 'not clean', 'not clean.png'
)