## STAT628 Module 3 Yelp Data 
### Brian Tsai 
#### mtsai36@wisc.edu

### Importing Required Packages

In [2]:
import numpy as np
import pandas as pd
import os
import json
from scipy import stats as st 
from collections import Counter
import matplotlib.pyplot as py
import seaborn as sns 
#from langdetect import detect_langs
import re

### Reading business JSON data

In [3]:
# Load every business record; the file is parsed as one JSON document per line.
data = []
with open('C:/Users/tsai_/Downloads/yelp_dataset/yelp_dataset/business.json', encoding = 'utf8') as fl:
    for line in fl:
        # json.loads raises on an empty string, so skip blank lines
        # (for example a trailing newline at the end of the file).
        if line.strip():
            data.append(json.loads(line))

# Mexican restaurants: keep businesses whose 'categories' field contains 'Mex'.
# Records with categories == None are skipped because `in` would fail on None.
# NOTE(review): presumably 'categories' is a comma-separated string, making
# this a substring match (e.g. 'Mexican', 'Tex-Mex') -- confirm against the data.
mex = [
    business
    for business in data
    if business['categories'] is not None and 'Mex' in business['categories']
]

# Mexican restaurants that are open
mex_open = [business for business in mex if business['is_open'] == 1]

# Chipotle restaurants with >= 10 reviews
chipotle = [
    business
    for business in mex_open
    if 'Chipotle Mex' in business['name'] and business['review_count'] >= 10
]
# Business ids of the selected Chipotle locations, in the same order as `chipotle`.
chipotle_id = [business['business_id'] for business in chipotle]


## Reviews 

In [4]:
# Chipotle reviews: keep only reviews whose business_id is in chipotle_id.

# Build the lookup set once. Every line of the review file is tested against
# it, and set membership avoids a linear scan of the chipotle_id list per line.
chipotle_id_set = set(chipotle_id)

chipotle_reviews = []
with open('C:/Users/tsai_/Downloads/yelp_dataset/yelp_dataset/review.json', encoding = 'utf8') as fl:
    for line in fl:
        review = json.loads(line)
        if review['business_id'] in chipotle_id_set:
            chipotle_reviews.append(review)



## Exploratory Data Analysis 

In [None]:
# Output directory for the saved figures.
path = 'C:/Users/tsai_/Desktop/U Wisconsin - Madison/Courses/2nd year/Fall 2021/STAT 628/Module 3'
df = pd.DataFrame(chipotle_reviews)
df['date'] = pd.to_datetime(df['date'])
# resample() below groups by the index, so the review date becomes the index.
df = df.set_index('date')

# Number of chipotle reviews per month plot
py.plot(df['text'].resample('M').count())
py.xlabel('Year')
py.ylabel('Number of reviews')
py.title('Number of reviews over time')
# os.path.join inserts the separator that plain `path + name` omitted, so the
# figure lands inside `path` rather than next to it with a mangled file name.
py.savefig(os.path.join(path, 'chipotle reviews over time.png'))
py.show()
py.close()

# Distribution of Chipotle Customer ratings
# The estimator turns each star group's row count into a percentage of all reviews.
ax = sns.barplot(data=df, x='stars', y='stars', estimator=lambda x: len(x) / len(df) * 100)
ax.set(ylabel='Percent')
py.title('Distribution of Customer Rating')
py.savefig(os.path.join(path, 'Chipotle customer ratings distribution.png'))
py.show()
py.close()

# Average Monthly Customer Rating Chipotle
py.plot(df['stars'].resample('M').mean())
py.xlabel('Year')
py.ylabel('Rating')
py.title('Average Monthly Customer Rating')
py.ylim(0, 5)
py.savefig(os.path.join(path, 'chipotle average monthly customer rating.png'))
py.show()
py.close()

## Preliminary Chipotle NLP 

In [None]:
# Checking for reviews not in English
# detect_langs must be imported here: the import at the top of the file is
# commented out, so without this line the cell fails with a NameError.
from langdetect import detect_langs

# One list of candidate languages per review.
language = [detect_langs(text) for text in df.text]
# Keep the language code of the first candidate for each review
# (langdetect lists the most probable language first).
languages = [candidates[0].lang for candidates in language]
df['language'] = languages

# Remove reviews not in English
df = df[df.language == 'en']


# Stopwords
from nltk.corpus import stopwords
# ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text because
# the old sklearn.feature_extraction.stop_words module no longer exists in
# current scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Union of the NLTK and scikit-learn English stop words plus the brand name,
# which would otherwise dominate every word cloud and frequency table.
my_stop_words = (
    set(stopwords.words('english'))
    | set(ENGLISH_STOP_WORDS)
    | {'chipotle', 'Chipotle'}
)

# Nouns and adjectives from Chipotle reviews
# `from nltk.corpus import ...` does not bind the name `nltk`, so import the
# package explicitly for the nltk.* calls below.
import nltk

nouns = []
adj = []
txt = ' '.join(df['text'])
sentences = nltk.sent_tokenize(txt)
# Tag each sentence once and sort words into both lists in the same pass;
# POS tagging is the expensive step, so the corpus is not tagged twice.
for sentence in sentences:
    for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
        if pos in ('NN', 'NNP', 'NNS', 'NNPS'):
            nouns.append(word)
        elif pos in ('JJ', 'JJR', 'JJS'):
            adj.append(word)

# Space-separated text for the word clouds; each word keeps a trailing space.
s = ''.join(word + ' ' for word in nouns)
s1 = ''.join(word + ' ' for word in adj)
    

# Word Cloud
from wordcloud import WordCloud

# noun
cloud_no_stopword = WordCloud(background_color='white', stopwords=my_stop_words).generate(s)
py.imshow(cloud_no_stopword, interpolation='bilinear')
py.axis('off')
# os.path.join inserts the separator that plain `path + name` omitted, so the
# image is written inside `path` rather than next to it with a mangled name.
py.savefig(os.path.join(path, 'Chipotle noun word cloud.png'))
py.show()
py.close()


# adj
cloud_no_stopword = WordCloud(background_color='white', stopwords=my_stop_words).generate(s1)
py.imshow(cloud_no_stopword, interpolation='bilinear')
py.axis('off')
py.savefig(os.path.join(path, 'Chipotle adj word cloud.png'))
py.show()
py.close()

def wc(df):
    """Show a noun word cloud and an adjective word cloud for ``df['text']``.

    All texts in ``df['text']`` are joined, split into sentences and
    POS-tagged. Words tagged NN/NNP/NNS/NNPS feed the first cloud and words
    tagged JJ/JJR/JJS feed the second. Both clouds use the module-level
    ``my_stop_words`` and are displayed with ``py.show()``; nothing is saved
    and nothing is returned.

    Parameters
    ----------
    df : object supporting ``df['text']``
        ``df['text']`` must be an iterable of strings.
    """
    # The file never binds the name `nltk` (only `from nltk... import`), so
    # import it locally to make the nltk.* calls below resolvable.
    import nltk

    nouns = []
    adj = []
    txt = ' '.join(df['text'])
    for sentence in nltk.sent_tokenize(txt):
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if pos in ('NN', 'NNP', 'NNS', 'NNPS'):
                nouns.append(word)
            elif pos in ('JJ', 'JJR', 'JJS'):
                adj.append(word)

    # Nouns first, then adjectives: one figure per word list.
    for words in (nouns, adj):
        cloud_no_stopword = WordCloud(
            background_color='white', stopwords=my_stop_words
        ).generate(' '.join(words))
        py.imshow(cloud_no_stopword, interpolation='bilinear')
        py.axis('off')
        py.show()
        py.close()
    
    
# Tokenization and Bag of words
# Top 20 most frequent words
from nltk.tokenize import word_tokenize
from nltk import FreqDist

# `full_text` was never defined in this file; build it from all review texts.
# NOTE(review): assumes the intended corpus is every row of df['text'] -- confirm.
full_text = ' '.join(df['text'])
lower_full_text = full_text.lower()
word_tokens = word_tokenize(lower_full_text)
# Keep purely alphabetic tokens that are not stop words.
tokens = [
    word
    for word in word_tokens
    if word.isalpha() and word not in my_stop_words
]
token_dist = FreqDist(tokens)
dist = pd.DataFrame(token_dist.most_common(20),columns=['Word', 'Frequency'])
print(dist)

## Sentiment Analysis VADER

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Generate sentiment scores: polarity_scores returns a dict per review, from
# which only the 'compound' entry is kept.
sentiment_scores = df['text'].apply(sid.polarity_scores)
sentiment = sentiment_scores.apply(lambda x: x['compound'])
# Monthly mean of the compound score (resample groups by the index of df).
monthly_sentiment = sentiment.resample('M').mean()
py.plot(monthly_sentiment, color='blue')
# Horizontal reference line at matplotlib's default y position.
py.axhline(color='red')
py.xlabel('Year')
py.ylabel('Sentiment')
py.title('Average Sentiment Score over time')
# Save before show(): once the figure has been shown it may be closed, and a
# later savefig would write an empty image. os.path.join also supplies the
# path separator that plain `path + name` omitted.
py.savefig(os.path.join(path, 'Chipotle sentiment score over time.png'))
py.show()
py.close()