## Data Resource: Yelp Open Dataset 
Download: https://www.yelp.com/dataset 

Presenting and/or publishing the research based on this dataset is permitted for acadamic purpose.

Two datasets used here: yelp_academic_dataset_business.json and yelp_academic_dataset_tip_text.json

## this notebook generates two processed datasets that will be used in Recommender


# Data Collection and Preliminary Cleaning

In [1]:
# Environment
import json
import pandas as pd

## business dataset

In [2]:
# Read JSON objects line by line and store them in a list
json_objects = []
with open('yelp_dataset/yelp_academic_dataset_business.json', 'r') as file:
    for line in file:
        json_objects.append(json.loads(line))

# Convert the list of JSON objects to a DataFrame
business = pd.DataFrame(json_objects)
print(business.shape)
print(business.columns)

(150346, 14)
Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')


In [3]:
# collect the useful varaibles
selected_columns = ['business_id', 'name', 'address', 'city', 'state', 'postal_code','stars', 'is_open', 'review_count', 'categories']
business = business[selected_columns]
# drop NA records
business.dropna(inplace=True)

# drop is_open=0 records (closed business)
business = business.loc[business['is_open'] != 0]

# drop non-restaurant records
business = business.loc[business['categories'].str.contains('Restaurants')]

selected_columns = ['business_id', 'name', 'address', 'city', 'state', 'postal_code','stars', 'review_count']

business.shape

(34987, 10)

In [4]:
# save to tip_business.csv
business.to_csv('tip_business.csv', index=False)

## tip dataset

In [5]:
# Read JSON objects line by line and store them in a list
json_objects = []
with open('yelp_dataset/yelp_academic_dataset_tip.json', 'r') as file:
    for line in file:
        json_objects.append(json.loads(line))

# Convert the list of JSON objects to a DataFrame
tip = pd.DataFrame(json_objects)
tip.head(1)

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0


In [6]:
# collect the useful varaibles
selected_columns = ['business_id', 'text']
tip = tip[selected_columns]
tip = tip.astype({'business_id': 'str', 'text': 'str'})

In [7]:
# filter the data, only keep resturants with 10+ tips
tip = tip.groupby('business_id').filter(lambda x: len(x) >= 10)
tip.head

<bound method NDFrame.head of                    business_id  \
0       3uLgwr0qeCNMjKenHJwPGQ   
1       QoezRbYQncpRqyrLH6Iqjg   
4       _uN0OudeJ3Zl_tf6nxg5ww   
6       kH-0iXqkL7b8UXNpguBMKg   
7       jtri188kuhe_AuEOJ51U_A   
...                        ...   
908906  90pJu2O7fIEm_N31Fyue7A   
908907  wQUBiBqlzC6cbdkX-GaBqQ   
908912  hYnMeAO77RGyTtIzUSKYzQ   
908913  s2eyoTuJrcP7I_XyjdhUHQ   
908914  _cb1Vg1NIWry8UA0jyuXnQ   

                                                     text  
0                          Avengers time with the ladies.  
1       They have lots of good deserts and tasty cuban...  
4                  Appetizers.. platter special for lunch  
6       Saturday, Dec 7th 2013, ride Patco's Silver Sl...  
7       This is probably the best place in the cool Sp...  
...                                                   ...  
908906       Great food, cocktail, ambience, and service!  
908907                             The food was delicious  
908912               

# Text Preprocessing

In [8]:
# read dataset
# tip = pd.read_csv('tip_0.csv')
tip = tip.astype({'business_id': 'str', 'text': 'str'})
tip.head(1)

Unnamed: 0,business_id,text
0,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.


In [9]:
tip.shape

(657811, 2)

In [10]:
# delete non-English records
import string

english_tips = []

for index, row in tip.iterrows():
    # check if there is only ASCII strings
    if row['text'].encode('ascii', 'ignore').decode('ascii') == row['text']:
        english_tips.append(row)
        
# transform into dataframe
tip = pd.DataFrame(english_tips, columns=tip.columns)

In [11]:
tip.shape

(654119, 2)

In [12]:
# Environment 
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the required resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

# Define a function to preprocess the text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and digits
    text = re.sub(f"[{string.punctuation}0-9]", " ", text)
    
    # Tokenize words
    words = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words back to the standard form
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Rejoin words back to text string
    text = " ".join(words)
    return text

# Apply the preprocessing function to the 'text' column of the 'review' DataFrame
tip["clean_text"] = tip["text"].apply(preprocess_text)

# Display the first 5 rows of the updated DataFrame
print(tip.head(5))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhuliang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhuliang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/zhuliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zhuliang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


              business_id                                               text  \
0  3uLgwr0qeCNMjKenHJwPGQ                     Avengers time with the ladies.   
1  QoezRbYQncpRqyrLH6Iqjg  They have lots of good deserts and tasty cuban...   
4  _uN0OudeJ3Zl_tf6nxg5ww             Appetizers.. platter special for lunch   
6  kH-0iXqkL7b8UXNpguBMKg  Saturday, Dec 7th 2013, ride Patco's Silver Sl...   
7  jtri188kuhe_AuEOJ51U_A  This is probably the best place in the cool Sp...   

                                          clean_text  
0                                  avenger time lady  
1               lot good desert tasty cuban sandwich  
4                    appetizer platter special lunch  
6  saturday dec th ride patco silver sleigh w san...  
7  probably best place cool spring area watch gam...  


## Sentiment Analysis
Collect the positive tip text records

In [13]:
import pandas as pd
# read dataset
# tip = pd.read_csv('tip_1.csv')
tip = tip.astype({'business_id': 'str', 'clean_text': 'str'})

In [14]:
# get the sentiment for each tip text

from textblob import TextBlob

sentiment = []

for index, row in tip.iterrows():
    # sentiment analysis
    blob = TextBlob(row['clean_text'])
    polarities = [s.sentiment.polarity for s in blob.sentences]
    if len(polarities) > 0:
        polarity = sum(polarities) / len(polarities)
    else:
        polarity = 0

    if polarity > 0:
        sentiment.append('positive')
    elif polarity < 0:
        sentiment.append('negative')
    else:
        sentiment.append('neutral')

# add variable sentiment to dataframe tip
tip['sentiment'] = sentiment
tip.shape

(654119, 4)

In [15]:
# get the positive-only tip text
tip = tip[tip['sentiment'] == 'positive'].copy()
tip.shape

(426077, 4)

In [16]:
# save to tip_text.csv
tip.to_csv('tip_text.csv', index=False)