In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd '/content/drive/My Drive/cs109_cassie'

/content/drive/My Drive/cs109_cassie


In [11]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
import re


# Data Preprocessing

## Prepare User Review Data for Embedding:
1. English Reviews
2. Emoji removed
3. k-core: 10 for user and 50 for restaurants
4. concatenate all reviews for each user

### Results Dataframe
'user_comment_df':
2 columns: user_id and text;
10948 rows


In [53]:
review_df = pd.read_csv("data/english_reviews.csv")

In [54]:
review_df.head()

Unnamed: 0,user_id,gmap_id,text,rating
0,1.007721e+20,0x89e3706a62ba248f:0xdd2878d145d53400,"The food and ambiance are great, and there is ...",5
1,1.10358e+20,0x89e3706a62ba248f:0xdd2878d145d53400,"Great location in East Boston, nice place. exc...",5
2,1.089811e+20,0x89e3706a62ba248f:0xdd2878d145d53400,We loved this place and are sad that it is closed,5
3,1.096653e+20,0x89e3706a62ba248f:0xdd2878d145d53400,(Translated by Google) Rancheros eggs!\n\n(Ori...,5
4,1.108542e+20,0x89e316bb7ceaaaab:0x2b1a0e9e697640fe,Great place to grab a take out breakfast or lunch,4


In [55]:
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

has_emoji = review_df.text.apply(lambda x: bool(emoji_pattern.search(x)))
review_df[has_emoji].text.count()
review_df['text'] = review_df['text'].apply(lambda x: emoji_pattern.sub(r'', x))

review_df.head()

Unnamed: 0,user_id,gmap_id,text,rating
0,1.007721e+20,0x89e3706a62ba248f:0xdd2878d145d53400,"The food and ambiance are great, and there is ...",5
1,1.10358e+20,0x89e3706a62ba248f:0xdd2878d145d53400,"Great location in East Boston, nice place. exc...",5
2,1.089811e+20,0x89e3706a62ba248f:0xdd2878d145d53400,We loved this place and are sad that it is closed,5
3,1.096653e+20,0x89e3706a62ba248f:0xdd2878d145d53400,(Translated by Google) Rancheros eggs!\n\n(Ori...,5
4,1.108542e+20,0x89e316bb7ceaaaab:0x2b1a0e9e697640fe,Great place to grab a take out breakfast or lunch,4


In [56]:
comment_counts = review_df['user_id'].value_counts()
threshold = 10

users_with_more_threshold = comment_counts[comment_counts >= 10].index
review_user10_df = review_df[review_df['user_id'].isin(users_with_more_threshold)]


resaurant_comments_counts = review_user10_df['gmap_id'].value_counts()

restaurant_with_more_threshold = resaurant_comments_counts[resaurant_comments_counts >= 50].index
review_rest50_df = review_user10_df[review_user10_df['gmap_id'].isin(restaurant_with_more_threshold)]
review_rest50_df.head()

Unnamed: 0,user_id,gmap_id,text,rating
6477,1.182288e+20,0x89e4f921a38f0b11:0x879ccb1d12f010de,Love the chicken sandwiches but the service ne...,3
6483,1.09005e+20,0x89e4f921a38f0b11:0x879ccb1d12f010de,The place was super smokey when we came in whi...,3
6485,1.128417e+20,0x89e4f921a38f0b11:0x879ccb1d12f010de,"While I love all their chicken offerings, my w...",5
6488,1.1752e+20,0x89e4f921a38f0b11:0x879ccb1d12f010de,Always the best fried chicken around. Mash pot...,5
6493,1.130399e+20,0x89e4f921a38f0b11:0x879ccb1d12f010de,Awesome chicken crisp not greasy!,5


In [61]:
# concatenate user reviews
review_rest50_df['text'] = review_rest50_df['text'].str.replace('\s+', ' ', regex=True)
concatenated_comments_df = review_rest50_df.groupby('user_id')['text'].apply(lambda x: '\n'.join(x)).reset_index(name='text')
concatenated_user_comments_df = concatenated_comments_df.drop_duplicates()
concatenated_user_comments_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_rest50_df['text'] = review_rest50_df['text'].str.replace('\s+', ' ', regex=True)


Unnamed: 0,user_id,text
0,1.000018e+20,Great food. The service was a little slow. It ...
1,1.000020e+20,We had a large party but they accommodated us ...
2,1.000027e+20,Margarita was yummy. Food not great.\nWe love ...
3,1.000030e+20,I eat there when I have a chance to do it I ha...
4,1.000057e+20,Drinks desert and Jarvis tops off a good eveni...
...,...,...
10943,1.184382e+20,Decent price and portions for the fish and chi...
10944,1.184401e+20,Breakfast at 10 and been waiting for food for ...
10945,1.184433e+20,Was great food and service\nFun place for all ...
10946,1.184449e+20,Only came because we had a gift card. Reminded...


In [62]:
concatenated_user_comments_df.to_csv("data/user_comment_df.csv", index=False)

## Prepare Merged Dataframe for Later Regression Model

Merge the processed (non-english&emoji removed, k-core extracted) data with only text with the original dataframe; perform data cleaning (drop missing values&weird input from price and keep only the relevant columns)

In [63]:
df = pd.read_csv("data/data_ma.csv")
merged_df = pd.merge(review_rest50_df, df, on=['user_id', 'gmap_id'])
df_cleaned = merged_df[['user_id', 'name_y', 'rating_x', 'text_x', 'gmap_id', 'latitude', 'longitude', 'num_of_reviews', 'price', 'avg_rating']]
df_cleaned= df_cleaned.rename(columns={
    'rating_x': 'rating',
    'text_x': 'text'
})
print(df_cleaned.head())
print(df_cleaned.isnull().any())

  df = pd.read_csv("data/data_ma.csv")


        user_id                     name_y  rating  \
0  1.182288e+20  Popeyes Louisiana Kitchen       3   
1  1.090050e+20  Popeyes Louisiana Kitchen       3   
2  1.128417e+20  Popeyes Louisiana Kitchen       5   
3  1.175200e+20  Popeyes Louisiana Kitchen       5   
4  1.130399e+20  Popeyes Louisiana Kitchen       5   

                                                text  \
0  Love the chicken sandwiches but the service ne...   
1  The place was super smokey when we came in whi...   
2  While I love all their chicken offerings, my w...   
3  Always the best fried chicken around. Mash pot...   
4                  Awesome chicken crisp not greasy!   

                                 gmap_id   latitude  longitude  \
0  0x89e4f921a38f0b11:0x879ccb1d12f010de  41.695425 -71.140807   
1  0x89e4f921a38f0b11:0x879ccb1d12f010de  41.695425 -71.140807   
2  0x89e4f921a38f0b11:0x879ccb1d12f010de  41.695425 -71.140807   
3  0x89e4f921a38f0b11:0x879ccb1d12f010de  41.695425 -71.140807   
4  0x89e

In [64]:
# drop missing values
df_cleaned= df_cleaned.dropna()
df_cleaned


Unnamed: 0,user_id,name_y,rating,text,gmap_id,latitude,longitude,num_of_reviews,price,avg_rating
0,1.182288e+20,Popeyes Louisiana Kitchen,3,Love the chicken sandwiches but the service ne...,0x89e4f921a38f0b11:0x879ccb1d12f010de,41.695425,-71.140807,526,$,3.6
1,1.090050e+20,Popeyes Louisiana Kitchen,3,The place was super smokey when we came in whi...,0x89e4f921a38f0b11:0x879ccb1d12f010de,41.695425,-71.140807,526,$,3.6
2,1.128417e+20,Popeyes Louisiana Kitchen,5,"While I love all their chicken offerings, my w...",0x89e4f921a38f0b11:0x879ccb1d12f010de,41.695425,-71.140807,526,$,3.6
3,1.175200e+20,Popeyes Louisiana Kitchen,5,Always the best fried chicken around. Mash pot...,0x89e4f921a38f0b11:0x879ccb1d12f010de,41.695425,-71.140807,526,$,3.6
4,1.130399e+20,Popeyes Louisiana Kitchen,5,Awesome chicken crisp not greasy!,0x89e4f921a38f0b11:0x879ccb1d12f010de,41.695425,-71.140807,526,$,3.6
...,...,...,...,...,...,...,...,...,...,...
100611,1.178104e+20,Tasty Burger,5,Grab the starving student deal!,0x89e379f5e3d9fe1b:0xa265a864b0f05b6a,42.344762,-71.098375,2713,$,4.5
100612,1.172052e+20,Tasty Burger,5,Love the burgers here.,0x89e379f5e3d9fe1b:0xa265a864b0f05b6a,42.344762,-71.098375,2713,$,4.5
100613,1.145775e+20,Tasty Burger,5,Best burger in Boston by far.,0x89e379f5e3d9fe1b:0xa265a864b0f05b6a,42.344762,-71.098375,2713,$,4.5
100614,1.157460e+20,Tasty Burger,5,Best fast burgers around especially late at night,0x89e379f5e3d9fe1b:0xa265a864b0f05b6a,42.344762,-71.098375,2713,$,4.5


In [65]:
# drop rows with price as value '₩₩', '₩',   '₩₩₩₩'
df_cleaned = df_cleaned[~df_cleaned['price'].isin(['₩₩', '₩', '₩₩₩₩', '₩₩₩'])]
df_cleaned.price.value_counts()

price
$$      61967
$       26316
$$$      4202
$$$$      566
Name: count, dtype: int64

In [66]:
df_cleaned.to_csv("data/cleaned_df.csv", index=False)

# Performing LDA to vectorize concatenated User Reviews

## Some Considerations:

Count Vectorizer or TF-IDF Vectorizer for Topic Analysis:

I decided to go with TF-IDF Vectorizer for following reasons:
For our ultimate goal of building a recommendation system based on user reviews, I prefer TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer over CountVectorizer:

TF-IDF Weights: TF-IDF considers the importance of a term not only based on its frequency in a document but also on its rarity across all documents. This helps in capturing the uniqueness of terms within each document (user review) and reduces the impact of commonly occurring words across all reviews, which might not carry much significance.
Normalization: TF-IDF vectors are normalized, which means they are less sensitive to the length of the documents. This is important because user reviews can vary significantly in length, and we want the similarity between users to be based on the content rather than the length of their reviews.

Feature Selection: TF-IDF allow us to specify maximum features to consider, helping to filter out less important terms which can improve the quality of the representation.
Effective for Similarity Search: TF-IDF vectors are effective for measuring the similarity between documents (users in this case) using cosine similarity or other distance metrics. This will be useful when we want to find similar users based on their review vectors.

Using TF-IDF vectors for representing user reviews should provide a more meaningful representation of the content of each review, which can lead to better recommendations for similar users and ultimately better restaurant recommendations for the target user.

In [92]:
user_reviews_df = pd.read_csv('data/user_comment_df.csv')
user_reviews_df.head()

Unnamed: 0,user_id,text
0,1.000018e+20,Great food. The service was a little slow. It ...
1,1.00002e+20,We had a large party but they accommodated us ...
2,1.000027e+20,Margarita was yummy. Food not great.\nWe love ...
3,1.00003e+20,I eat there when I have a chance to do it I ha...
4,1.000057e+20,Drinks desert and Jarvis tops off a good eveni...


## Preprocessing Text before Using TF-IDF Vectorizer

In [93]:
!pip install gensim
!pip install nltk



In [94]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
# Initialize tokenizer, lemmatizer, and stopwords
tokenizer = RegexpTokenizer(r'\w+')
lemmer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocess and tokenize function
def preprocess_tokenize_text(text):
    # Remove non-relevant words
    text = re.sub(r'\(Original\)', '', text)
    text = re.sub(r'\(Translated by Google\)', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenization
    tokens = tokenizer.tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    tokens = [lemmer.lemmatize(w) for w in tokens]

    # Remove single letter tokens
    tokens = [w for w in tokens if len(w) > 1]

    # Join tokens back to a single string
    return ' '.join(tokens)

In [96]:
user_reviews_df['processed_text'] = user_reviews_df['text'].apply(preprocess_tokenize_text)
user_reviews_df.head()

Unnamed: 0,user_id,text,processed_text
0,1.000018e+20,Great food. The service was a little slow. It ...,great food service little slow nice clean food...
1,1.00002e+20,We had a large party but they accommodated us ...,large party accommodated could watch patriot g...
2,1.000027e+20,Margarita was yummy. Food not great.\nWe love ...,margarita yummy food great love tasty burger w...
3,1.00003e+20,I eat there when I have a chance to do it I ha...,eat chance going since kid go area love food w...
4,1.000057e+20,Drinks desert and Jarvis tops off a good eveni...,drink desert jarvis top good evening tewkesbur...


## Train a LDA Model and vectorize documents

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [100]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)

# Vectorize the processed text
X = vectorizer.fit_transform(user_reviews_df['processed_text'])

# Initialize LDA Model
n_topics = 15
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)

# Train LDA model
lda.fit(X)



# Add topic distribution vectors to DataFrame
topic_distributions = lda.transform(X)
user_reviews_df['topic_vector'] = list(topic_distributions)

user_reviews_df.head()

Unnamed: 0,user_id,text,processed_text,topic_vector
0,1.000018e+20,Great food. The service was a little slow. It ...,great food service little slow nice clean food...,"[0.009604099033542457, 0.009604097040164497, 0..."
1,1.00002e+20,We had a large party but they accommodated us ...,large party accommodated could watch patriot g...,"[0.9332929748068605, 0.004764784389977953, 0.0..."
2,1.000027e+20,Margarita was yummy. Food not great.\nWe love ...,margarita yummy food great love tasty burger w...,"[0.0068352206810172785, 0.00683521500500783, 0..."
3,1.00003e+20,I eat there when I have a chance to do it I ha...,eat chance going since kid go area love food w...,"[0.011245047569970199, 0.011245043923995425, 0..."
4,1.000057e+20,Drinks desert and Jarvis tops off a good eveni...,drink desert jarvis top good evening tewkesbur...,"[0.005380025279412345, 0.005380015639471415, 0..."


## Visualize The topics

In [101]:
!pip install pyldavis



In [104]:

import pyLDAvis
import pyLDAvis.lda_model

pyLDAvis.enable_notebook()
lda_display = pyLDAvis.lda_model.prepare(lda, X, vectorizer, sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)


In [None]:
user_reviews_df.to_csv("data/user_reviews_lda_df.csv", index=False)