<a href="https://colab.research.google.com/github/varunvijay8/yelp_challenge/blob/master/cluster_encoded_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')


# Installs

This installs Apache Spark 3.0.0-preview2, Java 8, and Findspark, a library that makes it easy for Python to find Spark.

In [None]:
#@title Use java 8 since spark does not work well with java 11
#!pip install pyspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop3.2.tgz
!tar -xvf spark-3.0.0-preview2-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
#!pip install tf-nightly
!pip install tensorflow==2.1.0

In [None]:
!pip install nltk

In [None]:
!pip install scikit-learn

In [None]:
!pip install spacy==2.2

In [None]:
!spacy download en

# Set Environment Variables

In [None]:
#@title Set the locations where Spark and Java are installed.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-preview2-bin-hadoop3.2"

# Imports

In [None]:
import re

In [None]:
import numpy as np
import pandas as pd

In [None]:
import findspark
findspark.init()

import pyspark
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import pos_tag, pos_tag_sents
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('brown')

In [None]:
from textblob import TextBlob

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import spacy

In [None]:
# Raw strings keep the backslash literally instead of relying on Python passing the
# invalid "\ " escape through unchanged (which triggers a deprecation warning).
# NOTE(review): the backslash looks like a shell-style escape for the space in
# "My Drive" — confirm whether the Spark reader actually needs it.
YELP_REVIEW_PATH = r"drive/My\ Drive/yelp_dataset_1/review.json"
YELP_BUSINESS_PATH = r"drive/My\ Drive/yelp_dataset_1/business.json"

In [None]:
LABELED_REVIEW_PATH = r"drive/My Drive/review_1000_sent_token.xlsx"

# Start Spark session

In [None]:
spark = pyspark.sql.SparkSession.builder.appName('yelp_eda').getOrCreate()
spark

In [None]:
review_df = spark.read.format('json').option("inferSchema", True).load(YELP_REVIEW_PATH)
review_df.printSchema()

In [None]:
business_df = spark.read.format('json').option("inferSchema", True).load(YELP_BUSINESS_PATH)
business_df.printSchema()

In [None]:
#@title Merge review and restaurant business dataframes on business id
restaurant_business = business_df.where('categories like "%Restaurant%"').drop('stars')
merged_df = review_df.join(restaurant_business, on="business_id", how="inner")
merged_df.head()

In [None]:
#@title SQL command helper
def run_sql(statement):
    """Run a SQL statement through the notebook's Spark session.

    Args:
        statement: SQL text handed to ``spark.sql``.

    Returns:
        The object returned by ``spark.sql``, or ``None`` when the statement
        failed (the error is printed instead of being raised).
    """
    try:
        return spark.sql(statement)
    except Exception as e:
        # Not every exception carries ``desc`` / ``stackTrace``; fall back to the
        # exception itself so the handler cannot fail with AttributeError.
        print(getattr(e, 'desc', e), '\n', getattr(e, 'stackTrace', ''))
        return None

In [None]:
!rm -r spark-warehouse/yelp_dataset.db

In [None]:
run_sql('drop database if exists yelp_dataset cascade')
run_sql('create database yelp_dataset')
dbs = run_sql('show databases')
dbs.toPandas()

In [None]:
#@title Create database
permanent_table_name_reviews = "yelp_dataset.Reviews"
spark.conf.set("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation","true")
merged_df.write.mode('overwrite').format("parquet").saveAsTable(permanent_table_name_reviews)

In [None]:
run_sql('use yelp_dataset')
run_sql('REFRESH table Reviews')
tbls = run_sql('show tables')
tbls.toPandas()

# Pick random sample of 5000 reviews

In [None]:
#@title Create dataframe from 5000 randomly picked reviews
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42  # fixed seed so repeated evaluations pick the same rows
# `result` is lazy and is evaluated again when the sentences are collected later on.
# Seeding RAND() and caching the frame keeps that later evaluation on the same sample
# that is materialised into documents_df here.
result = run_sql(f'''
                    SELECT text 
                    FROM reviews
                    ORDER BY RAND({SAMPLE_SEED})
                    LIMIT {SAMPLE_SIZE}
                 ''').cache()
documents_df = result.toPandas()

# Pre-process review

In [None]:
#@title create a UDF out of sent_tokenize to apply to spark dataframe
udf_sent_tokenize = udf(sent_tokenize, ArrayType(StringType())).asNondeterministic()

In [None]:
#@title tokenize reviews to list of sentences, output is a list of rows with reviews tokenized to sentences
sentences = result.withColumn('text', udf_sent_tokenize(result.text)).collect()

In [None]:
#@title flatten tokenized sentences to list
# Each collected row carries a list of sentences in its `text` field; flatten them
# into one list, preserving row order.
sentences_tokenized = [sent for row in sentences for sent in row.text]

In [None]:
#@title create dataframe from list of sentences
sentence_df = pd.DataFrame(sentences_tokenized, columns=['text'])

In [None]:
#@title Tokenize reviews to sentences
# sentence_df = documents_df.text.apply(sent_tokenize)
# tokenized_docs_lst = [l for i, items in sentence_df.iteritems() for l in items]
# sentence_df = pd.DataFrame(tokenized_docs_lst, columns=['text'])

In [None]:
#@title Keep only sentences with more than 3 words
# The comparison is strict: sentences of 3 whitespace-separated words or fewer are dropped.
sentence_series = sentence_df[sentence_df.text.str.split().apply(len) > 3]
# Boolean indexing already returns a DataFrame, so no re-wrapping is needed. The index
# is reset so row labels match row positions, like the freshly built frames derived
# from this one further down.
sentence_df = sentence_series.reset_index(drop=True)

In [None]:
#@title Create instance of spacy
spacy_nlp = spacy.load('en', disable=['parser', 'ner'])

In [None]:
#@title Lemmatize the review sentences
def lemmatize_sentence(sentence):
  """Return `sentence` with every token replaced by its lemma, joined by single spaces.

  Tokens whose lemma is the '-PRON-' placeholder keep their own lower-cased text.
  Parsing is done with the module-level `spacy_nlp` pipeline.
  """
  pieces = []
  for token in spacy_nlp(sentence):
    if token.lemma_ == '-PRON-':
      pieces.append(token.text.lower())
    else:
      pieces.append(token.lemma_)
  return " ".join(pieces)

In [None]:
#@title udf for lemmatize function (not yet used)
udf_lemmatize_sentence = udf(lemmatize_sentence, StringType()).asNondeterministic()

In [None]:
#@title lemmatize sentences
lemmatized_sent_series = sentence_df.text.apply(lemmatize_sentence)
lemmatized_sent_df = pd.DataFrame(lemmatized_sent_series.to_list(), columns=['review_lemma'])

# Create sentence embeddings dataframe

In [None]:
#@title Get universal sentence encoder model from TF hub
def get_uni_sentence_encoder():
  """Load the sentence-encoder module chosen in the Colab form and return it."""
  module_url='https://tfhub.dev/google/universal-sentence-encoder-large/5' #@param ['https://tfhub.dev/google/universal-sentence-encoder/3', 'https://tfhub.dev/google/universal-sentence-encoder/4', 'https://tfhub.dev/google/universal-sentence-encoder/5', 'https://tfhub.dev/google/universal-sentence-encoder-large/5']
  return hub.load(module_url)

In [None]:
uni_sentence_encoder = get_uni_sentence_encoder()

Cluster on lemmatized sentences

In [None]:
#@title create grouped dataframes to encode in batches
grouped_df = lemmatized_sent_df.groupby(np.arange(len(lemmatized_sent_df)) // 400)

In [None]:
#@title encode lemmatized sentences in batches
# Gather the per-batch embeddings and concatenate once: np.append inside the loop
# re-copies the whole accumulated array on every iteration.
# The empty float seed keeps the original result for zero batches: a (0, 512) float array.
encoded_batches = [np.empty((0,512), dtype=float)]
for n, gp in grouped_df:
  encoded_batches.append(np.asarray(gp.apply(uni_sentence_encoder).iloc[0]))
encoded_sent_np = np.concatenate(encoded_batches, axis=0)

In [None]:
#@title Create model of AgglomerativeClustering algorithm with euclidean affinity
# Ward linkage works on Euclidean distance, which is also the estimator's default, so
# the `affinity` keyword is left out: newer scikit-learn releases renamed it to `metric`
# and reject the old name, and the install above does not pin a version.
model_for_lemmatized_sent = AgglomerativeClustering(n_clusters=2, linkage='ward')

In [None]:
#@title Fit model
clusters_lemma = model_for_lemmatized_sent.fit(X=encoded_sent_np)#model_for_lemmatized_sent.fit(X=encoded_lemma_sent_df.iloc[0].numpy())

In [None]:
cluster_lemma_label_df = pd.DataFrame(clusters_lemma.labels_.tolist(), columns=['label'])

In [None]:
#@title Label 0 clusters service/experience related sentences
# Select rows positionally (the mask is a plain array) where the cluster label is 0.
# NOTE(review): cluster ids come from the fit, so treating label 0 as the
# service/experience cluster is an assumption — confirm by inspecting the samples.
non_food_df = sentence_df[(cluster_lemma_label_df.label==0).values]
lemmatized_non_food_df = lemmatized_sent_df[(cluster_lemma_label_df.label==0).values]

In [None]:
#@title Label 1 clusters food related sentences
# Select rows positionally (the mask is a plain array) where the cluster label is 1.
# NOTE(review): cluster ids come from the fit, so treating label 1 as the food
# cluster is an assumption — confirm by inspecting the samples.
food_df = sentence_df[(cluster_lemma_label_df.label==1).values]
lemmatized_food_df = lemmatized_sent_df[(cluster_lemma_label_df.label==1).values]

In [None]:
lemmatized_non_food_df.tail().review_lemma.to_list()

In [None]:
lemmatized_food_df.tail().review_lemma.to_list()

In [None]:
#@title encode non-food dataframe
# Batches of 400 rows each, encoded one batch at a time.
grouped_df = lemmatized_non_food_df.groupby(np.arange(len(lemmatized_non_food_df)) // 400)
# Concatenate once at the end: np.append inside the loop re-copies the accumulated
# array on every iteration. The empty seed keeps the (0, 512) float result for zero batches.
encoded_batches = [np.empty((0,512), dtype=float)]
for n, gp in grouped_df:
  encoded_batches.append(np.asarray(gp.apply(uni_sentence_encoder).iloc[0]))
encoded_lemma_non_food_np = np.concatenate(encoded_batches, axis=0)

In [None]:
#@title encode food dataframe
# Batches of 400 rows each, encoded one batch at a time.
grouped_df = lemmatized_food_df.groupby(np.arange(len(lemmatized_food_df)) // 400)
# Concatenate once at the end: np.append inside the loop re-copies the accumulated
# array on every iteration. The empty seed keeps the (0, 512) float result for zero batches.
encoded_batches = [np.empty((0,512), dtype=float)]
for n, gp in grouped_df:
  encoded_batches.append(np.asarray(gp.apply(uni_sentence_encoder).iloc[0]))
encoded_lemma_food_np = np.concatenate(encoded_batches, axis=0)

In [None]:
lemmatized_food_df.head().review_lemma.to_list()

In [None]:
sent = 'we have sesame chicken , general tso , angel blosom , egg roll and hot and sour soup '

In [None]:
nlp = spacy.load('en')  

In [None]:
doc = nlp(sent)

In [None]:
for chunk in doc.noun_chunks:
  print(chunk.text)

In [None]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

In [None]:
df = pd.DataFrame(doc, columns=['tokens'])

In [None]:
def return_pos(token):
  """Return the `pos_` attribute of `token`."""
  pos_label = token.pos_
  return pos_label

In [None]:
df[(df.tokens.apply(return_pos) == 'NOUN').to_numpy()].tokens.to_list()

In [None]:
def extract_nouns(sentence, spacy_nlp_obj=nlp):
  """Return the tokens of `sentence` whose `pos_` is 'NOUN'.

  Args:
    sentence: text to analyse.
    spacy_nlp_obj: callable pipeline used to parse `sentence`; defaults to the
      module-level `nlp`.

  Returns:
    List of token objects, in sentence order, with ``pos_ == 'NOUN'``.
  """
  # Parse with the pipeline that was passed in, so the parameter is honoured
  # instead of silently falling back to a module-level pipeline.
  return [token for token in spacy_nlp_obj(sentence) if token.pos_ == 'NOUN']
  

In [None]:
#@title create series of all nouns from food_df
noun_series = food_df.text.apply(extract_nouns)

In [None]:
#@title extract unique nouns
# Token.text is used rather than the deprecated Token.string, which carries the
# trailing whitespace and would make "soup" and "soup " two distinct entries.
# A set comprehension also avoids echoing a long list of None as the cell output.
noun_set = {token.text for tokens in noun_series for token in tokens}

In [None]:
#@title create dataframe of nouns
noun_df = pd.DataFrame(list(noun_set), columns=['nouns'])

In [None]:
tok = noun_df.nouns.iloc[0]

# Filter food entity

In [None]:
#@title encode all the nouns
encoded_noun = noun_df.apply(uni_sentence_encoder)

In [None]:
#@title create cluster model to filter food nouns
# Ward linkage works on Euclidean distance, which is also the estimator's default, so
# the `affinity` keyword is left out: newer scikit-learn releases renamed it to `metric`
# and reject the old name, and the install above does not pin a version.
model_for_food_noun = AgglomerativeClustering(n_clusters=2, linkage='ward')

In [None]:
clusters_noun = model_for_food_noun.fit(X=encoded_noun.iloc[0].numpy())

In [None]:
cluster_noun_df = pd.DataFrame(clusters_noun.labels_.tolist(), columns=['label'])

In [None]:
noun_df[(cluster_noun_df.label==0).values].nouns.to_list()

In [None]:
noun_df[(cluster_noun_df.label==1).values].nouns.to_list()

In [None]:
sent = 'we have biryani, sesame chicken , general tso , angel blosom , egg roll and hot and sour soup '

In [None]:
##@title dictionary of pos tags from spacy to nltk
# spacy_nltk_pos = {'CCONJ':'CC', # coordinating conjunction
#                   'NUM':'CD', # cardinal digit
#                   'DET':'DT', # determiner
#                   'PRON':'EX', # existential
#                   'X':'FW', # foreign word
#                   'ADP':'IN', # preposition/subordinating conjunction
#                   'ADJ':'JJ', # adjective 
#                   'ADJ':'JJR', # adjective, comparative
#                   :'JJS', # adjective, superlative
#                   :'LS', # list marker
#                   :'MD', # modal
#                   :'NN', # noun, singular
#                   :'NNS', # noun plural
#                   :'NNP', # proper noun, singular
#                   :'NNPS', # proper noun, plural 
#                   :'PDT', # predeterminer
#                   :'POS', # possessive ending
#                   :'POR', # personal pronoun
#                   :'PRP$', # possessive pronoun
#                   :'RB', # adverb 
#                   :'RBR', # adverb, comparative
#                   :'RBS', # adverb, superlative
#                   :'RP', # particle 
#                   :'TO', # to go ‘to’ the store
#                   :'UH', # interjection
#                   :'VB', # verb
#                   :'VBD', # verb, past tense
#                   :'VBG', # verb, gerund/present participle
#                   :'VBN', # verb, past participle
#                   :'VBP', # verb, sing. present, known-3d take
#                   :'VBZ', # verb, 3rd person sing. present takes
#                   :'WDT', # wh-determiner which
#                   :'WP', # wh-pronoun who, what
#                   :'WP$', # possessive wh-pronoun whose
#                   :'WRB', # wh-adverb where, when
#                   }

In [None]:
type(pos_tag(word_tokenize(sent)))

In [None]:
#@title helper to tag in nltk format but using spacy internally
def spacy_pos_tag_to_nltk(sent, spacy_obj):
  """Tag `sent` with `spacy_obj` and return NLTK-style (text, tag) pairs.

  Args:
    sent: sentence to tag.
    spacy_obj: callable that turns `sent` into an iterable of tokens exposing
      `text` and `tag_`.

  Returns:
    List of ``(token.text, token.tag_)`` tuples in token order.
  """
  tagged = []
  for token in spacy_obj(sent):
    tagged.append((token.text, token.tag_))
  return tagged

In [None]:
#@title helper to chunk nouns
def chunk_nouns(sent, spacy_obj):
  """Chunk `sent` into noun phrases and return the complete parse tree.

  Tags are produced by `spacy_obj` through spacy_pos_tag_to_nltk. An NP is one or
  more noun tags, optionally preceded (in this order) by an adjective, a
  conjunction, an adjective and a determiner.
  """
  np_grammar = "NP: {<JJ>?<CC>?<JJ>?<DT>?<NN.*>+}"
  tagged_tokens = spacy_pos_tag_to_nltk(sent, spacy_obj)
  return RegexpParser(np_grammar).parse(tagged_tokens)

In [None]:
np_chunks = chunk_nouns(sent, nlp)

In [None]:
np_chunks

In [None]:
#@title helper to chunk nouns 2
def chunk_nouns_2(sent, spacy_obj):
  """Return every run of consecutive noun tags in `sent` as a space-joined string.

  Tags are produced by `spacy_obj` through spacy_pos_tag_to_nltk; only subtrees
  labelled 'NP' contribute to the result.
  """
  tagged_tokens = spacy_pos_tag_to_nltk(sent, spacy_obj)
  tree = RegexpParser("NP: {<NN.*>+}").parse(tagged_tokens)
  return [
      ' '.join(word for word, _ in np_subtree.leaves())
      for np_subtree in tree.subtrees(filter=lambda t: t.label() == 'NP')
  ]

In [None]:
#@title helper to chunk nouns 3
def chunk_nouns_3(sent, spacy_obj):
  """Return the noun phrases of `sent` as space-joined strings.

  Tags are produced by `spacy_obj` through spacy_pos_tag_to_nltk. An NP is one or
  more noun tags, optionally preceded by an adjective, which in turn may be
  preceded by an "adjective conjunction" pair (e.g. "hot and sour soup").
  """
  np_grammar = r"""NP: {(<JJ><CC>)?<JJ>?<NN.*>+}"""
  tagged_tokens = spacy_pos_tag_to_nltk(sent, spacy_obj)
  tree = RegexpParser(np_grammar).parse(tagged_tokens)
  return [
      ' '.join(word for word, _ in np_subtree.leaves())
      for np_subtree in tree.subtrees(filter=lambda t: t.label() == 'NP')
  ]

In [None]:
#@title filter food phrases with chunk_nouns_3
phrases_series = food_df.text.apply(chunk_nouns_3, args=(nlp,))

In [None]:
#@title extract unique noun phrases
# A set comprehension builds the same set without producing (and echoing as the
# cell output) a throw-away list of None values.
phrase_set = {phrase for phrases in phrases_series for phrase in phrases}

In [None]:
#@title create dataframe of noun phrases
# One row per unique phrase; row order follows the set's iteration order.
phrase_df = pd.DataFrame(list(phrase_set), columns=['phrases'])

In [None]:
#@title encode noun phrases
encoded_phrases = phrase_df.apply(uni_sentence_encoder)

In [None]:
#@title create cluster model to filter food phrases
# Ward linkage works on Euclidean distance, which is also the estimator's default, so
# the `affinity` keyword is left out: newer scikit-learn releases renamed it to `metric`
# and reject the old name, and the install above does not pin a version.
model_for_food_phrases = AgglomerativeClustering(n_clusters=2, linkage='ward')

In [None]:
clusters_phrase = model_for_food_phrases.fit(X=encoded_phrases.iloc[0].numpy())

In [None]:
cluster_phrase_df = pd.DataFrame(clusters_phrase.labels_.tolist(), columns=['label'])

In [None]:
phrase_df[(cluster_phrase_df.label==0).values].phrases.to_list()

In [None]:
phrase_df[(cluster_phrase_df.label==1).values].phrases.to_list()

In [None]:
len(food_df)

In [None]:
#@title get aggregated encoding of food phrases
encoded_pharses_np = encoded_phrases.iloc[0].numpy()

In [None]:
# Keep the embeddings of the phrases assigned to cluster label 0.
# NOTE(review): assumes label 0 is the food-phrase cluster — cluster ids come from the
# fit, so confirm against the label-0 / label-1 listings shown above.
food_pharses_np = encoded_pharses_np[(cluster_phrase_df.label==0).values]

In [None]:
food_phrases_mean_np = np.mean(food_pharses_np, axis=0)

In [None]:
#@title get encoding of food
food_encoding = uni_sentence_encoder(['food'])

In [None]:
food_np = food_encoding.numpy()[0]

In [None]:
#@title euclidean distance between food and food phrase aggregation
np.linalg.norm(food_np - food_phrases_mean_np)

# Filter reviews based on attributes

In [None]:
def euclidean_distance_from_filter(sentence_tensors, filter_tensor):
  """Returns the euclidean distance between sentence tensors and filter tensor.

  Args:
    sentence_tensors: embeddings with one entry per sentence along the first axis.
    filter_tensor: embedding to measure against; must broadcast against
      `sentence_tensors` under tf.subtract.

  Returns:
    Tensor holding one L2 distance per entry along the first axis of the difference.
  """
  diff = tf.subtract(sentence_tensors, filter_tensor)
  # Flatten everything behind the first axis and take a single vectorised norm
  # instead of dispatching tf.norm once per row through tf.map_fn.
  return tf.norm(tf.reshape(diff, [tf.shape(diff)[0], -1]), axis=1)

def similar_sentences(sent_filter_euclidean_tensor, sent=sentence_df, euclidean_dist=1.5):
  """Return the `text` values of rows in `sent` whose distance is below `euclidean_dist`.

  `sent_filter_euclidean_tensor` supplies one distance per row of `sent`; rows are
  selected with the resulting boolean mask and returned as a list.
  """
  # NOTE: the `sent` default is bound when this cell runs; re-run the cell if
  # sentence_df is rebuilt afterwards.
  within_range = sent_filter_euclidean_tensor.numpy() < euclidean_dist
  selected = sent[within_range]
  return selected.text.to_list()

In [None]:
def sentence_to_filter_cosine_similarity(sentence, filter: list, correlation=0.75):
  """Returns True if any word of sentence scores above correlation against any filter entry.

  Every word token of `sentence` and every entry of `filter` is embedded with
  `uni_sentence_encoder`; scores are the dot products between the two sets.

  Args:
    sentence: text to test; tokenised with word_tokenize.
    filter: list of filter words/phrases.
    correlation: score a (word, filter) pair must exceed.

  Returns:
    bool — True when at least one pair exceeds `correlation`; False for a
    sentence without tokens.
  """
  words = word_tokenize(sentence)
  if not words:
    return False
  correlation_tensor = tf.tensordot(uni_sentence_encoder(words), uni_sentence_encoder(filter), axes= [[1], [1]])
  # Reduce over the whole (words x filters) matrix: the builtin any() would try to
  # take the truth value of each row and raise once there is more than one filter.
  return bool((correlation_tensor.numpy() > correlation).any())

def bigram_sentence_to_filter_cosine_similarity(sentence, filter: list, correlation=0.75):
  """Returns True if any bigram of sentence scores above correlation against any filter entry.

  Bigrams are built from the whitespace-split words of `sentence`. Each bigram and
  each entry of `filter` is embedded with `uni_sentence_encoder`; scores are the
  dot products between the two sets.

  Args:
    sentence: text to test.
    filter: list of filter words/phrases.
    correlation: score a (bigram, filter) pair must exceed.

  Returns:
    bool — True when at least one pair exceeds `correlation`; False when the
    sentence has fewer than two words (no bigrams).
  """
  bigram = [' '.join(pair) for pair in nltk.bigrams(sentence.split())]
  if not bigram:
    return False
  correlation_tensor = tf.tensordot(uni_sentence_encoder(bigram), uni_sentence_encoder(filter), axes= [[1], [1]])
  # Reduce over the whole (bigrams x filters) matrix: the builtin any() would try to
  # take the truth value of each row and raise once there is more than one filter.
  return bool((correlation_tensor.numpy() > correlation).any())

In [None]:
#@title filter seafood
seafood_df = pd.DataFrame(similar_sentences(euclidean_distance_from_filter(encoded_sent_np, uni_sentence_encoder(["seafood"])), euclidean_dist=1.25), columns=['text'])

In [None]:
#seafood_mask_series = lemmatized_sent_df.review_lemma.apply(sentence_to_filter_cosine_similarity, args=(['seafood'],0.7))
seafood_mask_series = seafood_df.text.apply(sentence_to_filter_cosine_similarity, args=(['seafood'],0.7))

In [None]:
seafood_df[seafood_mask_series.to_numpy()].text.to_list()

In [None]:
#@title filter "happy hour"
happy_hour_df = pd.DataFrame(similar_sentences(euclidean_distance_from_filter(encoded_sent_np, uni_sentence_encoder(["happy hour"])), euclidean_dist=1.1), columns=['text'])

In [None]:
happy_hour_mask_series = happy_hour_df.text.apply(bigram_sentence_to_filter_cosine_similarity, args=(['happy hour'], 0.9))

In [None]:
happy_hour_df[happy_hour_mask_series.to_numpy()].text.to_list()

In [None]:
#@title filter ambience
ambience_df = pd.DataFrame(similar_sentences(euclidean_distance_from_filter(encoded_sent_np, uni_sentence_encoder(["ambience"])), euclidean_dist=1.25), columns=['text'])

In [None]:
ambience_mask_series = ambience_df.text.apply(sentence_to_filter_cosine_similarity, args=(['ambience'],0.7))

In [None]:
ambience_df[ambience_mask_series.to_numpy()].text.to_list()

In [None]:
#@title filter food from food_df
food_mask_series = lemmatized_food_df.review_lemma.apply(sentence_to_filter_cosine_similarity, args=(['food'],0.7))

In [None]:
food_mask_series.describe()

In [None]:
lemmatized_food_df[food_mask_series.to_numpy()]

In [None]:
phrase_lst = []

In [None]:
string = "the Waffles"

In [None]:
string.lower()

In [None]:
import spacy

In [None]:
#@title Create instance of spacy
spacy_nlp = spacy.load('en', disable=['parser', 'ner'])

In [None]:
#@title Lemmatize the review sentences
def lemmatize_sentence(sentence):
  """Return `sentence` with every token replaced by its lemma, joined by single spaces.

  Tokens whose lemma is the '-PRON-' placeholder keep their own lower-cased text.
  This repeats the `lemmatize_sentence` defined earlier in the notebook.
  """
  pieces = []
  for token in spacy_nlp(sentence):
    if token.lemma_ == '-PRON-':
      pieces.append(token.text.lower())
    else:
      pieces.append(token.lemma_)
  return " ".join(pieces)

In [None]:
lemmatize_sentence(string.lower())