In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
sb.set(style="darkgrid")
from operator import add

In [32]:
import findspark
findspark.init()
findspark.find()
import itertools
import pyspark
import sys
import time
import json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions  import date_format

from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import CountVectorizer

from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel

from wordcloud import WordCloud

In [3]:
spark = SparkSession \
    .builder \
    .appName("Yelp Recommender system") \
    .getOrCreate()

sqlContext = SQLContext(spark)

In [5]:
business_df=spark.read.json('business.json')
user_df=spark.read.json('user.json')
review_df=spark.read.json('train_review (1).json')

In [6]:
user_avg = spark.sparkContext.textFile('user_avg.json').persist().map(lambda x:json.loads(x))
user_avg_dict=user_avg.take(1)
user_avg_dict= list(map(list, user_avg_dict[0].items()))
user_avg_df = spark.createDataFrame(user_avg_dict, ["user_id", "average_stars"])

business_avg = spark.sparkContext.textFile('business_avg.json').persist().map(lambda x:json.loads(x))
business_avg_dict=business_avg.take(1)
business_avg_dict= list(map(list, business_avg_dict[0].items()))
business_avg_df = spark.createDataFrame(business_avg_dict, ["business_id", "average_stars"])

In [31]:
user_avg_df.count()

36590

In [32]:
user_df.count()

91730

In [34]:
business_df=business_df.join(business_avg_df,on='business_id',how='inner')

In [7]:
print('The schema of the business json file is as follows')
business_df.printSchema()
print('\n\n')
print('The schema of the user json file is as follows')
user_df.printSchema()
print('\n\n')
print('The schema of the review json file is as follows')
review_df.printSchema()
print('\n\n')
print('The schema of the business average json file is as follows')
business_avg_df.printSchema()
print('\n\n')
print('The schema of the user average json file is as follows')
user_avg_df.printSchema()
print('\n\n')

The schema of the business json file is as follows
root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: str

In [8]:
print('The count of the records in business dataframe are')
print(business_df.count())
print('\n')
print('The count of the records in users dataframe are')
print(user_df.count())
print('\n')
print('The count of the records in reviews dataframe are')
print(review_df.count())
print('\n')
print('The count of the records in user average rating dataframe are')
print(user_avg_df.count())
print('\n')
print('The count of the records in business average rating dataframe are')
print(business_avg_df.count())
print('\n')

The count of the records in business dataframe are
13167


The count of the records in users dataframe are
91730


The count of the records in reviews dataframe are
1029758


The count of the records in user average rating dataframe are
36590


The count of the records in business average rating dataframe are
13151




In [9]:
print('The sample business spark dataframe looks as')
print(business_df.show(1))
print('\n')
print('The sample users spark dataframe looks as')
print(business_df.show(1))
print('\n')
print('The sample reviews spark dataframe looks as')
print(business_df.show(1))
print('\n')

The sample business spark dataframe looks as
+--------------------+--------------------+--------------------+--------------------+---------+-----+-------+----------+------------+--------------------+-----------+-----+
|             address|          attributes|         business_id|          categories|     city|hours|is_open|  latitude|   longitude|                name|postal_code|state|
+--------------------+--------------------+--------------------+--------------------+---------+-----+-------+----------+------------+--------------------+-----------+-----+
|1775 E Tropicana ...|[,, u'full_bar', ...|PZ-LZzSlhSe9utkQY...|Restaurants, Italian|Las Vegas| null|      0|36.1000163|-115.1285285|Carluccio's Tivol...|      89119|   NV|
+--------------------+--------------------+--------------------+--------------------+---------+-----+-------+----------+------------+--------------------+-----------+-----+
only showing top 1 row

None


The sample users spark dataframe looks as
+----------------

In [10]:
#SQL table creation
business_df.createOrReplaceTempView('businesses')
user_df.createOrReplaceTempView('users')
review_df.createOrReplaceTempView('reviews')

In [11]:
query1="""
SELECT * FROM businesses LIMIT 1
"""
sqlContext.sql(query1).toPandas()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,state
0,"1775 E Tropicana Ave, Ste 29","(None, None, u'full_bar', {'romantic': True, '...",PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian",Las Vegas,,0,36.100016,-115.128529,Carluccio's Tivoli Gardens,89119,NV


In [48]:
query2="""
SELECT * FROM users LIMIT 1
"""
sqlContext.sql(query2).toPandas()

Unnamed: 0,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,elite,fans,friends,funny,name,useful,user_id,yelping_since
0,0,0,0,0,0,0,1,0,0,0,0,10,,0,"4N-HU_T32hLENLntsNKNBg, pSY2vwWLgWfGVAAiKQzMng...",8,David,28,bc8C_eETBWL0olvFSJJd0w,2013-10-04 00:16:10


In [49]:
query3="""
SELECT * FROM reviews LIMIT 1
"""
sqlContext.sql(query3).toPandas()

Unnamed: 0,business_id,date,review_id,stars,text,user_id
0,CalbswWuvigtvE2GvP62bw,2017-11-01 16:56:16,WdHD4FF40jKxLo9iNiNCSw,4.0,When I first heard about the title of this new...,6pzA8EDhHgW3A5-rBObhBA


In [50]:
query="""
SELECT count(DISTINCT business_id) as business_count FROM businesses
"""
print(sqlContext.sql(query).toPandas())

query="""
SELECT count(DISTINCT user_id) as user_count FROM users
"""
print(sqlContext.sql(query).toPandas())

query="""
SELECT count(DISTINCT review_id) as review_count FROM reviews
"""
print(sqlContext.sql(query).toPandas())

   business_count
0           13149
   user_count
0       91730
   review_count
0       1029758


In [58]:
query2="""
SELECT * FROM reviews LIMIT 1
"""
sqlContext.sql(query2).toPandas()

Unnamed: 0,business_id,date,review_id,stars,text,user_id
0,CalbswWuvigtvE2GvP62bw,2017-11-01 16:56:16,WdHD4FF40jKxLo9iNiNCSw,4.0,When I first heard about the title of this new...,6pzA8EDhHgW3A5-rBObhBA


In [67]:
#useful reviewers for restaurants
query="""
SELECT stars, count(*) as count FROM reviews GROUP BY stars ORDER BY count DESC
"""
sqlContext.sql(query).toPandas()

Unnamed: 0,stars,count
0,5.0,436571
1,4.0,260408
2,3.0,136177
3,1.0,113152
4,2.0,83450


In [71]:
#useful reviewers list
query="""
SELECT user_id,name,yelping_since,useful FROM users ORDER BY useful DESC LIMIT 5
"""
sqlContext.sql(query).toPandas()

Unnamed: 0,user_id,name,yelping_since,useful
0,--2vR0DIsmQ6WfcSzKWigw,Harald,2012-11-27 14:19:33,154202
1,W7DHyQlY_kXls2iXt-_2Ag,Maggie,2008-11-30 02:47:32,89792
2,ax7SnXOTIpatbsmqHLqVow,Rohlin,2010-01-11 01:50:32,81003
3,8k3aO-mPeyhbR5HUucA5aA,Victor,2007-12-08 14:56:45,79512
4,eKUGKQRE-Ywi5dY55_zChg,Cherylynn,2008-01-20 06:27:03,78888


In [62]:
#ranking of top 10 reviewers by count

query=
"""
SELECT u.user_id,u.name,u.yelping_since,u.useful,r.total_reviews 
FROM users u
INNER JOIN(
SELECT user_id,count(*) as total_reviews 
FROM reviews 
WHERE business_id in(
SELECT business_id 
FROM businesses 
where FIND_IN_SET('Restaurants',categories)>0 )
GROUP BY user_id ) r  
ON u.user_id=r.user_id
ORDER BY r.total_reviews DESC LIMIT 10
"""
sqlContext.sql(query).toPandas()

Unnamed: 0,user_id,name,yelping_since,useful,total_reviews
0,bLbSNkLggFnqwNNzzq-Ijw,Stefany,2011-06-29 14:40:01,44667,242
1,PKEzKWv_FktMm2mGPjwd0Q,Norm,2008-12-12 02:30:13,18768,169
2,UYcmGbelzRa0Q6JqzLoguw,Emily,2010-10-18 17:09:38,10550,132
3,3nDUQBjKyVor5wV0reJChg,Nelson,2008-12-05 19:18:51,11908,124
4,L8P5OWO1Jh4B2HLa1Fnbng,Westie,2007-10-03 18:09:28,1338,123
5,U4INQZOPSUaj8hMjLlZ3KA,Michael,2008-01-31 02:55:30,22757,116
6,_VMGbmIeK71rQGwOBWt_Kg,Chris,2007-08-12 22:57:19,6632,113
7,n86B7IkbU20AkxlFX_5aew,Jade,2009-04-28 04:02:58,6243,110
8,0tvCcnfJnSs55iB6mqPk3w,Daniel,2011-02-06 16:43:56,11432,93
9,C2C0GPKvzWWnP57Os9eQ0w,Clint,2009-06-24 00:20:54,5838,89


In [85]:
#ranking of top 10 reviewers by usefulness
query="""
SELECT u.user_id,u.name,u.yelping_since,u.useful,r.total_reviews 
FROM users u
INNER JOIN(
SELECT user_id,count(*) as total_reviews 
FROM reviews 
WHERE business_id in(
SELECT business_id 
FROM businesses 
where FIND_IN_SET('Restaurants',categories)>0 )
GROUP BY user_id ) r  
ON u.user_id=r.user_id
ORDER BY u.useful DESC LIMIT 5
"""
sqlContext.sql(query).toPandas()

Unnamed: 0,user_id,name,yelping_since,useful,total_reviews
0,--2vR0DIsmQ6WfcSzKWigw,Harald,2012-11-27 14:19:33,154202,3
1,W7DHyQlY_kXls2iXt-_2Ag,Maggie,2008-11-30 02:47:32,89792,8
2,ax7SnXOTIpatbsmqHLqVow,Rohlin,2010-01-11 01:50:32,81003,1
3,8k3aO-mPeyhbR5HUucA5aA,Victor,2007-12-08 14:56:45,79512,1
4,eKUGKQRE-Ywi5dY55_zChg,Cherylynn,2008-01-20 06:27:03,78888,8


In [81]:
#top 10 restaurants in the state Las Vegas
query="""
SELECT b.business_id,b.name,b.city,b.state,r.count,int(b.average_stars) 
FROM businesses b
INNER JOIN(
SELECT business_id, count(*) as count
FROM reviews
GROUP BY business_id
) r ON b.business_id=r.business_id 
WHERE FIND_IN_SET('Restaurants',categories)>0 AND b.city='Las Vegas'
ORDER BY r.count DESC LIMIT 10
"""
sqlContext.sql(query).toPandas()

Unnamed: 0,business_id,name,city,state,count,average_stars
0,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,Las Vegas,NV,3295,3
1,cYwJA2A6I12KNkm2rtXd5g,Gordon Ramsay BurGR,Las Vegas,NV,1930,3
2,faPVqws-x-5k2CQKDNtHxw,Yardbird Southern Table & Bar,Las Vegas,NV,1160,4
3,awI4hHMfa7H0Xf0-ChU5hg,The Oyster Bar,Las Vegas,NV,1130,4
4,ECOkEVUodMLUxvI0PMI4gQ,TAO Nightclub,Las Vegas,NV,1116,3
5,JyxHvtj-syke7m9rbza7mA,Sushi House Goyemon,Las Vegas,NV,1079,3
6,LR0qF0FEVsCOhYWUOiH26A,The Buffet at ARIA,Las Vegas,NV,1050,3
7,ii8sAGBexBOJoYRFafF9XQ,Paris Las Vegas Hotel & Casino,Las Vegas,NV,988,3
8,gx2yPrOJSwF1ApJYdGBWIw,JINYA Ramen Bar,Las Vegas,NV,893,4
9,Fi-2ruy5x600SX4avnrFuA,Spice Market Buffet,Las Vegas,NV,879,3


In [83]:
#top cities with higher review count
query="""
SELECT b.city, count(*) AS total_businesses
FROM businesses b
GROUP BY b.city
ORDER BY total_businesses DESC LIMIT 5
"""
sqlContext.sql(query).toPandas()

Unnamed: 0,city,total_businesses
0,Las Vegas,10814
1,Henderson,1713
2,North Las Vegas,481
3,Boulder City,60
4,LAS VEGAS,12


In [19]:
#top 10 restaurants with high five star counts
query="""
SELECT r.business_id,b.name, count(*) as five_star_count 
FROM reviews r
INNER JOIN
businesses b
ON b.business_id=r.business_id
WHERE r.stars>4.0
GROUP BY r.business_id,b.name
ORDER BY count(*) DESC
LIMIT 10
"""
sqlContext.sql(query).toPandas()

#used while building word clouds
top_business_df=sqlContext.sql(query).toPandas()

In [107]:
monthly_review_stats_df=review_df.select(date_format("date","MM/yyyy").alias("review_month"),"stars")\
.groupBy("review_month","stars").count()\
.orderBy("review_month","stars",ascending=False)
monthly_review_stats_df.show(10)

+------------+-----+-----+
|review_month|stars|count|
+------------+-----+-----+
|     12/2017|  5.0| 6713|
|     12/2017|  4.0| 2804|
|     12/2017|  3.0| 1393|
|     12/2017|  2.0|  910|
|     12/2017|  1.0| 1674|
|     12/2016|  5.0| 5867|
|     12/2016|  4.0| 2613|
|     12/2016|  3.0| 1281|
|     12/2016|  2.0|  881|
|     12/2016|  1.0| 1678|
+------------+-----+-----+
only showing top 10 rows



In [126]:
query="""
SELECT date_format(date,'yyyy-MM') as month_reviews,stars,COUNT(*) as star_count
FROM reviews
GROUP BY date_format(date,'yyyy-MM'),stars
ORDER BY 1,2
"""
monthly_review_stats=sqlContext.sql(query).toPandas()
monthly_review_stats.tail(10)

Unnamed: 0,month_reviews,stars,star_count
798,2018-10,1.0,1688
799,2018-10,2.0,817
800,2018-10,3.0,1186
801,2018-10,4.0,2408
802,2018-10,5.0,6171
803,2018-11,1.0,700
804,2018-11,2.0,390
805,2018-11,3.0,585
806,2018-11,4.0,1031
807,2018-11,5.0,2772


## DATA VISUALISATIONS

In [None]:
# Many more visualizations to come



## TEXT MINING AND ANALYTICS

In [12]:
reviews_rdd=review_df.rdd

In [13]:
#rdd level
print('Printed below is the review given to a business_id given by a user')
reviews_rdd.map(lambda x:(x['business_id'],x['text'])).take(1)

Printed below is the review given to a business_id given by a user


[('CalbswWuvigtvE2GvP62bw',
  'When I first heard about the title of this new show at Rio from my family. It instantly got me thinking about the, Wonderful World of Color Show at Disney California Adventure. I was excited and curious how this show would turn out to be. Considering there is not a lot of advertisements about this show. \n\nMy family and I got the opportunity to check this show on Sunday 10/29. We were quite surprised that the venue is smaller than most theaters on the Strip but we were pleased that they created an intimate setting with large HD screens that made up half of the venue along the stage and walls. Making images appear larger than life and visually stunning.\n\nEach performance was great and a lot of talented performers from acrobatic stunts, juggling, roller skating and various other acts that keeps you engage and entertained. The use of pop culture music and visual aid provided a nice backdrop that immerse you in each location as if your transported to vario

In [14]:
#merging all the reviews with respect to business ids
merged_rdd=reviews_rdd.map(lambda x:(x['business_id'],x['text'])).reduceByKey(add)
merged_review_df=merged_rdd.toDF(['business_id','merged_text'])

print('Merged text of all reviews of a single business_id is as follows')
reviews_rdd.map(lambda x:(x['business_id'],x['text'])).reduceByKey(add).take(1)

Merged text of all reviews of a single business_id is as follows


[('CalbswWuvigtvE2GvP62bw',
  'When I first heard about the title of this new show at Rio from my family. It instantly got me thinking about the, Wonderful World of Color Show at Disney California Adventure. I was excited and curious how this show would turn out to be. Considering there is not a lot of advertisements about this show. \n\nMy family and I got the opportunity to check this show on Sunday 10/29. We were quite surprised that the venue is smaller than most theaters on the Strip but we were pleased that they created an intimate setting with large HD screens that made up half of the venue along the stage and walls. Making images appear larger than life and visually stunning.\n\nEach performance was great and a lot of talented performers from acrobatic stunts, juggling, roller skating and various other acts that keeps you engage and entertained. The use of pop culture music and visual aid provided a nice backdrop that immerse you in each location as if your transported to vario

In [15]:
#tokenizing all reviews of businesses
regexTokenizer = RegexTokenizer(inputCol = 'merged_text', gaps = False, pattern = '\w+',outputCol = 'tokens_created')
token_review_df=regexTokenizer.transform(merged_review_df)
token_review_df.show(5)

+--------------------+--------------------+--------------------+
|         business_id|         merged_text|      tokens_created|
+--------------------+--------------------+--------------------+
|CalbswWuvigtvE2Gv...|When I first hear...|[when, i, first, ...|
|HhLVQg2DAmMll14mN...|I have been here ...|[i, have, been, h...|
|YC8Llp3jlgKh8b-cz...|All I have to say...|[all, i, have, to...|
|_8VkSb_Mryb6bHQxS...|The new location ...|[the, new, locati...|
|xOZd6roDjtb_ILWEw...|We had photos tak...|[we, had, photos,...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [16]:
#stopword removal in text
remover = StopWordsRemover(inputCol="tokens_created", outputCol="filtered_tokens")
filtered_df=remover.transform(token_review_df)
filtered_df.show(5)

+--------------------+--------------------+--------------------+--------------------+
|         business_id|         merged_text|      tokens_created|     filtered_tokens|
+--------------------+--------------------+--------------------+--------------------+
|CalbswWuvigtvE2Gv...|When I first hear...|[when, i, first, ...|[first, heard, ti...|
|HhLVQg2DAmMll14mN...|I have been here ...|[i, have, been, h...|[many, times, nev...|
|YC8Llp3jlgKh8b-cz...|All I have to say...|[all, i, have, to...|[say, one, word, ...|
|_8VkSb_Mryb6bHQxS...|The new location ...|[the, new, locati...|[new, location, o...|
|xOZd6roDjtb_ILWEw...|We had photos tak...|[we, had, photos,...|[photos, taken, d...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [17]:
#vectorization of features
count_Vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorized_features")
countVectorizer_model = count_Vectorizer.fit(filtered_df)
vectorized_df = countVectorizer_model.transform(filtered_df)
vectorized_df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|         merged_text|      tokens_created|     filtered_tokens| vectorized_features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|CalbswWuvigtvE2Gv...|When I first hear...|[when, i, first, ...|[first, heard, ti...|(262144,[0,1,3,4,...|
|HhLVQg2DAmMll14mN...|I have been here ...|[i, have, been, h...|[many, times, nev...|(262144,[0,1,2,3,...|
|YC8Llp3jlgKh8b-cz...|All I have to say...|[all, i, have, to...|[say, one, word, ...|(262144,[0,1,2,3,...|
|_8VkSb_Mryb6bHQxS...|The new location ...|[the, new, locati...|[new, location, o...|(262144,[0,1,2,3,...|
|xOZd6roDjtb_ILWEw...|We had photos tak...|[we, had, photos,...|[photos, taken, d...|(262144,[0,1,3,4,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [18]:
#tf-idf featurization
iDF = IDF(inputCol="vectorized_features", outputCol="inverseDoc_freq_vec")
idf_model = iDF.fit(vectorized_df)
tfiDF_df = idf_model.transform(vectorized_df) 
tfiDF_df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|         merged_text|      tokens_created|     filtered_tokens| vectorized_features| inverseDoc_freq_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|CalbswWuvigtvE2Gv...|When I first hear...|[when, i, first, ...|[first, heard, ti...|(262144,[0,1,3,4,...|(262144,[0,1,3,4,...|
|HhLVQg2DAmMll14mN...|I have been here ...|[i, have, been, h...|[many, times, nev...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|
|YC8Llp3jlgKh8b-cz...|All I have to say...|[all, i, have, to...|[say, one, word, ...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|
|_8VkSb_Mryb6bHQxS...|The new location ...|[the, new, locati...|[new, location, o...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|
|xOZd6roDjtb_ILWEw...|We had photos tak...|[we, had, photos,...|[photos, taken, d...|(262144,[0,1,3,4,..

In [None]:
##takes too much of time to run
##better dont run it now

#word2vec model
word2Vec_init = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'filtered_tokens', outputCol = 'word2vec')
word2Vec_model = word2Vec_init.fit(filtered_df)
word2Vec_model = Word2Vec_model.load('models/' + 'word2Vec')
word2Vec_model.transform(tfiDF_df).select('business_id','filtered_tokens','word2vec').show(5)

## WORD CLOUD BUILDING

In [30]:
def word_cloud(text):
    wordcloud = WordCloud().generate(text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

In [None]:
count=0
for business_id in top_business_df['business_id']:
    if count<5:
        text=''.join(tfiDF_df.filter(tfiDF_df.business_id=='4JNXUYY8wbaaDmk3BPzlWw').select('filtered_tokens').rdd.take(1)[0][0])
        print('The word cloud of business_id '+business_id+' is:')
        word_cloud(text)
        count+=1

The word cloud of business_id4JNXUYY8wbaaDmk3BPzlWwis:


In [None]:
#cosine similarity
def CosineSimilarity(vector1, vector2):
    value1=np.dot(vector1, vector2)
    value2=np.sqrt(np.dot(vector1, vector1))
    value3=np.sqrt(np.dot(vector2, vector2))
    return value1 / value2 / value3

In [None]:
#check similarity between business_ids
def check_similarity(business_id1,business_id2):
    #merged review text of businesses
    id1_review_text=''.join(tfiDF_df.filter(tfiDF_df.business_id==business_id1).select('filtered_tokens').rdd.take(1)[0][0])
    id2_review_text=''.join(tfiDF_df.filter(tfiDF_df.business_id==business_id2).select('filtered_tokens').rdd.take(1)[0][0])
    
    #word cloud formation
    word_cloud(id1_review_text)
    word_cloud(id2_review_text)
    
    #iDF vector
    vec1=tfiDF_df.filter(tfiDF_df.business_id==business_id1).select('inverseDoc_freq_vec').rdd.take(1)[0][0]
    vec2=tfiDF_df.filter(tfiDF_df.business_id==business_id2).select('inverseDoc_freq_vec').rdd.take(1)[0][0]
    
    #Dense vector formation
    dense_vec1=DenseVector(vec1.toArray())
    dense_vec2=DenseVector(vec2.toArray())
    
    print("cosine similarity of the given businesses based on idf vectors is: "+str(CosineSimilarity(dense_vec1,dense_vec2)))