In [1]:
from pyspark.sql import SparkSession
from textblob import TextBlob

In [2]:
spark = SparkSession.builder.appName('Yelp').getOrCreate()
#sc.stop()

In [3]:
sc.stop()

In [4]:
sc = SparkContext("local[*]")
spark = SparkSession(sc).builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")


In [5]:
#1. Clean the dataset
data = spark.read.csv("C:\\Users\\alire\\Desktop\\Scalable\\Final\\yelp_review.csv",inferSchema=True,header=True)
#df = spark.read.format("csv").option("header", "true").option("multiline","true").load("C:\\Users\\alire\\Desktop\\Scalable\\yelp_review.csv")
data.printSchema()
data.show()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- cool: string (nullable = true)

+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+
|           review_id|             user_id|         business_id|stars|      date|                text|useful|funny|cool|
+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+
|vkVSCC7xljjrAI4UG...|bv2nCi5Qv5vroFiqK...|AEx2SYEUJmTxVVB18...|    5|2016-05-28|Super simple plac...|  null| null|null|
|Staff was very he...|                   0|                   0|    0|      null|                null|  null| null|null|
|n6QzIUObkYshz4dz2...|bv2nCi5Qv5vroFiqK...|VR6

In [6]:
data = data.withColumn("label", data["stars"].cast("double"))
data = data.dropna(subset=['label', 'text'])

data = data.select('text','label')
data=data.filter(data['label']<=5.0)
data=data.filter(data['label']>0.0)
data.show()

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Super simple plac...|  5.0|
|Small unassuming ...|  5.0|
|Lester's is locat...|  5.0|
|Love coming here....|  4.0|
|Had their chocola...|  4.0|
|Cycle Pub Las Veg...|  5.0|
|Who would have gu...|  4.0|
|Always drove past...|  4.0|
|Not bad!! Love th...|  3.0|
|    Love this place!|  5.0|
|This is currently...|  4.0|
|Server was a litt...|  3.0|
|I thought Tidy's ...|  1.0|
|Wanted to check o...|  3.0|
|This place is awe...|  5.0|
|a must stop when ...|  4.0|
|I too have been t...|  1.0|
|Came here with my...|  3.0|
|Came here for a b...|  3.0|
|really excited to...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [7]:
print('Number of rows (original data)=',data.count())


Number of rows (original data)= 5415605


In [8]:
(training,testing) = data.randomSplit([0.903,0.097],seed=100)
testing.show()
print('Number of rows (Test dataset)=',testing.count())

+--------------------+-----+
|                text|label|
+--------------------+-----+
|   !!! STAY AWAY!!!!|  1.0|
|"$2 dollar tacos ...|  4.0|
|"$7 toast and $16...|  1.0|
|"...and if you ev...|  4.0|
|"0\/0 Stars. 6 mo...|  1.0|
|"1. Remodeled roo...|  3.0|
|"2.5 stars, reall...|  2.0|
|"3 of us missed o...|  2.0|
|"5 stars for the ...|  3.0|
|"A ""home cooked"...|  3.0|
|"A French inspire...|  3.0|
|"A bald headed fa...|  1.0|
|"A fellow Yelper ...|  3.0|
|"A friend and I h...|  5.0|
|"A friend and I w...|  5.0|
|"A good haircut i...|  5.0|
|"A good""away fro...|  4.0|
|"A great happy ho...|  3.0|
|"A little hard to...|  4.0|
|"A name that impl...|  2.0|
+--------------------+-----+
only showing top 20 rows

Number of rows (Test dataset)= 525908


In [10]:
from pyspark.sql.functions import length
testing = testing.withColumn('length',length(testing['text']))
testing.show()


+--------------------+-----+------+
|                text|label|length|
+--------------------+-----+------+
|   !!! STAY AWAY!!!!|  1.0|    17|
|"$2 dollar tacos ...|  4.0|   328|
|"$7 toast and $16...|  1.0|    35|
|"...and if you ev...|  4.0|    98|
|"0\/0 Stars. 6 mo...|  1.0|   482|
|"1. Remodeled roo...|  3.0|    80|
|"2.5 stars, reall...|  2.0|   193|
|"3 of us missed o...|  2.0|   574|
|"5 stars for the ...|  3.0|   286|
|"A ""home cooked"...|  3.0|    34|
|"A French inspire...|  3.0|   222|
|"A bald headed fa...|  1.0|   867|
|"A fellow Yelper ...|  3.0|   102|
|"A friend and I h...|  5.0|   268|
|"A friend and I w...|  5.0|   529|
|"A good haircut i...|  5.0|    81|
|"A good""away fro...|  4.0|    38|
|"A great happy ho...|  3.0|   178|
|"A little hard to...|  4.0|   208|
|"A name that impl...|  2.0|    82|
+--------------------+-----+------+
only showing top 20 rows



In [11]:
from pyspark.sql.functions import udf
pol = lambda x: round(TextBlob(x).sentiment.polarity,2)
sub = lambda x: round(TextBlob(x).sentiment.subjectivity,2)

pol = udf(pol)
sub= udf(sub)

In [12]:
df2 = testing.withColumn("Polarity", pol(testing['text'])).withColumn("Subjectivity", sub(testing['text']))

In [13]:
df2.show()

+--------------------+-----+------+--------+------------+
|                text|label|length|Polarity|Subjectivity|
+--------------------+-----+------+--------+------------+
|   !!! STAY AWAY!!!!|  1.0|    17|     0.0|         0.0|
|"$2 dollar tacos ...|  4.0|   328|    0.15|        0.58|
|"$7 toast and $16...|  1.0|    35|     0.0|         0.0|
|"...and if you ev...|  4.0|    98|    -0.1|         0.7|
|"0\/0 Stars. 6 mo...|  1.0|   482|   -0.21|        0.46|
|"1. Remodeled roo...|  3.0|    80|    0.57|        0.71|
|"2.5 stars, reall...|  2.0|   193|     0.4|        0.55|
|"3 of us missed o...|  2.0|   574|    -0.0|        0.61|
|"5 stars for the ...|  3.0|   286|    0.31|        0.53|
|"A ""home cooked"...|  3.0|    34|     0.0|         0.0|
|"A French inspire...|  3.0|   222|     0.1|        0.44|
|"A bald headed fa...|  1.0|   867|    0.01|        0.56|
|"A fellow Yelper ...|  3.0|   102|     0.0|        0.25|
|"A friend and I h...|  5.0|   268|    0.13|        0.32|
|"A friend and

In [14]:
df3=df2.select('text','label','length',df2.Polarity.cast('float').alias('Polarity'),df2.Subjectivity.cast('float').alias('Subjectivity'))
df3.show()

+--------------------+-----+------+--------+------------+
|                text|label|length|Polarity|Subjectivity|
+--------------------+-----+------+--------+------------+
|   !!! STAY AWAY!!!!|  1.0|    17|     0.0|         0.0|
|"$2 dollar tacos ...|  4.0|   328|    0.15|        0.58|
|"$7 toast and $16...|  1.0|    35|     0.0|         0.0|
|"...and if you ev...|  4.0|    98|    -0.1|         0.7|
|"0\/0 Stars. 6 mo...|  1.0|   482|   -0.21|        0.46|
|"1. Remodeled roo...|  3.0|    80|    0.57|        0.71|
|"2.5 stars, reall...|  2.0|   193|     0.4|        0.55|
|"3 of us missed o...|  2.0|   574|    -0.0|        0.61|
|"5 stars for the ...|  3.0|   286|    0.31|        0.53|
|"A ""home cooked"...|  3.0|    34|     0.0|         0.0|
|"A French inspire...|  3.0|   222|     0.1|        0.44|
|"A bald headed fa...|  1.0|   867|    0.01|        0.56|
|"A fellow Yelper ...|  3.0|   102|     0.0|        0.25|
|"A friend and I h...|  5.0|   268|    0.13|        0.32|
|"A friend and

In [15]:
df4=df3.groupby('label').mean()
df4.show()

+-----+----------+------------------+--------------------+------------------+
|label|avg(label)|       avg(length)|       avg(Polarity)| avg(Subjectivity)|
+-----+----------+------------------+--------------------+------------------+
|  1.0|       1.0|378.81585111920873|-0.04804177512916533|0.4594122845338699|
|  4.0|       4.0| 286.4443320089911|   0.272713684867052|0.5349620523030915|
|  3.0|       3.0|295.62965108793617| 0.16987527360706428|0.4897362229618307|
|  2.0|       2.0| 323.9365188742221| 0.07076164431219087|0.4640575800996781|
|  5.0|       5.0| 296.2475196653481| 0.35265225267831063|0.5659516656697704|
+-----+----------+------------------+--------------------+------------------+

