In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425350 sha256=9cce40fd41535eb052356265ee8fa84a51366fb18fc41abb14738267a037cf43
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [56]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.types import IntegerType, FloatType, StringType
from pyspark.sql.functions import (
    col, udf, count, length, when, isnan, split, size,
    array_min, array_max, array_distinct, to_timestamp,
    lower, regexp_replace, concat_ws
)

import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
spark = SparkSession \
    .builder \
    .appName("Yelp Review Sense") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/13 21:08:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [58]:
column_order = [
    "review_id",
    "text",
]

master = spark.read.json("master.json")
df = master.select(column_order)
df.show(5, 80)



+----------------------+--------------------------------------------------------------------------------+
|             review_id|                                                                            text|
+----------------------+--------------------------------------------------------------------------------+
|KU_O5udG6zpxOg-VcAEodg|If you decide to eat here, just be aware it is going to take about 2 hours fr...|
|BiTunyQ73aT9WBnpR9DZGw|I've taken a lot of spin classes over the years, and nothing compares to the ...|
|saUsX_uimxRlCVr67Z4Jig|Family diner. Had the buffet. Eclectic assortment: a large chicken leg, fried...|
|AqPFMleE6RsU23_auESxiA|Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and korm...|
|Sx8TMOWLNuJBWer-0pcmoA|Cute interior and owner (?) gave us tour of upcoming patio/rooftop area which...|
+----------------------+--------------------------------------------------------------------------------+
only showing top 5 rows



                                                                                

## Text Cleaning and Preprocessing

Convert text to lowercase

In [38]:
df = df.withColumn("cleaned_text", lower(col("text")))
df.show(10, 60)

+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|             review_id|                                                        text|                                                cleaned_text|
+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|KU_O5udG6zpxOg-VcAEodg|If you decide to eat here, just be aware it is going to t...|if you decide to eat here, just be aware it is going to t...|
|BiTunyQ73aT9WBnpR9DZGw|I've taken a lot of spin classes over the years, and noth...|i've taken a lot of spin classes over the years, and noth...|
|saUsX_uimxRlCVr67Z4Jig|Family diner. Had the buffet. Eclectic assortment: a larg...|family diner. had the buffet. eclectic assortment: a larg...|
|AqPFMleE6RsU23_auESxiA|Wow!  Yummy, different,  delicious.   Our favorite is the...|wow!  yummy, different,  deliciou

Remove special characters and punctuation

In [39]:
df = df.withColumn("cleaned_text", regexp_replace(col("cleaned_text"), '[' + re.escape(string.punctuation) + ']', ''))
df.show(10, 60)

+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|             review_id|                                                        text|                                                cleaned_text|
+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|KU_O5udG6zpxOg-VcAEodg|If you decide to eat here, just be aware it is going to t...|if you decide to eat here just be aware it is going to ta...|
|BiTunyQ73aT9WBnpR9DZGw|I've taken a lot of spin classes over the years, and noth...|ive taken a lot of spin classes over the years and nothin...|
|saUsX_uimxRlCVr67Z4Jig|Family diner. Had the buffet. Eclectic assortment: a larg...|family diner had the buffet eclectic assortment a large c...|
|AqPFMleE6RsU23_auESxiA|Wow!  Yummy, different,  delicious.   Our favorite is the...|wow  yummy different  delicious  

Remove stopwords

In [40]:
# Tokenize the 'cleaned_text' column into a new 'tokens' column
df = Tokenizer(inputCol="cleaned_text", outputCol="tokens").transform(df)

# Remove stopwords
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
df = remover.transform(df)

# Join the 'filtered_tokens' back into a text, overwriting the 'cleaned_text' column
df = df.withColumn("cleaned_text", concat_ws(" ", col("filtered_tokens")))
df = df.drop("tokens", "filtered_tokens")

df.show(10, 60)

+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|             review_id|                                                        text|                                                cleaned_text|
+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|KU_O5udG6zpxOg-VcAEodg|If you decide to eat here, just be aware it is going to t...|decide eat aware going take 2 hours beginning end tried m...|
|BiTunyQ73aT9WBnpR9DZGw|I've taken a lot of spin classes over the years, and noth...|ive taken lot spin classes years nothing compares classes...|
|saUsX_uimxRlCVr67Z4Jig|Family diner. Had the buffet. Eclectic assortment: a larg...|family diner buffet eclectic assortment large chicken leg...|
|AqPFMleE6RsU23_auESxiA|Wow!  Yummy, different,  delicious.   Our favorite is the...|wow  yummy different  delicious  

Remove emojis and emoticons

In [46]:
emoji_pattern = r'[' \
                r'\\U0001F600-\\U0001F64F' \
                r'\\U0001F300-\\U0001F5FF' \
                r'\\U0001F680-\\U0001F6FF' \
                r'\\U0001F700-\\U0001F77F' \
                r'\\U0001F780-\\U0001F7FF' \
                r'\\U0001F800-\\U0001F8FF' \
                r'\\U0001F900-\\U0001F9FF' \
                r'\\U0001FA00-\\U0001FA6F' \
                r'\\U0001FA70-\\U0001FAFF' \
                r'\\U0001FB00-\\U0001FBFF' \
                r'\\U0001FC00-\\U0001FCFF' \
                r'\\U0001FD00-\\U0001FDFF' \
                r'\\U0001FE00-\\U0001FEFF' \
                r'\\U0001FF00-\\U0001FFFF' \
                r']'

emoticon_pattern = r'[:;][-\'\)\(\]\[dDpPoO/\\|33*]+|<3'

df = df.withColumn("cleaned_text", regexp_replace(df["cleaned_text"], emoji_pattern, ""))
df = df.withColumn("cleaned_text", regexp_replace(df["cleaned_text"], emoticon_pattern, ""))

df.show(10, 60)

+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|             review_id|                                                        text|                                                cleaned_text|
+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|KU_O5udG6zpxOg-VcAEodg|If you decide to eat here, just be aware it is going to t...|decide eat aware going take  hours beginning end tried mu...|
|BiTunyQ73aT9WBnpR9DZGw|I've taken a lot of spin classes over the years, and noth...|ive taken lot spin classes years nothing compares classes...|
|saUsX_uimxRlCVr67Z4Jig|Family diner. Had the buffet. Eclectic assortment: a larg...|family diner buffet eclectic assortment large chicken leg...|
|AqPFMleE6RsU23_auESxiA|Wow!  Yummy, different,  delicious.   Our favorite is the...|wow  yummy different  delicious  

Remove URLs and HTML Tags

In [48]:
html_pattern = r"<.*?>"
url_pattern = r"https?://\S+|www\.\S+"

df = df.withColumn("cleaned_text", regexp_replace(df["cleaned_text"], html_pattern, ""))
df = df.withColumn("cleaned_text", regexp_replace(df["cleaned_text"], url_pattern, ""))

df.show(10, 60)

+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|             review_id|                                                        text|                                                cleaned_text|
+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|KU_O5udG6zpxOg-VcAEodg|If you decide to eat here, just be aware it is going to t...|decide eat aware going take  hours beginning end tried mu...|
|BiTunyQ73aT9WBnpR9DZGw|I've taken a lot of spin classes over the years, and noth...|ive taken lot spin classes years nothing compares classes...|
|saUsX_uimxRlCVr67Z4Jig|Family diner. Had the buffet. Eclectic assortment: a larg...|family diner buffet eclectic assortment large chicken leg...|
|AqPFMleE6RsU23_auESxiA|Wow!  Yummy, different,  delicious.   Our favorite is the...|wow  yummy different  delicious  

Limit consecutive whitespaces to 1

In [52]:
df = df.withColumn("cleaned_text", regexp_replace(df["cleaned_text"], r'\s+', ' '))
df.show(10, 60)

+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|             review_id|                                                        text|                                                cleaned_text|
+----------------------+------------------------------------------------------------+------------------------------------------------------------+
|KU_O5udG6zpxOg-VcAEodg|If you decide to eat here, just be aware it is going to t...|decide eat aware going take hours beginning end tried mul...|
|BiTunyQ73aT9WBnpR9DZGw|I've taken a lot of spin classes over the years, and noth...|ive taken lot spin classes years nothing compares classes...|
|saUsX_uimxRlCVr67Z4Jig|Family diner. Had the buffet. Eclectic assortment: a larg...|family diner buffet eclectic assortment large chicken leg...|
|AqPFMleE6RsU23_auESxiA|Wow!  Yummy, different,  delicious.   Our favorite is the...|wow yummy different delicious fav

In [59]:
columns_to_select = [
    "review_id",
    "cleaned_text",
]

df = df.select(columns_to_select)
master = master.join(df, on = 'review_id', how = 'inner')