# LDAS: Project - Team 2
Moritz Eck, moritz.eck.0055@student.uu.se<br>
Tyson McLeod, <br>
Isaline Baret, <br>
Markella-Achilleia Zacharouli, <br>

## Setup & Deploy

In [1]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import time

In [2]:
# start your application with dynamic allocation enabled, a timeout of no more than 30 seconds and a cap on CPU cores:
# REMOTE SESSION
# spark = SparkSession\
#        .builder\
#        .master("spark://192.168.1.153:7077") \
#        .appName("LDSA_Team2_Project")\
#        .config("spark.dynamicAllocation.enabled", True)\
#        .config("spark.shuffle.service.enabled", True)\
#        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
#        .config("spark.dynamicAllocation.maxExecutors", 8)\
# #.config("spark.executor.instances", 8)\ # set this to 1 if you want to compare remote execution with local.
#        .config("spark.executor.cores",8)\
#        .config('spark.executor.memory', "8g")\
#        .config("spark.driver.cores", 2)\
#        .config("spark.driver.memory", "2g")\
#        .config("spark.executor.heartbeatInterval","5s")\
#        .getOrCreate()

# LOCAL SESSION
spark = SparkSession\
    .builder.master("local[4]")\
    .appName("LDSA_Team2_Project")\
    .config("spark.executor.instances", 1)\
    .config("spark.executor.cores",4)\
    .config('spark.executor.memory', "8g")\
    .config("spark.driver.cores", 2)\
    .config("spark.driver.memory", "2g")\
    .config("spark.executor.heartbeatInterval","5s")\
    .getOrCreate()

# spark context (old RDD)
sc = spark.sparkContext

In [3]:
sc.setLogLevel("INFO")
LOCAL = True # TODO: select if the experiment is run remote or local

# filepaths
REMOTE_HDFS_PATH = "hdfs://192.168.1.153:9000/team02/input/"
LOCAL_PATH = "../"

# filenames
business_fn = "yelp_academic_dataset_business.json"
users_fn = "yelp_academic_dataset_user.json"
reviews_fn = "yelp_academic_dataset_review.json"

# create the filepaths (remote or local)
if LOCAL:
    business_fp = LOCAL_PATH + business_fn
    users_fp = LOCAL_PATH + users_fn
    reviews_fp = LOCAL_PATH + reviews_fn
else:
    business_fp = REMOTE_HDFS_PATH + business_fn
    users_fp = REMOTE_HDFS_PATH + users_fn
    reviews_fp = REMOTE_HDFS_PATH + reviews_fn

## Experiment 1: Business Data

### Load and Preprocess Data

In [14]:
# read JSON file into PySpark dataframe
business = spark.read.json(business_fp)

# the inferred schema can be visualized using the printSchema() method
#business.printSchema()

# show top 5 rows
#business.show(5)

In [15]:
# the number of rows
#print("Rows in Business Dataframe:\t", business.count())

# the number of RDD partitions
#print("Number of Partitions:\t\t", business.rdd.getNumPartitions())

### Experiment 1: Sort all business according to stars and review_count

In [16]:
times = []
for i in range(10):
    # top businesses according to starts and review_count
    start_time = time.time()
    filtered = business.filter(business.stars >= 4.0).sort("stars", "review_count", ascending=[0,0]).head(10)
    end_time = time.time()

    for row in filtered:
        name, stars, rc = row["name"], row["stars"], row["review_count"]
        #print("Name:\t{},\tStars:\t{},\tReview Count:\t{}".format(name[:12], stars, rc))
        
    delta = end_time - start_time
    times.append(delta)
    print("\nThe evaluation took: {} seconds".format(delta))
    
    # remove the storage and space
    del filtered, delta
    
print("Average Time Taken: {}".format(sum(times)/len(times)))


The evaluation took: 1.9836606979370117 seconds

The evaluation took: 0.8734118938446045 seconds

The evaluation took: 0.9278416633605957 seconds

The evaluation took: 0.9600551128387451 seconds

The evaluation took: 0.9410321712493896 seconds

The evaluation took: 0.9506406784057617 seconds

The evaluation took: 0.8186242580413818 seconds

The evaluation took: 0.9506490230560303 seconds

The evaluation took: 0.7654187679290771 seconds

The evaluation took: 0.7846441268920898 seconds
Average Time Taken: 0.9955978393554688


In [7]:
# removes the business dataframe (only do it if you need space)
del business 

## Experiment 2: User Data

### Load User Data & Preprocess

In [17]:
# LOCAL 
# read JSON file
users = spark.read.json(users_fp)

# the inferred schema can be visualized using the printSchema() method
#users.printSchema()

# the number of rows
#print("Rows in Users Dataframe:\t", users.count())

# the number of RDD partitions
#print("Number of Partitions:\t\t", users.rdd.getNumPartitions())

In [18]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, IntegerType

# use udf to define a row-at-a-time udf
def count_friends(line):
    # lowercase transformation
    # splitting into tokens/words
    return len(line.lower().split(', '))

# count the number of friends per user and add the value as a new column
count_friends = udf(count_friends, IntegerType())

### Experiment 2: Sorting dataset according "review_count", "useful", "fans" and couting the number of friends per reviewer and sorting according to the number.

In [19]:
times = []
for i in range(5):
    start_time = time.time()

    # sort according to reviewers with most reviews
    top_reviewers = users.sort("review_count", ascending=False).head(20)

    # sort according to reviewers with most useful reviews
    top_useful_reviews = users.sort("useful", "review_count", ascending=[0,0]).head(20)

    # sort according to reviewers with most fans
    top_fan_count = users.sort("fans", "useful", ascending=[0,0]).head(20)

    # count the number of friends per reviewer
    modified = users.withColumn("friendsCount", count_friends(col("friends")))

    # sort according to reviewers with most friends and then fans
    top_friends = modified.sort("friendsCount", "fans", ascending=[0,0]).head(20)

    end_time = time.time()
    delta = end_time - start_time
    times.append(delta)

    #print("Top 5 Reviewers by Review Count!")
    for row in top_reviewers[:5]:
        name, since, rc = row["name"], row["yelping_since"], row["review_count"]
        #print("Name:\t{}\tReview Count:\t{}\tYelping Since:\t{}".format(name, rc, since))

    #print("\nTop 5 Most Useful Reviews by Reviewer!")
    for row in top_useful_reviews[:5]:
        name, since, rc, useful = row["name"], row["yelping_since"], row["review_count"], row["useful"]
        #print("Name:\t{}\tUseful Reviews:\t{}\tReview Count:\t{}\tYelping Since:\t{}".format(name, useful, rc, since))

    #print("\nTop 5 Reviewers with most Fans!")
    for row in top_fan_count[:5]:
        name, since, rc, useful, fans = row["name"], row["yelping_since"], row["review_count"], row["useful"], row["fans"]
        #print("Name:\t{}\tFans:\t{}\tUseful Reviews:\t{}\tReview Count:\t{}\tYelping Since:\t{}".format(name, fans, useful, rc, since))

    #print("\nTop 5 Reviewers with most Friends!")
    for row in top_friends[:5]:
        name, since, fc, fans = row["name"], row["yelping_since"], row["friendsCount"], row["fans"]
        #print("Name:\t{}\tFriends:\t{}\tFans:\t{}\tYelping Since:\t{}".format(name, fans, fc, since))
    
    print("\nThe evaluation took: {:3.3f} seconds".format(end_time - start_time))
    del top_reviewers, top_useful_reviews, top_fan_count, modified, top_friends, delta
    
print("Average Time: {}".format(sum(times)/len(times)))


The evaluation took: 36.356 seconds

The evaluation took: 33.524 seconds

The evaluation took: 33.138 seconds

The evaluation took: 33.033 seconds

The evaluation took: 33.494 seconds
Average Time: 33.908977794647214


## Experiment 3: Reviews

### Load User Data & Preprocess

In [20]:
# LOCAL 
# read JSON file
reviews = spark.read.json(reviews_fp)

# the inferred schema can be visualized using the printSchema() method
reviews.printSchema()

# the number of rows
#rint("Rows in Reviews Dataframe:\t", reviews.count())

# the number of RDD partitions
#rint("Number of Partitions:\t\t", reviews.rdd.getNumPartitions())

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



### Experiment 3: Preprocessing of Reviews & Join with Businesses 

In [21]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import re    

# pre compile regex that match all non-ASCII characters
pattern = re.compile("\W")

# use udf to define a row-at-a-time udf
def preprocess(line):
    # lowercase transformation
    # splitting into tokens/words
    tokens = line.lower().split(' ')
    tokens = [pattern.sub("", token.strip()) for token in tokens]
    return str(tokens)

# tokenize preprocessing udf
tok = udf(preprocess, StringType())

times = []
for i in range(1):
    start_time = time.time()

    # preprocess reviews
    pr_reviews = reviews.withColumn("text", tok(col("text")))
    pr_reviews = pr_reviews.drop('user_id', 'review_id')
    pr_reviews = pr_reviews.withColumnRenamed("business_id", "bu_id")

    # print first three rows
    # pr_reviews.show(5, False)

    # join business and reviews
    merged = business.join(pr_reviews, business.business_id == pr_reviews.bu_id, 'left_outer')

    # drop duplicate column and filter out empty rows
    merged = merged.drop("bu_id").filter(merged.text.isNotNull())
    result = merged.select(["business_id", "categories", "text"]).head(20)

    end_time = time.time()
    delta = end_time - start_time
    times.append(delta)
    
    #print(result)

    print("\nThe evaluation took: {:3.3f} seconds\n".format(delta))
    del pr_reviews, merged, result, delta
    
print("Average Time: {}".format(sum(times)/len(times)))



The evaluation took: 3.760 seconds

Average Time: 3.759615421295166


### Experiment 4: Preprocessing of Reviews & creation of Ngrams for analysis

In [22]:
from pyspark.sql.functions import col, udf, regexp_replace, lower, split, size
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram
from nltk.stem.porter import *


# pre_process_text
def pre_process_text(x):
    x = lower(x)
    x = regexp_replace(x, "^rt ", "")
    x = regexp_replace(x, "[^a-zA-Z0-9\\s]", "")
    return x 


# stemmer function
def stem(input):
    output = []
    for x in input:
        stemmy = stemmer.stem(x)
        if(len(stemmy) > 2):
            output.append(stemmy)
    return output

start_time = time.time()


# pre_process_text df
pre_process_text_df = reviews.select(pre_process_text(col("text")).alias("text"))
pre_process_text_df.drop('user_id', 'review_id')


# tokenize text
tokenizer = Tokenizer(inputCol="text", outputCol="vector")
vector_df = tokenizer.transform(pre_process_text_df).select("vector")


# remove stopwords 
stopword_remover = StopWordsRemover()
stopwords = stopword_remover.getStopWords()
stopword_remover.setInputCol("vector")
stopword_remover.setOutputCol("text_without_stopwords")
text_without_stopwords_df = stopword_remover.transform(vector_df).select("text_without_stopwords")


# stem tokens: greatest should become great, stopping -> stop etc.
stemmer = PorterStemmer()


# udf for stemming 
udf_stemmy = udf(lambda x: stem(x), ArrayType(StringType()))


# new dataframe with vectors containing stemmed tokens
vector_with_stemmed_tokens = (
    text_without_stopwords_df
        .withColumn("text_stemmed", udf_stemmy("text_without_stopwords"))
        .select("text_stemmed")
    )

#vector_with_stemmed_tokens.printSchema()
#vector_with_stemmed_tokens.show(4, False)
# 12th row after a show(), walked around --> walk around after stem


# create ngrams 

# x = suitable number for reviews?
x = 3 

ngram = NGram(n=x, inputCol= "text_stemmed", outputCol="ngrams")
vector_with_stemmed_tokens= ngram.transform(vector_with_stemmed_tokens)

#vector_with_stemmed_tokens.printSchema()
#vector_with_stemmed_tokens.show(1, False)

ngrams_of_size_x = vector_with_stemmed_tokens.where(size(col("ngrams")) >= x)

#ngrams_of_size_x.printSchema()
#ngrams_of_size_x.show()

end_time = time.time()

print("\nThe evaluation took: {:3.3f} seconds".format(end_time - start_time))


The evaluation took: 0.173 seconds


## Shutdown

In [15]:
# release the cores for another application!
spark.stop()
sc.stop()