In [11]:
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, udf, explode, monotonically_increasing_id

In [2]:
# create session
spark = SparkSession.builder \
    .master("local") \
    .appName("SPADE") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "6g") \
    .config("spark.driver.memory", "6g") \
    .getOrCreate()

# create context
sc = spark.sparkContext

## Read

In [3]:
review_data = spark.read.text('reviews_sample.txt')
review_data.show(5)

+--------------------+
|               value|
+--------------------+
|hoagie institutio...|
|excellent food su...|
|yes place little ...|
|food great best t...|
|checked place pas...|
+--------------------+
only showing top 5 rows



In [5]:
# split into list of words and add sid column

review_split = review_data.withColumn('value', split(review_data.value, ' '))\
.select('value', monotonically_increasing_id().alias('sid'))
review_split.show()

+--------------------+---+
|               value|sid|
+--------------------+---+
|[hoagie, institut...|  0|
|[excellent, food,...|  1|
|[yes, place, litt...|  2|
|[food, great, bes...|  3|
|[checked, place, ...|  4|
|[wing, sauce, lik...|  5|
|[cold, cheap, bee...|  6|
|[highly, recommen...|  7|
|[big, believer, f...|  8|
|[decent, range, s...|  9|
|[owning, driving,...| 10|
|[place, absolute,...| 11|
|[finally, made, r...| 12|
|[drove, yesterday...| 13|
|[thank, rob, trul...| 14|
|[waiting, almost,...| 15|
|[visited, store, ...| 16|
|[fianc, upgraded,...| 17|
|[waited, min, peo...| 18|
|[place, delicious...| 19|
+--------------------+---+
only showing top 20 rows



In [25]:
# create sid index

cols = ('value', 'sid')
review_explode = review_split.select(*cols, explode('value').alias('word') )
review_explode.select(*tuple(review_explode.columns)).show()

+--------------------+---+-----------+
|               value|sid|       word|
+--------------------+---+-----------+
|[hoagie, institut...|  0|     hoagie|
|[hoagie, institut...|  0|institution|
|[hoagie, institut...|  0|    walking|
|[hoagie, institut...|  0|        doe|
|[hoagie, institut...|  0|       seem|
|[hoagie, institut...|  0|       like|
|[hoagie, institut...|  0|  throwback|
|[hoagie, institut...|  0|       year|
|[hoagie, institut...|  0|        ago|
|[hoagie, institut...|  0|        old|
|[hoagie, institut...|  0|  fashioned|
|[hoagie, institut...|  0|       menu|
|[hoagie, institut...|  0|      board|
|[hoagie, institut...|  0|      booth|
|[hoagie, institut...|  0|      large|
|[hoagie, institut...|  0|  selection|
|[hoagie, institut...|  0|       food|
|[hoagie, institut...|  0| speciality|
|[hoagie, institut...|  0|    italian|
|[hoagie, institut...|  0|     hoagie|
+--------------------+---+-----------+
only showing top 20 rows



In [26]:
# create eid by using window
from pyspark.sql.functions import row_number
from pyspark.sql import Window

w = Window.partitionBy("value").orderBy("word")
review_enumerate = review_explode.withColumn("value", row_number().over(w))
review_enumerate.show()

+-----+----+----------+
|value| sid|      word|
+-----+----+----------+
|    1|5776|  adequate|
|    2|5776|      also|
|    3|5776|      area|
|    4|5776|       ask|
|    5|5776|      back|
|    6|5776| beautiful|
|    7|5776|      best|
|    8|5776|      best|
|    9|5776|      best|
|   10|5776|    blonde|
|   11|5776|    brassy|
|   12|5776|    bumble|
|   13|5776|    bumble|
|   14|5776|      came|
|   15|5776|      came|
|   16|5776|     could|
|   17|5776|  customer|
|   18|5776|     decor|
|   19|5776|definetely|
|   20|5776|definetely|
+-----+----+----------+
only showing top 20 rows



In [24]:
# check
data = pd.read_table('reviews_sample.txt', header=None)
np.sort(data[0][5776].split(' '))

array(['adequate', 'also', 'area', 'ask', 'back', 'beautiful', 'best',
       'best', 'best', 'blonde', 'brassy', 'bumble', 'bumble', 'came',
       'came', 'could', 'customer', 'decor', 'definetely', 'definetely',
       'desired', 'desk', 'experience', 'experience', 'experience',
       'fixed', 'front', 'good', 'good', 'great', 'hair', 'handled',
       'high', 'instead', 'jim', 'job', 'kerastase', 'large', 'league',
       'like', 'like', 'little', 'mixed', 'much', 'new', 'new',
       'nicolette', 'one', 'orange', 'others', 'overall', 'owner', 'par',
       'people', 'people', 'poorly', 'price', 'product', 'push', 'result',
       'service', 'something', 'space', 'sub', 'thing', 'think',
       'treatment', 'tried', 'value', 'year', 'york'], dtype='<U10')

In [32]:
# group by word and cut length
from pyspark.sql.functions import countDistinct, lit

def to_list(x,y):
    return list(x,y)

to_list_udf = udf(to_list)

review_enumerate.groupby('word').apply()

ValueError: Invalid udf: the udf argument must be a pandas_udf of type GROUPED_MAP.

In [None]:
spark.stop()
sc.stop()