In [1]:
import os
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.13 (default, Dec 20 2016 23:05:08)
SparkSession available as 'spark'.


In [2]:
import pickle
path = "/Users/lakerwayne/Desktop/YelpChallenge"
phrases = pickle.load(open(path + '/word2one.pickle', 'rb'))

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf

cuisine_path = path + "/cuisines/review_Italian.txt"
reviews = []

with open(cuisine_path, 'r') as txtfile:
    rid = 0
    for line in txtfile.readlines():
        review = tuple([rid, line])
        reviews.append(review)
        rid += 1

sentenceDataFrame = spark.createDataFrame(reviews, ["id", "review"])
tokenizer = Tokenizer(inputCol="review", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.show(5)


+---+--------------------+--------------------+
| id|              review|               words|
+---+--------------------+--------------------+
|  0|Pallucci is a qua...|[pallucci, is, a,...|
|  1|Came for dinner, ...|[came, for, dinne...|
|  2|What a gem!!  My ...|[what, a, gem!!, ...|
|  3|3.5 stars! I went...|[3.5, stars!, i, ...|
|  4|Small, yet cozy r...|[small,, yet, coz...|
+---+--------------------+--------------------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
tokenized = remover.transform(tokenized)

In [5]:
tokenized.show(5)

+---+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|
+---+--------------------+--------------------+--------------------+
|  0|Pallucci is a qua...|[pallucci, is, a,...|[pallucci, quaint...|
|  1|Came for dinner, ...|[came, for, dinne...|[came, dinner,, $...|
|  2|What a gem!!  My ...|[what, a, gem!!, ...|[gem!!, , husband...|
|  3|3.5 stars! I went...|[3.5, stars!, i, ...|[3.5, stars!, wen...|
|  4|Small, yet cozy r...|[small,, yet, coz...|[small,, yet, coz...|
+---+--------------------+--------------------+--------------------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import struct
from pyspark.sql.types import *
import re

def cleanup_text(record):
    text = record[3]
    text_out = [re.sub('[^a-zA-Z0-9]','',word) for word in text]
    return text_out

# define udf with an array of tokenized words
udf_cleantext = udf(cleanup_text , ArrayType(StringType()))
clean_text = tokenized.withColumn("results", udf_cleantext(struct([tokenized[x] for x in tokenized.columns])))

In [7]:
clean_text.show(5)

+---+--------------------+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|             results|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|Pallucci is a qua...|[pallucci, is, a,...|[pallucci, quaint...|[pallucci, quaint...|
|  1|Came for dinner, ...|[came, for, dinne...|[came, dinner,, $...|[came, dinner, 70...|
|  2|What a gem!!  My ...|[what, a, gem!!, ...|[gem!!, , husband...|[gem, , husband, ...|
|  3|3.5 stars! I went...|[3.5, stars!, i, ...|[3.5, stars!, wen...|[35, stars, went,...|
|  4|Small, yet cozy r...|[small,, yet, coz...|[small,, yet, coz...|[small, yet, cozy...|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [8]:
# properly convert phrases to proper format
nphrases = {}
for s in phrases:
    nphrases[s.keys()[0]] = s.values()[0]

In [9]:
merge = clean_text
merge.show(5)

+---+--------------------+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|             results|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|Pallucci is a qua...|[pallucci, is, a,...|[pallucci, quaint...|[pallucci, quaint...|
|  1|Came for dinner, ...|[came, for, dinne...|[came, dinner,, $...|[came, dinner, 70...|
|  2|What a gem!!  My ...|[what, a, gem!!, ...|[gem!!, , husband...|[gem, , husband, ...|
|  3|3.5 stars! I went...|[3.5, stars!, i, ...|[3.5, stars!, wen...|[35, stars, went,...|
|  4|Small, yet cozy r...|[small,, yet, coz...|[small,, yet, coz...|[small, yet, cozy...|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [10]:
def words2one(text):
    list_of_words = text[4]
    results = list_of_words
    idx = 0
    num_of_change=0
    for i, w in enumerate(list_of_words):
        idx = i-num_of_change
        if i==len(list_of_words)-1:
                continue
        elif w in nphrases and nphrases[w]==list_of_words[i+1]:
            results[idx] = results[idx]+"_"+results[idx+1]
            results = results[:idx+1] + results[idx+2:]
            num_of_change += 1
    if num_of_change==0:
        return []
    return results

udf_convert = udf(words2one, ArrayType(StringType()))
ctext = merge.withColumn("converted", udf_convert(struct([merge[y] for y in merge.columns])))
ctext.select("converted").show(5)

+--------------------+
|           converted|
+--------------------+
|[pallucci, quaint...|
|                  []|
|[gem, , husband, ...|
|[35, stars, went,...|
|[small, yet, cozy...|
+--------------------+
only showing top 5 rows



In [11]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="converted", outputCol="vectors")
model = word2Vec.setNumPartitions(10).fit(ctext.select("converted"))

In [12]:
list_of_phrases = []
for s in nphrases.items():
    list_of_phrases.append(s[0] + "_" + s[1])

In [13]:
print model.getVectors().columns

['word', 'vector']


In [14]:
domain = model.getVectors().where(col("word").isin(list_of_phrases))
domain.show(5)

+--------------+--------------------+
|          word|              vector|
+--------------+--------------------+
|    thats_fine|[-0.8186696171760...|
|      long_for|[0.02426917850971...|
|elbow_macaroni|[0.21944025158882...|
|  pickle_chips|[0.68999654054641...|
|       ging_es|[1.95766067504882...|
+--------------+--------------------+
only showing top 5 rows



In [None]:
def _round(text):
    vectors = text[1]
    for v in enumerate(vectors):
        v = round(v,5)
    return vectors

rdomain = domain
udf_round = udf(_round, ArrayType(StringType()))
rdomain = rdomain.withColumn("round", udf_round(struct([rdomain[z] for z in rdomain.columns])))
rdomain.show(5)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
domain.cache()
kmeans = KMeans(featuresCol="vector", predictionCol="prediction", k=3, seed=50)
kmodel = kmeans.fit(domain)
# #kmeans = KMeans().setK(3).setSeed(50)
# #model = kmeans.fit(domain.select('vector'))

In [17]:
#model.getVectors().where(col("word").isin(list_of_phrases)).toPandas().to_csv('cuisine_sim.csv')