In [1]:
import os
# Bootstrap the PySpark shell inside this kernel; the banner printed below
# reports that a SparkSession then becomes available as `spark`.
# exec(compile(...)) runs the script in the current namespace just like
# execfile() did, but also works on Python 3, where execfile() was removed.
shell_path = os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.13 (default, Dec 20 2016 23:05:08)
SparkSession available as 'spark'.


In [3]:
import csv
# Load salient two-word phrases. Each usable row is read as "<first>_<second>,<score>";
# a phrase is kept only when its score reaches MIN_SALIENCE.
MIN_SALIENCE = 0.8
phrases = []
with open('/Users/lakerwayne/Desktop/YelpChallenge/American_new_salient.csv', 'rb') as csvfile:
    # Parse on commas directly instead of splitting on spaces and then
    # re-splitting the first field on ',' by hand.
    wordsReader = csv.reader(csvfile)
    for row in wordsReader:
        if len(row) < 2:
            # Blank or single-field line: nothing to score.
            continue
        try:
            # The score arrives as text; comparing the raw string with 0.8 is
            # always True on Python 2, so it must be converted before testing.
            score = float(row[1])
        except ValueError:
            # Non-numeric score (e.g. a header row): not a phrase entry.
            continue
        if not score >= MIN_SALIENCE:
            continue
        words = row[0].split('_')
        if len(words) < 2:
            # Not a "<first>_<second>" phrase.
            continue
        # One single-pair dict per phrase: {first_word: second_word}.
        phrases.append({words[0]: words[1]})

In [6]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf

# Read the reviews file (one review per line) and pair every line with its
# 0-based position, which serves as the review id.
cuisine_path = "/Users/lakerwayne/Desktop/YelpChallenge/cuisines/review_American_new.txt"

with open(cuisine_path, 'r') as txtfile:
    reviews = list(enumerate(txtfile))

# Two-column DataFrame (id, review), then split each review into a `words` array.
sentenceDataFrame = spark.createDataFrame(reviews, schema=["id", "review"])
tokenizer = Tokenizer(inputCol="review", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.show(n=5)


+---+--------------------+--------------------+
| id|              review|               words|
+---+--------------------+--------------------+
|  0|I was new to Toro...|[i, was, new, to,...|
|  1|I have a confessi...|[i, have, a, conf...|
|  2|If I would rather...|[if, i, would, ra...|
|  3|Once in a while t...|[once, in, a, whi...|
|  4|We can't stop her...|[we, can't, stop,...|
+---+--------------------+--------------------+
only showing top 5 rows



In [7]:
from pyspark.ml.feature import StopWordsRemover

# Remove stop words from `words`, writing the survivors to a new `filtered` column.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
# Drop any `filtered` column left over from a previous run first (a no-op when
# the column is absent) so that re-executing this cell does not fail because
# the output column already exists.
tokenized = remover.transform(tokenized.drop("filtered"))

In [8]:
# Preview the first five rows, now including the `filtered` column.
tokenized.show(n=5)

+---+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|
+---+--------------------+--------------------+--------------------+
|  0|I was new to Toro...|[i, was, new, to,...|[new, toronto, vi...|
|  1|I have a confessi...|[i, have, a, conf...|[confession, make...|
|  2|If I would rather...|[if, i, would, ra...|[would, rather, r...|
|  3|Once in a while t...|[once, in, a, whi...|[girls, feel, nee...|
|  4|We can't stop her...|[we, can't, stop,...|[can't, stop, her...|
+---+--------------------+--------------------+--------------------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import struct
from pyspark.sql.types import *
import re

def cleanup_text(record):
    """Strip non-alphanumeric characters from a row's stop-word-filtered tokens.

    Args:
        record: Row-like sequence whose index 3 holds the token list (the
            ``filtered`` column when the struct is built from
            ``tokenized.columns``).

    Returns:
        list: Each token with every character outside ``[a-zA-Z0-9]`` removed.
        Tokens that end up empty (pure punctuation, or already empty) are
        dropped so they cannot sit between two words of a phrase. A missing
        or empty token list yields ``[]``.
    """
    text = record[3]
    if not text:
        return []
    stripped = (re.sub('[^a-zA-Z0-9]', '', word) for word in text)
    return [word for word in stripped if word]

# Wrap cleanup_text as a Spark UDF that returns an array of strings.
udf_cleantext = udf(cleanup_text, ArrayType(StringType()))
# cleanup_text indexes into the row by position, so hand it every column of
# `tokenized` packed into a single struct, in the DataFrame's column order.
row_struct = struct([tokenized[name] for name in tokenized.columns])
clean_text = tokenized.withColumn("results", udf_cleantext(row_struct))

In [10]:
# Preview the first five rows with the cleaned tokens in `results`.
clean_text.show(n=5)

+---+--------------------+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|             results|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|I was new to Toro...|[i, was, new, to,...|[new, toronto, vi...|[new, toronto, vi...|
|  1|I have a confessi...|[i, have, a, conf...|[confession, make...|[confession, make...|
|  2|If I would rather...|[if, i, would, ra...|[would, rather, r...|[would, rather, r...|
|  3|Once in a while t...|[once, in, a, whi...|[girls, feel, nee...|[girls, feel, nee...|
|  4|We can't stop her...|[we, can't, stop,...|[can't, stop, her...|[cant, stop, here...|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Flatten the list of single-pair dicts into one {first_word: second_word} lookup.
# NOTE: when several phrases share the same first word, only the last one
# loaded survives, because the first word is the dictionary key.
nphrases = {}
for s in phrases:
    # dict.update copies the pair without indexing keys()/values(), which are
    # not subscriptable on Python 3.
    nphrases.update(s)

In [12]:
# Alias the cleaned DataFrame as `merge` for the phrase-joining step and preview it.
merge = clean_text
merge.show(n=5)

+---+--------------------+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|             results|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|I was new to Toro...|[i, was, new, to,...|[new, toronto, vi...|[new, toronto, vi...|
|  1|I have a confessi...|[i, have, a, conf...|[confession, make...|[confession, make...|
|  2|If I would rather...|[if, i, would, ra...|[would, rather, r...|[would, rather, r...|
|  3|Once in a while t...|[once, in, a, whi...|[girls, feel, nee...|[girls, feel, nee...|
|  4|We can't stop her...|[we, can't, stop,...|[can't, stop, her...|[cant, stop, here...|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [13]:
def words2one(text):
    """Join adjacent tokens that form a known phrase into one ``first_second`` token.

    Reads the module-level ``nphrases`` mapping ({first_word: second_word}).
    A token is glued onto the previous output token with ``"_"`` whenever the
    token that precedes it in the input maps to it in ``nphrases``. Overlapping
    phrases therefore chain: with ``a -> b`` and ``b -> c``, ``[a, b, c]``
    becomes ``[a_b_c]``.

    Args:
        text: Row-like sequence whose index 4 holds the token list (the
            ``results`` column when the struct is built from ``merge.columns``).

    Returns:
        list: The tokens with phrases joined, or ``[]`` when no phrase was
        found (including an empty or missing token list). The input list is
        never modified.
    """
    list_of_words = text[4] or []
    results = []
    num_of_change = 0
    for i, w in enumerate(list_of_words):
        # Look back one token: does the previous word start a phrase that
        # this word completes?
        prev = list_of_words[i - 1] if i > 0 else None
        if i > 0 and prev in nphrases and nphrases[prev] == w:
            # Extend the last output token in place of appending a new one;
            # building a fresh list avoids mutating the caller's tokens and
            # re-slicing the list on every merge.
            results[-1] = results[-1] + "_" + w
            num_of_change += 1
        else:
            results.append(w)
    if num_of_change == 0:
        return []
    return results

# Wrap words2one as a Spark UDF that returns an array of strings.
udf_convert = udf(words2one, ArrayType(StringType()))
# words2one indexes into the row by position, so pass it all columns of
# `merge` packed into one struct, in the DataFrame's column order.
merge_struct = struct([merge[name] for name in merge.columns])
ctext = merge.withColumn("converted", udf_convert(merge_struct))
ctext.select("converted").show(n=5)

+--------------------+
|           converted|
+--------------------+
|[new, toronto, vi...|
|                  []|
|                  []|
|                  []|
|[cant, stop, here...|
+--------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.feature import Word2Vec

# Fit 100-dimensional Word2Vec embeddings on the phrase-joined tokens,
# training over 10 partitions.
word2Vec = Word2Vec(vectorSize=100, minCount=0, numPartitions=10,
                    inputCol="converted", outputCol="vectors")
model = word2Vec.fit(ctext.select("converted"))

In [14]:
# Rebuild the "first_second" token for every phrase in the lookup, matching
# the form that words2one produces.
list_of_phrases = [first + "_" + second for first, second in nphrases.items()]

In [36]:
# Show the column names of the learned word-vector table. The parenthesised
# form prints the same on Python 2 and is also valid on Python 3.
print(model.getVectors().columns)

['word', 'vector']


In [None]:
# Keep only the vectors whose word is one of the known phrases, then write
# them out as CSV via pandas.
phrase_vectors = model.getVectors().where(col("word").isin(list_of_phrases))
phrase_vectors.toPandas().to_csv('cuisine_sim.csv')