In [1]:
# Start PySpark inside this kernel by executing Spark's own shell start-up
# script located under $SPARK_HOME; the banner printed below reports that it
# makes a SparkSession available as `spark`.
# Raises KeyError when SPARK_HOME is not set in the environment.
# NOTE(review): `execfile` is a Python 2 builtin, so this cell needs a Python 2 kernel.
import os
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.13 (default, Dec 20 2016 23:05:08)
SparkSession available as 'spark'.


In [99]:
import csv
# Load the salient phrases for the cuisine.
#   phrases    -- phrases made of one or two '_'-joined words, in file order
#   phrase2frq -- the same phrases mapped to an occurrence counter starting at 0
phrases = []
phrase2frq = {}
cuisine_name = "Mexican"
with open('/Users/lakerwayne/Desktop/YelpChallenge/salient.csv', 'rb') as csvfile:
    # The reader splits on spaces, so row[0] is the text before the first
    # space; the phrase is the part of that text before the first comma.
    wordsReader = csv.reader(csvfile, delimiter=' ', quotechar='|')
    for row in wordsReader:
        if not row:
            # Blank line: csv.reader yields [], and row[0] would raise IndexError.
            continue
        stat = row[0].split(',')
        phrase = stat[0]
        if not phrase:
            # Line starting with a space or comma: no phrase to register.
            continue
        words = phrase.split('_')
        # Keep unigrams and bigrams only; longer phrases are ignored.
        if len(words) < 3:
            phrases.append(phrase)
            phrase2frq[phrase] = 0

In [102]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf

# Output folder for generated files and input file holding this cuisine's reviews.
folder_path = '/Users/lakerwayne/Desktop/YelpChallenge/spark/'
cuisine_path = "/Users/lakerwayne/Desktop/YelpChallenge/cuisines/review_" + cuisine_name + ".txt"

# Every line of the file is one review; pair it with a 0-based id.
with open(cuisine_path, 'r') as txtfile:
    reviews = list(enumerate(txtfile.readlines()))

# Two-column DataFrame (id, review), then add a "words" column with the
# tokens produced by Spark's Tokenizer.
sentenceDataFrame = spark.createDataFrame(reviews, ["id", "review"])
tokenizer = Tokenizer(inputCol="review", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)

# Preview the first rows.
tokenized.show(5)

+---+--------------------+--------------------+
| id|              review|               words|
+---+--------------------+--------------------+
|  0|I was desperately...|[i, was, desperat...|
|  1|This is a locally...|[this, is, a, loc...|
|  2|Maria was super n...|[maria, was, supe...|
|  3|Yes, sometimes th...|[yes,, sometimes,...|
|  4|The only reason t...|[the, only, reaso...|
+---+--------------------+--------------------+
only showing top 5 rows



In [103]:
from pyspark.ml.feature import StopWordsRemover

# Add a "filtered" column holding the "words" tokens after stop-word removal.
# NOTE(review): `tokenized` is rebound to its own transformed version, so the
# input of this cell is no longer available afterwards and running the cell a
# second time feeds it a frame that already has a "filtered" column.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
tokenized = remover.transform(tokenized)

In [104]:
# Preview the first five rows, now including the "filtered" column.
tokenized.show(5)

+---+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|
+---+--------------------+--------------------+--------------------+
|  0|I was desperately...|[i, was, desperat...|[desperately, nee...|
|  1|This is a locally...|[this, is, a, loc...|[locally, owned, ...|
|  2|Maria was super n...|[maria, was, supe...|[maria, super, ni...|
|  3|Yes, sometimes th...|[yes,, sometimes,...|[yes,, sometimes,...|
|  4|The only reason t...|[the, only, reaso...|[reason, even, be...|
+---+--------------------+--------------------+--------------------+
only showing top 5 rows



In [106]:
from pyspark.sql.functions import struct
from pyspark.sql.types import *
import re

def cleanup_text(record):
    """Strip every non-alphanumeric character from the tokens in ``record[3]``.

    Args:
        record: an indexable row whose element at index 3 is a list of string
            tokens. The caller in this notebook passes a struct of all
            DataFrame columns, where index 3 is the ``filtered`` column.

    Returns:
        A list with the cleaned tokens in their original order. Tokens that
        consist only of stripped characters (e.g. ``"--"``) are dropped rather
        than kept as empty strings, so they neither end up as a ``""`` word in
        downstream models nor sit between two words that should be adjacent.
        An empty list is returned when ``record[3]`` is ``None``.
    """
    tokens = record[3]
    if tokens is None:
        # A null array column reaches the UDF as None.
        return []
    stripped = (re.sub('[^a-zA-Z0-9]', '', word) for word in tokens)
    return [word for word in stripped if word]

# Bundle every column of `tokenized` into a single struct so the UDF receives
# the whole row and can pick the field it needs by position.
row_struct = struct([tokenized[x] for x in tokenized.columns])

# Expose cleanup_text as a Spark UDF returning array<string> and store its
# output in a new "results" column.
udf_cleantext = udf(cleanup_text, ArrayType(StringType()))
clean_text = tokenized.withColumn("results", udf_cleantext(row_struct))

In [107]:
# Preview the first five rows, now including the cleaned "results" column.
clean_text.show(5)

+---+--------------------+--------------------+--------------------+--------------------+
| id|              review|               words|            filtered|             results|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|I was desperately...|[i, was, desperat...|[desperately, nee...|[desperately, nee...|
|  1|This is a locally...|[this, is, a, loc...|[locally, owned, ...|[locally, owned, ...|
|  2|Maria was super n...|[maria, was, supe...|[maria, super, ni...|[maria, super, ni...|
|  3|Yes, sometimes th...|[yes,, sometimes,...|[yes,, sometimes,...|[yes, sometimes, ...|
|  4|The only reason t...|[the, only, reaso...|[reason, even, be...|[reason, even, be...|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [119]:
# `merge` is a second name for the same DataFrame as `clean_text` (no copy is
# made); the phrase-merging cell below reads its columns through this name.
merge = clean_text

In [120]:
def words2one(text, phrase_counts=None):
    """Merge adjacent tokens of ``text[4]`` into known ``a_b`` phrases.

    The tokens are scanned left to right. Whenever the token built so far,
    joined with the next token by ``"_"``, is a key of the phrase table, the
    two are merged into that phrase and its counter is incremented; the
    merged phrase is then itself tried against the following token.

    Args:
        text: an indexable row whose element at index 4 is the list of
            tokens. The caller in this notebook passes a struct of all
            columns, where index 4 is the ``results`` column.
        phrase_counts: optional dict mapping phrase -> count. Defaults to the
            notebook-level ``phrase2frq``.

    Returns:
        The token list with phrases merged, or ``[]`` when no merge happened
        (also for a missing or empty token list). The input list is never
        modified.

    NOTE(review): when this runs as a Spark UDF it executes in Python worker
    processes on a serialized copy of the dict, so the increments are not
    expected to show up in the driver's ``phrase2frq`` -- confirm before
    relying on those counts.
    """
    counts = phrase2frq if phrase_counts is None else phrase_counts
    tokens = text[4]
    if not tokens:
        return []

    merged = []
    current = tokens[0]
    num_of_change = 0
    for following in tokens[1:]:
        candidate = current + "_" + following
        if candidate in counts:
            counts[candidate] += 1
            # Keep extending the same slot; the next token is tried against it.
            current = candidate
            num_of_change += 1
        else:
            merged.append(current)
            current = following
    merged.append(current)

    # Rows without any phrase contribute nothing downstream.
    if num_of_change == 0:
        return []
    return merged

# Pass the whole row to the UDF as one struct; words2one reads the token
# list from it by position.
merged_row = struct([merge[y] for y in merge.columns])

# Run the phrase merger as a Spark UDF returning array<string> and keep its
# output in a new "converted" column.
udf_convert = udf(words2one, ArrayType(StringType()))
ctext = merge.withColumn("converted", udf_convert(merged_row))

# Preview the merged token lists.
ctext.select("converted").show(5)

+--------------------+
|           converted|
+--------------------+
|[desperately, nee...|
|[locally_owned, m...|
|[maria, super_nic...|
|[yes, sometimes, ...|
|[reason, even, be...|
+--------------------+
only showing top 5 rows



In [122]:
from pyspark.ml.feature import Word2Vec

# Word2Vec estimator: 100-dimensional vectors, no minimum token count,
# trained over 10 partitions, reading "converted" and writing "vectors".
word2Vec = Word2Vec(
    vectorSize=100,
    minCount=0,
    numPartitions=10,
    inputCol="converted",
    outputCol="vectors",
)

# Train on the phrase-merged token lists only.
training_sentences = ctext.select("converted")
model = word2Vec.fit(training_sentences)

In [130]:
# Keep only the learned vectors whose word is one of the salient phrases and
# write them to "<folder_path><cuisine_name>.csv" via pandas.
# (The original referenced the undefined name `cusine_name`, which raises
# NameError; the variable defined earlier is `cuisine_name`.)
phrase_vectors = model.getVectors().where(col("word").isin(phrases))
phrase_vectors.toPandas().to_csv(folder_path + cuisine_name + '.csv')

In [133]:
# Parse the similarity CSV into {phrase: [vector components rounded to 3 decimals]}.
round_phrases = {}
with open('/Users/lakerwayne/Desktop/YelpChallenge/spark/Mexican_sim.csv', 'rb') as csvfile:
    creader = csv.reader(csvfile, delimiter=' ', quotechar='|')
    for row_number, row in enumerate(creader):
        if row_number == 0:
            # The first row is skipped (header).
            continue
        # Split the first space-delimited field on commas: column 1 is the
        # phrase, columns 2.. are the vector components.
        cols = row[0].split(',')
        last = len(cols) - 1
        stats = []
        for i in range(2, len(cols)):
            raw = cols[i]
            if i == 2:
                # First component: drop its two leading characters.
                raw = raw[2:]
            elif i == last:
                # Last component: drop its two trailing characters.
                raw = raw[:-2]
            stats.append(round(float(raw) * 1000.0) / 1000)
        round_phrases[cols[1]] = stats

In [135]:
#from pyspark.mllib.clustering import KMeans
from numpy import array
# Split the phrase -> vector mapping into two parallel lists:
# rname[i] is a phrase and data[i] is its rounded vector.
rname = list(round_phrases.keys())
data = [round_phrases[p] for p in rname]

In [138]:
# Turn the list of vectors into a (number of phrases) x 100 numpy matrix.
# NOTE(review): 100 presumably mirrors the Word2Vec vectorSize used earlier --
# confirm the CSV that was loaded has 100 components per phrase.
data = array(data).reshape(len(rname), 100)
# Call form of print: with a single argument it prints the same under the
# Python 2 print statement, and it stays valid after a later cell has run
# `from __future__ import print_function` in the same kernel, where the
# statement form `print data` is a SyntaxError.
print(data)

[[ 0.162  0.082 -0.142 ..., -0.168 -0.016 -0.139]
 [ 0.803 -0.065  0.832 ...,  0.302  0.035  0.654]
 [ 0.915  2.055 -0.903 ..., -1.734  1.159 -2.014]
 ..., 
 [-0.347  0.795 -0.306 ...,  0.68   0.375  1.041]
 [-0.392  0.437 -0.121 ...,  1.127  0.286  1.455]
 [ 0.221 -0.223  0.286 ..., -0.049  0.076 -0.081]]


In [139]:
from __future__ import print_function
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import KMeans
import numpy as np

# Candidate cluster counts and their average silhouette score.
# NOTE(review): the saved output below lists n_clusters beyond 9, so it was
# produced with a wider range than the one written here.
range_n_clusters = range(2,10)
silhlist = {}

for n_clusters in range_n_clusters:
    # Use a dedicated name for the KMeans estimator: the original bound it to
    # `model`, overwriting the Word2Vec model that the vector-export cell reads.
    kmeans = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_ind = kmeans.fit_predict(data)

    silhouette_avg = silhouette_score(data, cluster_ind)
    silhlist[n_clusters] = silhouette_avg
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

# Cluster count with the highest average silhouette score.
k = max(silhlist, key=silhlist.get)

For n_clusters = 2 The average silhouette_score is : 0.273847246381
For n_clusters = 3 The average silhouette_score is : 0.258930765624
For n_clusters = 4 The average silhouette_score is : 0.260479562336
For n_clusters = 5 The average silhouette_score is : 0.271219948238
For n_clusters = 6 The average silhouette_score is : 0.274052025562
For n_clusters = 7 The average silhouette_score is : 0.16183022316
For n_clusters = 8 The average silhouette_score is : 0.151338495823
For n_clusters = 9 The average silhouette_score is : 0.150432670438
For n_clusters = 10 The average silhouette_score is : 0.142594733024
For n_clusters = 11 The average silhouette_score is : 0.140660278944
For n_clusters = 12 The average silhouette_score is : 0.103739604283
For n_clusters = 13 The average silhouette_score is : 0.134331786764
For n_clusters = 14 The average silhouette_score is : 0.134657125555
For n_clusters = 15 The average silhouette_score is : 0.135408102742
For n_clusters = 16 The average silhouette_

In [140]:
# Final clustering with a fixed number of clusters and a fixed random seed.
# NOTE(review): this overwrites the `k` selected by the silhouette sweep in
# the previous cell with a hand-picked value -- confirm that 4 is intended.
k = 4
clusterer = KMeans(n_clusters=k, random_state=10)
# cluster_label[i] is the cluster index assigned to row i of `data`.
cluster_label = clusterer.fit_predict(data)

In [142]:
# Normalise the labels to a list of plain ints. Building the list with int()
# instead of calling .tolist() keeps this cell re-runnable: on a second run
# cluster_label is already a list, which has no .tolist() method.
cluster_label = [int(label) for label in cluster_label]

# One bucket per cluster index, so every index in range(k) is present even
# when a cluster received no phrase.
group_by_label = {label: [] for label in range(k)}
for phrase_name, label in zip(rname, cluster_label):
    group_by_label[label].append(phrase_name)

In [148]:
# Write one text file per cluster, one phrase per line; characters that
# cannot be encoded as ASCII are dropped from the phrase.
for i in range(k):
    cluster_file = "/Users/lakerwayne/Desktop/YelpChallenge/spark/Mexican" + str(i) + ".txt"
    with open(cluster_file, "w") as txtfile:
        for c in group_by_label[i]:
            txtfile.write(c.encode('ascii', 'ignore') + '\n')