In [1]:
import findspark
findspark.init()
import pyspark
import random
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col
import pyspark.sql.functions
from pyspark.sql.functions import sum


In [2]:
spark = SparkSession.builder.master("local").appName("Yelp").getOrCreate()
sqlContext = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

In [4]:
review_df = sqlContext.read.json("yelp_academic_dataset_review.json")
business_df = sqlContext.read.json("yelp_academic_dataset_business.json")

In [5]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import LSTM
from keras import backend as K

import tensorflow
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing import text as txt

tk = txt.Tokenizer(split= " ") 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
business_Reviews = review_df.select("business_id", "text", "stars")
from pyspark.sql.functions import col, asc, when
business_Reviews = business_Reviews.orderBy(asc("business_id"))

In [7]:
business_df2 = business_df.withColumn("review_count", business_df["review_count"].cast("double"))
business_review_count = business_df2.filter(col('city').isin(['Tempe'])).groupBy("business_id").\
agg(sum("review_count").alias("rev_count"))
business_gt5 = business_review_count.filter("rev_count >= 5")
right = business_gt5.select("business_id")

In [8]:
tempeBusinessReviews = business_Reviews.join(right, "business_id")
tempeBusinessReviews_1_2_5 = tempeBusinessReviews.filter("stars = 1 or stars = 2 or stars = 5")
tempeBusinessReviews_Sentiment = tempeBusinessReviews_1_2_5.\
withColumn("sentiment", when(col("stars") == "1", "0"). when(col("stars") == "2", "0"). when(col("stars") == "5", "1"))

In [9]:
tempeBusinessReviews_Sentiment.show(5)

+--------------------+--------------------+-----+---------+
|         business_id|                text|stars|sentiment|
+--------------------+--------------------+-----+---------+
|--9QQLMTbFzLJ_oT-...|I've been going t...|    5|        1|
|--9QQLMTbFzLJ_oT-...|Don't ever believ...|    1|        0|
|--9QQLMTbFzLJ_oT-...|Always great cust...|    5|        1|
|--9QQLMTbFzLJ_oT-...|Haircut was good ...|    1|        0|
|--9QQLMTbFzLJ_oT-...|If you want a man...|    5|        1|
+--------------------+--------------------+-----+---------+
only showing top 5 rows



In [10]:
k = tempeBusinessReviews_Sentiment.select('text').rdd.map(lambda x: x[0].encode('utf-8').decode()).collect()

In [11]:
tk = txt.Tokenizer(split= " ")   #tokenizes  
print(k[0])



I've been going to this particular location for several years.  I always go to the same stylist and she always does a great job.  Someone I can rely on.   I'm not saying who she is because she's busy enough.  I will only say I moved and still go there, even though I'm much farther away and pass many other Great Clips along the way. If you use a coupon then it's really a good price.  But be sure and tip the same as you normally would if you were paying full price.


In [12]:
tk.fit_on_texts(k)


In [13]:
x = tk.texts_to_sequences(k) 

In [14]:
y = tempeBusinessReviews_Sentiment.select('sentiment').rdd.map(lambda x: int(x[0])).collect()

In [15]:
max_features = 20000   # the more the better
max_length = 500  # cut texts after this number of words (arbitray)

In [16]:
x = sequence.pad_sequences(x, maxlen = max_length, padding = 'post')

In [17]:
x[1]

array([   81,   132,   559,     1,   229,    12,    38,   728,    80,
          18,     5,   338,   897,   164,    73,    17,  1604,    77,
           1,  1416,    42,   197,    17, 34768,     2, 19063,    25,
          40,    17,   143,     4,   201,    50,     5,  1688,    59,
          50,    65,    27,    65,   284, 34769,   310,   115,    42,
           1,  2218,   137,     4,  1616,     5,  1610,    10,  4937,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [18]:
from sklearn.model_selection import train_test_split


In [19]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(len(X_train),len(y_train),len(X_test),len(y_test))

100577 100577 25145 25145


In [20]:
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten


In [21]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(100000, embedding_vector_length, input_length = max_length))
model.add(Convolution1D(nb_filter=32, filter_length=4, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=4))       # pooling (max) after convoluting
model.add(Flatten())
model.add(Dense(500, activation='relu'))     # relu and sigmoid
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           3200000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 32)           4128      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 125, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               2000500   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 501       
Total params: 5,205,129
Trainable params: 5,205,129
Non-trainable params: 0
_________________________________________________________________


  after removing the cwd from sys.path.
  """


In [22]:
y_train[0]

1

In [23]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=32, verbose=1)   # batch size powers of two
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

  """Entry point for launching an IPython kernel.


Train on 100577 samples, validate on 25145 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 96.38%


In [24]:
import pickle

In [25]:
pickle.dump(model, open("SentimentAnalysisModel", 'wb'))

In [26]:
loaded_model = pickle.load(open("SentimentAnalysisModel", 'rb'))

In [27]:
scores = loaded_model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 96.38%
