# Yelp data prep
https://www.yelp.com/dataset/download

## Moving files around

In [1]:
!mkdir -p /temp

In [2]:
# !gsutil cp gs://msca-bdp-data/yelp/yelp_dataset.tgz /temp

In [3]:
# !tar -C /temp -xvf /temp/yelp_dataset.tgz

In [4]:
# !gsutil cp -n /temp/* gs://msca-bdp-data/yelp/

In [5]:
!gsutil ls -l gs://msca-bdp-data/yelp/

         0  2020-10-17T18:28:38Z  gs://msca-bdp-data/yelp/
     41776  2020-11-21T22:26:22Z  gs://msca-bdp-data/yelp/Dataset_User_Agreement.pdf
   5785348  2020-10-17T18:29:04Z  gs://msca-bdp-data/yelp/Food_Inspections.csv
  13831001  2020-10-17T18:28:35Z  gs://msca-bdp-data/yelp/covid_19_dataset.tgz
 152898689  2020-11-21T22:26:24Z  gs://msca-bdp-data/yelp/yelp_academic_dataset_business.json
 449663480  2020-11-21T22:26:29Z  gs://msca-bdp-data/yelp/yelp_academic_dataset_checkin.json
6325565224  2020-11-21T22:27:47Z  gs://msca-bdp-data/yelp/yelp_academic_dataset_review.json
 263489322  2020-11-21T22:27:51Z  gs://msca-bdp-data/yelp/yelp_academic_dataset_tip.json
3268069927  2020-11-21T22:28:19Z  gs://msca-bdp-data/yelp/yelp_academic_dataset_user.json
4772313040  2020-10-17T18:30:36Z  gs://msca-bdp-data/yelp/yelp_dataset.tgz
7456358400  2020-10-17T18:29:16Z  gs://msca-bdp-data/yelp/yelp_photos.tar
                                 gs://msca-bdp-data/yelp/yelp_model/
TOTAL: 11 objects, 227

## Working on Big Data.  Must use Spark kernel

In [6]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import pandas as pd
import numpy as np

In [7]:
# !pip install langdetect

In [8]:
from langdetect import detect

In [9]:
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [10]:
review = spark.read.json('gs://msca-bdp-data/yelp/yelp_academic_dataset_review.json')

In [11]:
review.count()

8021122

In [12]:
review_sample = review.limit(1000).cache()

In [13]:
review_sample.limit(5)

business_id,cool,date,funny,review_id,stars,text,useful,user_id
HRxCNChQNu7maK29q...,0,2016-12-11 00:18:50,0,wNu8mv1l2Ls_2xu6_...,5.0,Great service and...,0,xOzhzXAfBuKZbTnd5...
9WJd4cqm3ghYA2RxS...,0,2017-03-10 18:43:31,0,lm8EslRY-LFYGVPP2...,3.0,Trying to be too ...,1,z-FYeN6rjBhqwI7ij...
sAmmv3ffj99CN85Wj...,0,2018-02-17 19:06:40,0,Qs1kfCrm8LU068CYq...,5.0,In my initial rev...,0,H3AEm2-QHO60PSEtv...
oiAlXZPIFm2nBCt0D...,0,2016-03-01 21:14:46,0,lD0OOhfrQvzO1upra...,1.0,Great prices..but...,0,gebb5qh_m-in8ZACY...
EVkytEhlC1nswqmgr...,0,2017-09-16 02:07:14,0,9_7320FMHugzvF6Ga...,4.0,虽然给了四星 是因为在店里吃还行 ...,0,LAvfRMWLE7xdxZBEy...


In [14]:
review_sample.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [15]:
review_sample.describe(['stars']).show()

+-------+------------------+
|summary|             stars|
+-------+------------------+
|  count|              1000|
|   mean|             3.894|
| stddev|1.3699053400292454|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+



In [16]:
review_sample.groupby('stars').agg(count('*').alias('record_cnt')).orderBy('stars', ascending=False).show()

+-----+----------+
|stars|record_cnt|
+-----+----------+
|  5.0|       475|
|  4.0|       241|
|  3.0|       105|
|  2.0|        61|
|  1.0|       118|
+-----+----------+



In [17]:
review = review.\
withColumn("sentiment_strength",\
           when(col("stars") == 1, "Strong Negative").\
           when(col("stars") == 2, "Negative").\
           when(col("stars") == 3, "Neutral").\
           when(col("stars") == 4, "Positive").\
           when(col("stars") == 5, "Strong Positive").\
           otherwise("No Sentiment")).\
withColumn("label",\
           when(col("stars") == 1, 0).\
#            when(col("stars") == 2, 0).\
#            when(col("stars") == 3, 2).\
#            when(col("stars") == 4, 1).\
           when(col("stars") == 5, 1).\
           otherwise(3))

In [18]:
review.groupby('label').agg(count('*').alias('record_cnt')).orderBy('label', ascending=False).show()

+-----+----------+
|label|record_cnt|
+-----+----------+
|    3|   3150765|
|    1|   3586460|
|    0|   1283897|
+-----+----------+



In [19]:
trainingData = review.filter(review['label'] < 3).select('text', 'label')

In [20]:
trainingData.limit(5)

text,label
"Ordered delivery,...",0
Be warned! Once y...,0
I love this littl...,1
This guy was grea...,1
Yes yes and yes.....,1


### Balance sample between classes

In [21]:
ratio_adjust = 1.0 ## ratio of pos to neg in the df_subsample


counts = trainingData.groupBy('label').count().collect()

if counts[0][1] > counts[1][1]:
    down_class = counts[0][0]
else:
    down_class = counts[1][0]
    

higherBound = counts[0][1]
treshold_to_filter = int(ratio_adjust * float(counts[1][1]) / counts[0][1] * higherBound)
 
randGen = lambda x: np.random.randint(0, higherBound) if x == down_class else -1
 
udfRandGen = udf(randGen, IntegerType())

trainingData = trainingData.withColumn("randIndex", udfRandGen("label"))
trainingData = trainingData.filter(trainingData['randIndex'] < treshold_to_filter).drop('randIndex')
 
print("Distribution of Pos and Neg cases of the down-sampled training data are: \n", \
      trainingData.groupBy("label").count().take(3))

Distribution of Pos and Neg cases of the down-sampled training data are: 
 [Row(label=1, count=1282962), Row(label=0, count=1283897)]


In [22]:
trainingDataSample = trainingData.sample(False, 0.1, 12345)

In [23]:
trainingDataSample.count()

256417

In [24]:
pd_reviews = trainingDataSample.toPandas()

In [25]:
type(pd_reviews)

pandas.core.frame.DataFrame

In [26]:
# pd_reviews.to_pickle('/temp/yelp_review.pkl')

pd_reviews.to_json('/temp/yelp_review.json', orient='records', lines=True)

In [27]:
pd_reviews.head(5)

Unnamed: 0,text,label
0,I love Deagan's. I do. I really do. The atmosp...,1
1,I love the classes at this gym. Zumba and. Rad...,1
2,The tables and floor were dirty. I was the onl...,0
3,I had an oil change at the 15515 N Scottsdale ...,0
4,The absolute WORST apartment complex I have ev...,0


In [28]:
pd_reviews.shape

(256512, 2)

#### Saving the review DF with star ratings for multi-class classification

In [29]:
review_sample = review.sample(False, 0.05, 12345)
review_sample.count()

401309

In [30]:
pd_stars = review_sample.select('text', 'sentiment_strength').toPandas()

In [32]:
# pd_stars.to_pickle('/temp/yelp_review_stars.pkl')

pd_stars.to_json('/temp/yelp_review_stars.json', orient='records', lines=True)

In [33]:
pd_stars.head(5)

Unnamed: 0,text,sentiment_strength
0,I am actually horrified this place is still in...,Strong Negative
1,Maria is VERY good at what she does with great...,Storng Positive
2,First time eating there and everything was so ...,Storng Positive
3,Had brunch here Sunday with a friend. The meal...,Neutral
4,"Food was piping hot, which is great!! Had the ...",Neutral


In [34]:
pd_stars.shape

(401309, 2)

#### Detect language and limit to English only

In [35]:
def lang_detect(x):
    try:
        return detect(x)
    except:
        return '--'

In [36]:
%time pd_reviews['lang'] = pd_reviews['text'].apply(lambda x: lang_detect(x))

CPU times: user 19min 14s, sys: 9 s, total: 19min 23s
Wall time: 19min 23s


In [37]:
pd_reviews.head(5)

Unnamed: 0,text,label,lang
0,I love Deagan's. I do. I really do. The atmosp...,1,en
1,I love the classes at this gym. Zumba and. Rad...,1,en
2,The tables and floor were dirty. I was the onl...,0,en
3,I had an oil change at the 15515 N Scottsdale ...,0,en
4,The absolute WORST apartment complex I have ev...,0,en


In [38]:
pd_reviews = pd_reviews[pd_reviews['lang']=='en'].reset_index(drop=True)
pd_reviews.shape

(255717, 3)

In [39]:
pd_reviews.to_json('/temp/yelp_train_sentiment.json', orient='records', lines=True)

In [None]:
# !rm /temp/*

#### Make available in "open bucket"

In [3]:
!gsutil cp -r gs://msca-bdp-data/yelp/* gs://msca-bdp-data-open/yelp/

Copying gs://msca-bdp-data/yelp/Dataset_User_Agreement.pdf [Content-Type=application/pdf]...
Copying gs://msca-bdp-data/yelp/Food_Inspections.csv [Content-Type=text/csv]... 
Copying gs://msca-bdp-data/yelp/covid_19_dataset.tgz [Content-Type=application/x-compressed]...
/ [3 files][ 18.8 MiB/ 18.8 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://msca-bdp-data/yelp/yelp_academic_dataset_business.json [Content-Type=application/json]...
Copying gs://msca-bdp-data/yelp/yelp_academic_dataset_checkin.json [Content-Type=application/json]...
Copying gs://msca-bdp-data/yelp/yelp_academic_dataset_review.json [Content-Type=application/json]...
Copying gs://msca-bdp-data/yelp/yelp_academic_dataset_tip.json [Content-Type=application/js

In [1]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Sat, 28 November 2020 15:28:56'