## Blazing Text Classification for Yelp

### Import

In [None]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
prefix = 'blazingtext_yelp_review/supervised' 

In [4]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
!pwd

/root/fake_review_detection_modeling/yelp_review_blazing_text


In [8]:
yelp_review_pd = pd.read_excel('../data/Yelp Labelled Review Dataset with Sentiments and Features.xlsx')

In [9]:
yelp_review_pd.head()

Unnamed: 0,User_id,Product_id,Rating,Date,Review,Spam(1) and Not Spam(0),Sentiment,Features
0,923,0,3,2014-01-30,The food at snack is a selection of popular Gr...,1,Positive,"['appetizer tray', 'greek salad', 'main courses']"
1,924,0,3,2011-05-05,This little place in Soho is wonderful. I had ...,1,Positive,"['little place', 'soho', 'lamb sandwich', 'soh..."
2,925,0,4,2011-12-30,ordered lunch for 15 from Snack last Friday. Ã...,1,Positive,"['snack', 'regular company lunch list']"
3,926,0,4,2012-10-04,This is a beautiful quaint little restaurant o...,1,Positive,"['beautiful quaint', 'pretty street', 'great p..."
4,927,0,4,2014-02-06,Snack is great place for a Ã‚Â casual sit down...,1,Positive,"['snack', 'great place', 'Ã¢ casual', 'cold wi..."


In [10]:
yelp_review_pd_clean = yelp_review_pd[['Review','Spam(1) and Not Spam(0)']]

In [11]:
yelp_review_pd_clean.head()

Unnamed: 0,Review,Spam(1) and Not Spam(0)
0,The food at snack is a selection of popular Gr...,1
1,This little place in Soho is wonderful. I had ...,1
2,ordered lunch for 15 from Snack last Friday. Ã...,1
3,This is a beautiful quaint little restaurant o...,1
4,Snack is great place for a Ã‚Â casual sit down...,1


In [12]:
yelp_review_train, yelp_review_test = train_test_split(yelp_review_pd_clean,test_size=0.2)

In [13]:
yelp_review_train.head()

Unnamed: 0,Review,Spam(1) and Not Spam(0)
55797,My favorite brunch place in the Soho area. Sma...,0
282437,"The best southern food in NYC, hands down. I a...",0
284378,After seeing all the great reviews about this ...,0
137417,"If you go, sit in the back garden. It feels li...",0
269443,Quite possibly one of my newest favorite place...,0


In [14]:
len(yelp_review_train)

284168

In [15]:
yelp_review_train['Spam(1) and Not Spam(0)'].astype('str').describe()

count     284168
unique         2
top            0
freq      255278
Name: Spam(1) and Not Spam(0), dtype: object

In [16]:
yelp_review_train.to_csv('train.csv',header=None,index=False)

In [17]:
!head train.csv

"My favorite brunch place in the Soho area. Small locale so get there early and enjoy a nice cup of coffee while you wait. Their coffee is delicious, as well as the hot and toasty French bread which is honestly my favorite part of the dining experience. For main dishes, I recommend the Tuna Tartar, French Toast, Pancakes, and Creme Souffle dishes. Absolutely wonderful staff. Ã‚Â The chef was a little late in the morning, so they gave us free mimosas to pass the time. Definitely worth the price!",0
"The best southern food in NYC, hands down. I always get the Reggie Harris. Ã‚Â It is crispy honey glazed chicken which reminds me of Korean fried style chicken but in a southern format. WUT!? YES. The mac-n-cheese is wonderful and so are the greens. Eat up!",0
"After seeing all the great reviews about this place, I really have to say I kind of expected more. The setting is very low-key (that's an understatement), and I'd even say divey in nature. Then I thought, maybe this means the food wil

In [18]:
yelp_review_test.head()

Unnamed: 0,Review,Spam(1) and Not Spam(0)
41675,The owner is a bee-yotch. Rudely informed me t...,0
218653,My other local bar in BK that isn't Zombie Hut...,0
308072,The tiny bathroom had trouble getting in out,0
309717,Rooms were just too small Once suitcase was i...,0
268181,Nice atmosphere and friendly staff. The food w...,1


In [19]:
len(yelp_review_test)

71042

In [20]:
yelp_review_test['Spam(1) and Not Spam(0)'].astype('str').describe()

count     71042
unique        2
top           0
freq      63799
Name: Spam(1) and Not Spam(0), dtype: object

In [21]:
yelp_review_test.to_csv('test.csv',header=None,index=False)

In [22]:
index_to_label = {'1':'fake','0':'truth'}

In [23]:
def transform_instance(row):
    cur_row = []
    label = "__label__" + index_to_label[row[1]]  #Prefix the index-ed label with __label__
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[0].lower()))
    return cur_row

In [24]:
def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, 'r') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[:int(keep*len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close() 
    pool.join()
    
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [None]:
%%time

# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of mintutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset
preprocess('train.csv', 'yelp_review.train', keep=.5)
        
# Preparing the validation dataset        
preprocess('test.csv', 'yelp_review.validation')

In [26]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='yelp_review.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='yelp_review.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 332 ms, sys: 154 ms, total: 486 ms
Wall time: 921 ms


In [27]:
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

### Training

In [28]:
region_name = boto3.Session().region_name

In [29]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


Using SageMaker BlazingText container: 475088953585.dkr.ecr.ap-southeast-1.amazonaws.com/blazingtext:1 (ap-southeast-1)


### Training the BlazingText model for supervised text classification

In [30]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         instance_count=1, 
                                         instance_type='ml.c4.4xlarge',
                                         volume_size = 30,
                                         max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         hyperparameters = {
                                           "mode":"supervised",
                                           "epochs":1,
                                            "min_count":2,
                                            "learning_rate": 0.05,
                                            "vector_dim":10,
                                            "early_stopping":True,
                                            "patience":4,
                                            "min_epochs":5,
                                            "word_ngrams":2
                                         })

In [31]:
train_data = sagemaker.inputs.TrainingInput(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

In [32]:
bt_model.fit(inputs=data_channels, logs=True)

2021-03-27 10:19:11 Starting - Starting the training job...
2021-03-27 10:19:15 Starting - Launching requested ML instancesProfilerReport-1616840351: InProgress
......
2021-03-27 10:20:41 Starting - Preparing the instances for training......
2021-03-27 10:21:35 Downloading - Downloading input data
2021-03-27 10:21:35 Training - Downloading the training image..[34mArguments: train[0m
[34m[03/27/2021 10:21:50 INFO 139806527149440] nvidia-smi took: 0.025188922882080078 secs to identify 0 gpus[0m
[34m[03/27/2021 10:21:50 INFO 139806527149440] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[03/27/2021 10:21:50 INFO 139806527149440] Processing /opt/ml/input/data/train/yelp_review.train . File size: 3.2117605209350586 MB[0m
[34m[03/27/2021 10:21:50 INFO 139806527149440] Processing /opt/ml/input/data/validation/yelp_review.validation . File size: 40.13163757324219 MB[0m
[34mRead 0M words[0m
[34mN

### Hosting / Inference

In [45]:
from sagemaker.serializers import JSONSerializer

text_classifier = bt_model.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    endpoint_name = 'blazingtext-fake-review',
    serializer = JSONSerializer()
)

-------------!

In [36]:
sentences = ["Will not return! Food: nothing impressive, no matter for a single dish or the whole experience. No wonder it\'s downgraded from 2 star to 1 star by Michelin, I think Bouley lacks that \"OMG\" dish and failed to create an experience. Service is the worst among top restaurants, well, since now it\'s in the sea of 1 stars, maybe my comparison of it to Le Bernardin is not fair. The servers do not dress well, do not have the professional attitude, and are not well organized, they are not masters of their work. Compare to being a princess in Le Bernardin or Jean-Georges, you feel like, well, a normal person. OMG, the restroom is so odd and cold. There is no way you can power yourself using those mirrors and under that lighting. Go check it out yourself. My first time and last time there."]

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances" : tokenized_sentences}

response = text_classifier.predict(payload)

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "label": [
      "__label__truth"
    ],
    "prob": [
      0.7097575068473816
    ]
  }
]


In [37]:
payload = {"instances" : tokenized_sentences,
          "configuration": {"k": 2}}

response = text_classifier.predict(payload)

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "label": [
      "__label__truth",
      "__label__fake"
    ],
    "prob": [
      0.7097575068473816,
      0.29026246070861816
    ]
  }
]


In [38]:
#sess.delete_endpoint(text_classifier.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [39]:
text_classifier

<sagemaker.predictor.Predictor at 0x7f56068cbf10>

In [40]:
tokenized_sentences

["Will not return ! Food : nothing impressive , no matter for a single dish or the whole experience . No wonder it 's downgraded from 2 star to 1 star by Michelin , I think Bouley lacks that `` OMG '' dish and failed to create an experience . Service is the worst among top restaurants , well , since now it 's in the sea of 1 stars , maybe my comparison of it to Le Bernardin is not fair . The servers do not dress well , do not have the professional attitude , and are not well organized , they are not masters of their work . Compare to being a princess in Le Bernardin or Jean-Georges , you feel like , well , a normal person . OMG , the restroom is so odd and cold . There is no way you can power yourself using those mirrors and under that lighting . Go check it out yourself . My first time and last time there ."]

In [41]:
sentences[0].split(" ")

In [42]:
bt_model.deploy?

[0;31mSignature:[0m
[0mbt_model[0m[0;34m.[0m[0mdeploy[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0minitial_instance_count[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstance_type[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mserializer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdeserializer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maccelerator_type[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mendpoint_name[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_compiled_model[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwait[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel_name[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkms_key[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_capture_config[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m

In [None]:
yelp_review_test.head()

In [47]:
yelp_review_test.loc[268181]['Review']

"Nice atmosphere and friendly staff. The food was good, not the best burger I've ever had but good nonetheless. Ã‚Â\xa0Prices are very reasonable. Ã‚Â\xa0I love the selection on the brunch menu. Ã‚Â\xa0We'll be going back."

In [48]:
yelp_review_test.loc[309717]['Review']

' Rooms were just too small Once suitcase was in couldn t move anywhere in the room Bathroom was separated from the shower and overall size was just not practical'

In [None]:
sentences = ["Nice atmosphere and friendly staff. The food was good, not the best burger I've ever had but good nonetheless. Prices are very reasonable. I love the selection on the brunch menu. We'll be going back."]

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances" : tokenized_sentences}

response = text_classifier.predict(payload)

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))