In [1]:
from sagemaker import get_execution_role

# S3 bucket and key prefix shared by the dataset upload and the training job.
bucket_name = 'elen6889'
bucket_key_prefix = 'spoiler-classifier'

# Size argument handed to one_hot_encode / vectorize_sequences further down.
# NOTE(review): presumably the vocabulary (hashing) size of the encoder —
# confirm against classifier_utilities.
vocabulary_length = 9013

# IAM role resolved for this notebook; it is later passed to the MXNet estimator.
role = get_execution_role()
print(role)

arn:aws:iam::176363299110:role/service-role/AmazonSageMaker-ExecutionRole-20220425T104815


In [2]:
# Create the local dataset/ working directory; -p makes this a no-op if it already exists.
!mkdir -p dataset

In [3]:
# Extract the archive into dataset/, overwriting existing files (-o).
# NOTE(review): nothing visible in this notebook puts dataset/archive.zip in
# place — it presumably has to be uploaded manually first; confirm.
!unzip -o dataset/archive.zip -d dataset

Archive:  dataset/archive.zip
  inflating: dataset/IMDB_movie_details.json  
  inflating: dataset/IMDB_reviews.json  


In [25]:
import pandas as pd
import numpy as np
import pickle
from classifier_utilities import one_hot_encode
from classifier_utilities import vectorize_sequences
# Read the review table from CSV and keep only the two columns used below:
# the spoiler flag (column 0) and the review text (column 1).
# NOTE(review): nothing visible in this notebook creates dataset/df.csv (the
# direct read of dataset/IMDB_reviews.json is disabled) — confirm its origin.
df_review = pd.read_csv('dataset/df.csv')
df = df_review[['is_spoiler', 'text']].copy()

In [26]:
# Recode the label (first) column from booleans to integers:
# False -> 0, True -> 1.
label_column = df.columns[0]
df[label_column] = df[label_column].map({False: 0, True: 1})

In [55]:
# Shuffle all rows so that the positional 80/20 train/validation split made
# later is random. The seed is pinned so the shuffle — and therefore the split
# and every downstream result — is identical on each Restart & Run All.
df = df.sample(frac=1, random_state=42)

In [56]:
# Preview the first three rows of the shuffled frame (label + review text).
df.head(3)

Unnamed: 0,is_spoiler,text
406,0,What constitutes horror? Many people say it's...
167,0,This is one of only two movies I can recall wa...
63,0,Yet another dry humorred spoof of a documentar...


In [57]:
# Column 0 is the 0/1 label, column 1 the review text; pull both out as arrays.
targets = df.iloc[:, 0].values
messages = df.iloc[:, 1].values

# Encode every review with the classifier_utilities helpers.
# NOTE(review): the exact encoding is defined in classifier_utilities —
# presumably word indices turned into fixed-width vectors of size
# vocabulary_length; confirm there.
one_hot_data = one_hot_encode(messages, vocabulary_length)
encoded_messages = vectorize_sequences(one_hot_data, vocabulary_length)

In [58]:
# Display the raw text of the last review as a sanity check.
messages[-1]

'This movie has sarcasm, lots and lots of humour, great dialogue, pathos, brilliant acting and senes of things blowing up all rolled into one ace movie! Kirsten Dunst really does shine out in this movie and I think she has so much potential to be a big star if she gets a role in a blockbuster movie.  The humour in this movie is great and I love the scenes where we see how the cast die and especially my favourite scene where Denise Richards dances with the Jesus doll and you people who thought that was blasphemy, get a life it was just harmless fun.  Whoever wrote this script is a genius and has loads of great one off lines so listen out for them. Rating 7.5/10'

In [59]:
# Print the encoded feature matrix (numpy abbreviates large arrays with "...").
print(encoded_messages)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [60]:
# Wrap the encoded features in a frame and put the label in the first column.
# The column name 'spam' never reaches the exported CSVs, which are written
# without a header row.
df2 = pd.DataFrame(encoded_messages)
df2.insert(loc=0, column='spam', value=targets)

# Positional 80/20 split: the first 80% of rows (rounded up) go to training,
# the remainder to validation.
split_index = int(np.ceil(len(df) * 0.8))
train_set, val_set = df2.iloc[:split_index], df2.iloc[split_index:]

In [61]:
# Write both splits as gzip-compressed CSV files without header or index column.
csv_options = {'header': False, 'index': False, 'compression': 'gzip'}
train_set.to_csv('dataset/sms_train_set.gz', **csv_options)
val_set.to_csv('dataset/sms_val_set.gz', **csv_options)

In [62]:
import boto3

s3 = boto3.resource('s3')
target_bucket = s3.Bucket(bucket_name)

# (local file, S3 key template) for each split; {0} is filled with the key prefix.
uploads = (
    ('dataset/sms_train_set.gz', '{0}/train/sms_train_set.gz'),
    ('dataset/sms_val_set.gz', '{0}/val/sms_val_set.gz'),
)

# Stream each compressed split into its own channel folder in the bucket.
for local_path, key_template in uploads:
    with open(local_path, 'rb') as data:
        target_bucket.upload_fileobj(data, key_template.format(bucket_key_prefix))



In [63]:
# Print the training script that is passed to the MXNet estimator as its entry point.
!cat 'sms_spam_classifier_mxnet_script.py'

from __future__ import print_function

import logging
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn
import numpy as np
import json
import time

import pip

try:
    from pip import main as pipmain
except:
    from pip._internal import main as pipmain

pipmain(['install', 'pandas'])
import pandas

#logging.basicConfig(level=logging.DEBUG)

# ------------------------------------------------------------ #
# Training methods                                             #
# ------------------------------------------------------------ #


def train(hyperparameters, input_data_config, channel_input_dirs, output_data_dir,
          num_gpus, num_cpus, hosts, current_host, **kwargs):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    ctx = mx.cpu()

    # retrieve the hyperparameters and apply some defaults

In [64]:
from sagemaker.mxnet import MXNet

# S3 locations for the training job's output and for the uploaded copy of
# the training script.
output_path = 's3://{0}/{1}/output'.format(bucket_name, bucket_key_prefix)
code_location = 's3://{0}/{1}/code'.format(bucket_name, bucket_key_prefix)

# Estimator that runs sms_spam_classifier_mxnet_script.py on one ml.c5.2xlarge
# instance. `instance_count` is the SageMaker Python SDK v2 name of the former
# `train_instance_count`, which only survives as a deprecated alias that emits
# a rename warning.
m = MXNet('sms_spam_classifier_mxnet_script.py',
          role=role,
          instance_count=1,
          instance_type='ml.c5.2xlarge',
          output_path=output_path,
          base_job_name='sms-spam-classifier-mxnet',
          framework_version="1.2",
          py_version="py3",
          code_location=code_location,
          hyperparameters={'batch_size': 100,
                           'epochs': 20,
                           'learning_rate': 0.01})

# One input channel per split; the keys become the channel names seen by the
# training container ('train' and 'val').
inputs = {'train': 's3://{0}/{1}/train/'.format(bucket_name, bucket_key_prefix),
          'val': 's3://{0}/{1}/val/'.format(bucket_name, bucket_key_prefix)}

# Launch the training job; its log is streamed into the cell output.
m.fit(inputs)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2022-04-25 19:32:00 Starting - Starting the training job...
2022-04-25 19:32:24 Starting - Preparing the instances for trainingProfilerReport-1650915119: InProgress
......
2022-04-25 19:33:27 Downloading - Downloading input data...
2022-04-25 19:33:48 Training - Training image download completed. Training in progress.[34m2022-04-25 19:33:49,824 INFO - root - running container entrypoint[0m
[34m2022-04-25 19:33:49,824 INFO - root - starting train task[0m
[34m2022-04-25 19:33:49,828 INFO - container_support.training - Training starting[0m
[34m2022-04-25 19:33:50,664 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'model_dir': '/opt/ml/model', 'output_data_dir': '/opt/ml/output/data/', '_scheduler_host': 'algo-1', 'container_log_level': 20, 'output_dir': '/opt/ml/output', 'current_host': 'algo-1', 'channel_dirs': {'train': '/opt/ml/input/data/train', 'val': '/opt/ml/input/data/val'}, 'channels': {'train': {'TrainingInputMode': 'File', 'S3DistributionType': 'FullyReplicate

In [65]:
# Deploy the trained estimator to an endpoint backed by a single ml.m5.large
# instance; `mxnet_pred` is the predictor used for inference further down.
mxnet_pred = m.deploy(initial_instance_count=1,
                      instance_type='ml.m5.large')

-----!

In [68]:
from sagemaker.mxnet.model import MXNetPredictor
from classifier_utilities import one_hot_encode
from classifier_utilities import vectorize_sequences

# Uncomment the following line to connect to an existing endpoint.
#mxnet_pred = MXNetPredictor('classifier-mxnet-2022-04-25-18-34-12-620')

# One review excerpt used as a smoke test for the deployed endpoint. The
# literal is split across lines only for readability; adjacent string pieces
# are concatenated into a single message.
test_messages = [
    "Along with Q and Pulp Fiction, Frank Darabont (The Walking Dead original writer) "
    "and his debut film got screwed by the Hollywood Powerhouse that is Tom Hanks "
    "with this now can be considered an American Classic. The fact that this film "
    "barely and I emphasize barely made it's money back in US theaters (which is "
    "usually a signal for a film to turn into a DOA when released at home) and it "
    "made such a splash,"
]

# Encode the test review with the same helpers used for the training data.
one_hot_test_messages = one_hot_encode(test_messages, vocabulary_length)
encoded_test_messages = vectorize_sequences(one_hot_test_messages, vocabulary_length)
print(encoded_test_messages)

# Send the encoded review to the endpoint and show the returned prediction.
result = mxnet_pred.predict(encoded_test_messages)
print(result)

[[0. 0. 0. ... 0. 0. 0.]]
{'predicted_label': [[0.0]], 'predicted_probability': [[0.003053836291655898]]}
