In [1]:
from sagemaker import get_execution_role

bucket_name = 'elen6889'

role = get_execution_role()
bucket_key_prefix = 'spoiler-classifier'
vocabulary_length = 9013

print(role)

arn:aws:iam::176363299110:role/service-role/AmazonSageMaker-ExecutionRole-20220425T104815


In [2]:
!mkdir -p dataset

In [3]:
!unzip -o dataset/archive.zip -d dataset

Archive:  dataset/archive.zip
  inflating: dataset/IMDB_movie_details.json  
  inflating: dataset/IMDB_reviews.json  


In [49]:
import pandas as pd
import numpy as np
import pickle
from classifier_utilities import one_hot_encode
from classifier_utilities import vectorize_sequences
#df_reviews = pd.read_json('dataset/IMDB_reviews.json', lines=True)
df_review = pd.read_csv('dataset/dfnew.csv')
#print('User reviews shape: ', df_reviews.shape)
#df[df.columns[0]] = df[df.columns[0]].map({'True': 0, 'False': 1})
df = pd.DataFrame()
df['is_spoiler'] = df_review['is_spoiler']
df['text'] = df_review['text']

In [50]:
df[df.columns[0]] = df[df.columns[0]].map({False: 0, True: 1})

In [58]:
df = df.sample(frac = 1)

In [59]:
df.head(3)

Unnamed: 0,is_spoiler,text
1289,1,Shawshank has everything you need from a movie...
2376,1,"Freeman, who is simply a great actor, a man wh..."
307,0,RT @chrissvellx: the ending of #MoonKnight epi...


In [60]:
targets = df[df.columns[0]].values
messages = df[df.columns[1]].values

# one hot encoding for each SMS message
one_hot_data = one_hot_encode(messages, vocabulary_length)
encoded_messages = vectorize_sequences(one_hot_data, vocabulary_length)

In [61]:
messages[-1]

'I think this work is very important! I think it will go a long way in the future, so hopefully the project will be… https://t.co/tlrI78Xgog'

In [62]:
print(encoded_messages)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [63]:
df2 = pd.DataFrame(encoded_messages)
df2.insert(0, 'spam', targets)

# Split into training and validation sets (80%/20% split)
split_index = int(np.ceil(df.shape[0] * 0.8))
train_set = df2[:split_index]
val_set = df2[split_index:]

In [64]:
train_set.to_csv('dataset/sms_train_set.gz', header=False, index=False, compression='gzip')
val_set.to_csv('dataset/sms_val_set.gz', header=False, index=False, compression='gzip')

In [65]:
import boto3

s3 = boto3.resource('s3')
target_bucket = s3.Bucket(bucket_name)

with open('dataset/sms_train_set.gz', 'rb') as data:
    target_bucket.upload_fileobj(data, '{0}/train/sms_train_set.gz'.format(bucket_key_prefix))
    
with open('dataset/sms_val_set.gz', 'rb') as data:
    target_bucket.upload_fileobj(data, '{0}/val/sms_val_set.gz'.format(bucket_key_prefix))



In [66]:
!cat 'sms_spam_classifier_mxnet_script.py'

from __future__ import print_function

import logging
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn
import numpy as np
import json
import time

import pip

try:
    from pip import main as pipmain
except:
    from pip._internal import main as pipmain

pipmain(['install', 'pandas'])
import pandas

#logging.basicConfig(level=logging.DEBUG)

# ------------------------------------------------------------ #
# Training methods                                             #
# ------------------------------------------------------------ #


def train(hyperparameters, input_data_config, channel_input_dirs, output_data_dir,
          num_gpus, num_cpus, hosts, current_host, **kwargs):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    ctx = mx.cpu()

    # retrieve the hyperparameters and apply some defaults

In [67]:
from sagemaker.mxnet import MXNet

output_path = 's3://{0}/{1}/output'.format(bucket_name, bucket_key_prefix)
code_location = 's3://{0}/{1}/code'.format(bucket_name, bucket_key_prefix)

m = MXNet('sms_spam_classifier_mxnet_script.py',
          role=role,
          train_instance_count=1,
          instance_type='ml.c5.2xlarge',
          output_path=output_path,
          base_job_name='sms-spam-classifier-mxnet',
          framework_version="1.2",
          py_version="py3",
          code_location = code_location,
          hyperparameters={'batch_size': 300,
                         'epochs': 10,
                         'learning_rate': 0.3})

inputs = {'train': 's3://{0}/{1}/train/'.format(bucket_name, bucket_key_prefix),
 'val': 's3://{0}/{1}/val/'.format(bucket_name, bucket_key_prefix)}

m.fit(inputs)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2022-05-05 02:23:13 Starting - Starting the training job...
2022-05-05 02:23:38 Starting - Preparing the instances for trainingProfilerReport-1651717392: InProgress
......
2022-05-05 02:24:38 Downloading - Downloading input data...
2022-05-05 02:25:02 Training - Training image download completed. Training in progress.[34m2022-05-05 02:25:04,136 INFO - root - running container entrypoint[0m
[34m2022-05-05 02:25:04,137 INFO - root - starting train task[0m
[34m2022-05-05 02:25:04,141 INFO - container_support.training - Training starting[0m
[34m2022-05-05 02:25:04,987 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'_scheduler_ip': '10.2.194.220', 'available_gpus': 0, 'enable_cloudwatch_metrics': False, 'channels': {'val': {'TrainingInputMode': 'File', 'S3DistributionType': 'FullyReplicated', 'RecordWrapperType': 'None'}, 'train': {'TrainingInputMode': 'File', 'S3DistributionType': 'FullyReplicated', 'RecordWrapperType': 'None'}}, 'sagemaker_region': 'us-east-1', 'user_scr

In [68]:
mxnet_pred = m.deploy(initial_instance_count=1,
                      instance_type='ml.m5.large')

------!

In [76]:
from sagemaker.mxnet.model import MXNetPredictor
from classifier_utilities import one_hot_encode
from classifier_utilities import vectorize_sequences

# Uncomment the following line to connect to an existing endpoint.
mxnet_pred = MXNetPredictor('sms-spam-classifier-mxnet-2022-05-05-02-26-26-135')

test_messages = ['''RT @ZenitsuStreams: God pls dont spoil @
#DoctorStrange #MultiverseOfMadness #Wanda https://t.co/UJYmar6kYb''']
one_hot_test_messages = one_hot_encode(test_messages, vocabulary_length)
encoded_test_messages = vectorize_sequences(one_hot_test_messages, vocabulary_length)

print(encoded_test_messages)

result = mxnet_pred.predict(encoded_test_messages)
print(result)

[[0. 0. 0. ... 0. 0. 0.]]
{'predicted_label': [[1.0]], 'predicted_probability': [[0.9983275532722473]]}
