Model training. The data has already been set up: test_sentiment140.csv_noquotes.csv and training_sentiment140_noquotes.csv.

An explanation of the hyperparameters is available here: https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext_hyperparameters.html

In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

In [2]:
# SageMaker session reused by the later cells (S3 uploads, estimator, endpoint cleanup).
sess = sagemaker.Session()

# Role that SageMaker uses to leverage AWS resources (S3, CloudWatch) on your behalf.
role = get_execution_role()

# S3 destination for this experiment. Replace the bucket name with your own if
# needed (sess.default_bucket() is an alternative), and change the prefix to
# the key prefix under which the data should be stored.
bucket = 'BUCKET_NAME'
prefix = 'Sentiment140/supervised'

# Echo the resolved role and bucket so the run records what was used.
print(role)
print(bucket)

arn:aws:iam::459367279383:role/service-role/AmazonSageMaker-ExecutionRole-20190801T095874
venkat1-ml-sagemaker


In [3]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='training_sentiment140_noquotes.csv', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='test_sentiment140.csv_noquotes.csv', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 1.23 s, sys: 403 ms, total: 1.63 s
Wall time: 5 s


In [4]:
# S3 location passed to the estimator as its output path.
s3_output_location = 's3://{bucket}/{prefix}/output'.format(bucket=bucket, prefix=prefix)

In [5]:
# Resolve the BlazingText container image for the region of the current boto3 session.
image_lookup = sagemaker.amazon.amazon_estimator.get_image_uri
region_name = boto3.Session().region_name
container = image_lookup(region_name, "blazingtext", "latest")

container_message = 'Using SageMaker BlazingText container: {} ({})'
print(container_message.format(container, region_name))

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


### Here is where we Run the Model

In [6]:
# Training-job settings, gathered in one place so they are easy to tune.
# NOTE(review): per the SageMaker SDK docs, train_volume_size is in GB and
# train_max_run is in seconds -- confirm against the installed SDK version.
training_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.c4.4xlarge',
    train_volume_size=30,
    train_max_run=360000,
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

# Generic estimator wrapping the BlazingText container resolved earlier.
bt_model = sagemaker.estimator.Estimator(container, role, **training_settings)

In [7]:
# BlazingText hyperparameters for supervised (text classification) training.
# The inline notes record values tried in earlier runs.
bt_hyperparameters = {
    "mode": "supervised",
    "epochs": 45,            # tried 10 and 20
    "min_count": 2,
    "learning_rate": 0.001,  # tried 0.05 and 0.02
    "vector_dim": 100,
    "early_stopping": True,
    "patience": 4,
    "min_epochs": 5,
    "word_ngrams": 3,
}
bt_model.set_hyperparameters(**bt_hyperparameters)

In [8]:
def _plain_text_channel(s3_uri):
    """Describe a fully replicated, plain-text S3 prefix as a training input."""
    return sagemaker.session.s3_input(
        s3_uri,
        distribution='FullyReplicated',
        content_type='text/plain',
        s3_data_type='S3Prefix',
    )


train_data = _plain_text_channel(s3_train_data)
validation_data = _plain_text_channel(s3_validation_data)

# Channel name -> input description, as expected by the estimator's fit().
data_channels = {'train': train_data, 'validation': validation_data}

In [9]:
# Launch the training job on the 'train' and 'validation' channels.
# logs=True streams the job's log output into this cell.
bt_model.fit(inputs=data_channels, logs=True)

2019-12-16 14:49:53 Starting - Starting the training job...
2019-12-16 14:49:54 Starting - Launching requested ML instances......
2019-12-16 14:50:58 Starting - Preparing the instances for training...
2019-12-16 14:51:49 Downloading - Downloading input data...
2019-12-16 14:52:13 Training - Downloading the training image..[34mArguments: train[0m
[34m[12/16/2019 14:52:26 INFO 139714800158528] nvidia-smi took: 0.0251891613007 secs to identify 0 gpus[0m
[34m[12/16/2019 14:52:26 INFO 139714800158528] Running single machine CPU BlazingText training using supervised mode.[0m
[34m[12/16/2019 14:52:26 INFO 139714800158528] Processing /opt/ml/input/data/train/training_sentiment140_noquotes.csv . File size: 93 MB[0m
[34m[12/16/2019 14:52:26 INFO 139714800158528] Processing /opt/ml/input/data/validation/test_sentiment140.csv_noquotes.csv . File size: 40 MB[0m
[34mRead 10M words[0m
[34mRead 16M words[0m
[34mNumber of words:  178971[0m
[34mLoading validation data from /opt/ml/input

[34m-------------- End of epoch: 36[0m
[34mUsing 16 threads for prediction![0m
[34mValidation accuracy: 0.817496[0m
[34mValidation accuracy has not improved for last 4 epochs.[0m
[34mReached patience. Terminating training.[0m
[34mBest epoch: 32[0m
[34mBest validation accuracy: 0.817858[0m
[34m##### Alpha: 0.0000  Progress: 100.00%  Million Words/sec: 12.04 #####[0m

2019-12-16 14:53:48 Uploading - Uploading generated training model[34mTraining finished.[0m
[34mAverage throughput in Million words/sec: 12.04[0m
[34mTotal training time in seconds: 61.47
[0m
[34m#train_accuracy: 0.8819[0m
[34mNumber of train examples: 1120000
[0m
[34m#validation_accuracy: 0.8179[0m
[34mNumber of validation examples: 480000[0m

2019-12-16 14:55:30 Completed - Training job completed
Training seconds: 221
Billable seconds: 221


### Hosting


In [10]:
# Deploy the trained model behind an endpoint; the returned predictor is used
# for the sample predictions and for the cleanup at the end of the notebook.
text_classifier = bt_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

---------------------------------------------------------------------------------------------------------------------------!

In [12]:
import nltk
nltk.download('punkt')

# Sample sentences to score against the deployed endpoint.
sentences = [
    "One does not simply walk into Mordor",
    "One does not simply walk into Chicago",
    "Hey, Connor. Money in Hand is good, money in head is bad",
    "what was he thinking?",
    "morning all... looks like its gonna be a rainy day ",
    "hard to be witty when one is at their wits end",
    "Crazy day tomorow. Meetings all day",
    "looking forward to the weekend",
    "A person called asking for Information",
    "A HCP called asking for Information",
    "A HCP called asking for Information on ProductX",
    "A HCP called asking for Information on QETUOD",
    "A HCP called and told us to go fly a kite",
    "A HCP called and told us to go to hell",
]

# NOTE: an earlier version of this cell ran the same nltk tokenizer used during
# data preparation for training over every sentence:
#   tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]
# That was a bad idea for sentiment analysis since the placement of words
# matters. The line is kept here, commented out, as a reminder.

# Request body: the raw sentences plus k=2, so each prediction carries two
# labels with their probabilities.
payload = {
    "instances": sentences,
    "configuration": {"k": 2},
}
request_body = json.dumps(payload)

response = text_classifier.predict(request_body)

# Decode the JSON response and pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[
  {
    "prob": [
      0.7255306243896484,
      0.2744893729686737
    ],
    "label": [
      "__label__Negative",
      "__label__Positive"
    ]
  },
  {
    "prob": [
      0.717772364616394,
      0.2822476327419281
    ],
    "label": [
      "__label__Negative",
      "__label__Positive"
    ]
  },
  {
    "prob": [
      0.7200742959976196,
      0.2799457311630249
    ],
    "label": [
      "__label__Negative",
      "__label__Positive"
    ]
  },
  {
    "prob": [
      0.7346668243408203,
      0.26535317301750183
    ],
    "label": [
      "__label__Positive",
      "__label__Negative"
    ]
  },
  {
    "prob": [
      0.8802874088287354,
      0.11973258852958679
    ],
    "label": [
      "__label__Negative",
      "__label__Positive"
    ]
  },
  {
    "prob": [
      0.8215214610099792,
      0.1784985214471817
    ],
    "label": [
      "__la

### Close

In [12]:
# Delete the endpoint created by the deploy step now that inference is done.
endpoint_name = text_classifier.endpoint
sess.delete_endpoint(endpoint_name)