Model training. The data has already been set up: test_sentiment140.csv_noquotes.csv and training_sentiment140_noquotes.csv

Explanation of the hyperparameters: https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext_hyperparameters.html

In [7]:
import json

import boto3
import nltk
import sagemaker
from sagemaker import get_execution_role

In [8]:
# SageMaker session shared by the upload, training and clean-up cells.
sess = sagemaker.Session()

# This is the role that SageMaker would use to leverage AWS resources
# (S3, CloudWatch) on your behalf.
role = get_execution_role()

# S3 location for this experiment. Replace the bucket with your own
# (or with sess.default_bucket()) and adjust the key prefix if needed.
bucket = 'venkat1-ml-sagemaker'
prefix = 'Sentiment140/supervised'

# Echo the resolved settings so the reader can confirm them.
print(role)
print(bucket)

arn:aws:iam::459367279383:role/service-role/AmazonSageMaker-ExecutionRole-20190801T095874
venkat1-ml-sagemaker


In [9]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='training_sentiment140_noquotes.csv', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='test_sentiment140.csv_noquotes.csv', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 1.28 s, sys: 408 ms, total: 1.68 s
Wall time: 8.43 s


In [10]:
# S3 prefix handed to the estimator as its output_path.
s3_output_location = 's3://{}/{}/output'.format(
    bucket,
    prefix
)

In [11]:
# Region of the default boto3 session; the image URI is looked up for it.
region_name = boto3.Session().region_name

# Look up the "latest" BlazingText image URI for that region.
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    region_name,
    "blazingtext",
    "latest",
)
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


### Here is where we train the model

In [12]:
# Generic Estimator wrapping the BlazingText container looked up earlier.
# It runs under `role`, reuses the notebook's session, and writes its output
# to `s3_output_location`.
bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    # Training instance settings.
    train_instance_count=1,
    train_instance_type='ml.c4.4xlarge',
    train_volume_size=30,
    train_max_run=360000,
    # Input and output settings.
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

In [13]:
# BlazingText hyperparameters for this run; see the hyperparameter reference
# linked at the top of the notebook for what each one means.
bt_model.set_hyperparameters(
    mode="supervised",
    epochs=10,
    min_count=2,
    learning_rate=0.05,
    vector_dim=10,
    # Early-stopping settings.
    early_stopping=True,
    patience=4,
    min_epochs=5,
    word_ngrams=2,
)

In [14]:
# Both channels are configured identically; only the S3 URI differs.
channel_options = dict(
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)

train_data = sagemaker.session.s3_input(s3_train_data, **channel_options)
validation_data = sagemaker.session.s3_input(s3_validation_data, **channel_options)

# Channel name -> input definition, as passed to fit().
data_channels = {
    'train': train_data,
    'validation': validation_data,
}

In [15]:
# Start the training job on the two channels, with logs=True as in the log output below.
bt_model.fit(
    inputs=data_channels,
    logs=True,
)

2019-10-21 18:49:47 Starting - Starting the training job...
2019-10-21 18:49:57 Starting - Launching requested ML instances......
2019-10-21 18:51:01 Starting - Preparing the instances for training......
2019-10-21 18:52:10 Downloading - Downloading input data
2019-10-21 18:52:10 Training - Downloading the training image..[31mArguments: train[0m
[31m[10/21/2019 18:52:24 INFO 139996456646464] nvidia-smi took: 0.0252268314362 secs to identify 0 gpus[0m
[31m[10/21/2019 18:52:24 INFO 139996456646464] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[10/21/2019 18:52:24 INFO 139996456646464] Processing /opt/ml/input/data/train/training_sentiment140_noquotes.csv . File size: 93 MB[0m
[31m[10/21/2019 18:52:24 INFO 139996456646464] Processing /opt/ml/input/data/validation/test_sentiment140.csv_noquotes.csv . File size: 40 MB[0m
[31mRead 10M words[0m
[31mRead 16M words[0m
[31mNumber of words:  178971[0m
[31mLoading validation data from /opt/ml/input

### Hosting


In [16]:
# Deploy the trained model; the returned object is what the prediction cell calls.
text_classifier = bt_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

---------------------------------------------------------------------------------------------------!

In [22]:
# NOTE: `nltk` is imported in the notebook's top import cell with the other
# dependencies, so a fresh-kernel "Run All" surfaces a missing package at once.

# Sample sentences to classify.
sentences = [
    "Hey, Connor. Money in Hand is good, money in head is bad",
    "what was he thinking?",
    "morning all... looks like its gonna be a rainy day ",
    "hard to be witty when one is at their wits end",
    "Crazy day tomorow. Meetings all day",
    "looking forward to the weekend",
]

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

# Request body: the tokenized sentences plus "k": 2, which corresponds to the
# two labels (with probabilities) returned per sentence in the output below.
payload = {
    "instances": tokenized_sentences,
    "configuration": {"k": 2},
}

# Send the JSON request to the endpoint and decode the JSON reply.
response = text_classifier.predict(json.dumps(payload))
predictions = json.loads(response)

print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.9779990315437317,
      0.022020963951945305
    ],
    "label": [
      "__label__Positive",
      "__label__Negative"
    ]
  },
  {
    "prob": [
      0.9650333523750305,
      0.034986693412065506
    ],
    "label": [
      "__label__Positive",
      "__label__Negative"
    ]
  },
  {
    "prob": [
      0.8431633710861206,
      0.15685658156871796
    ],
    "label": [
      "__label__Negative",
      "__label__Positive"
    ]
  },
  {
    "prob": [
      0.8228063583374023,
      0.17721359431743622
    ],
    "label": [
      "__label__Negative",
      "__label__Positive"
    ]
  },
  {
    "prob": [
      0.6642078757286072,
      0.33581212162971497
    ],
    "label": [
      "__label__Negative",
      "__label__Positive"
    ]
  },
  {
    "prob": [
      0.9880969524383545,
      0.01192310731858015
    ],
    "label": [
      "__label__Positive",
      "__label__Negative"
    ]
  }
]


### Clean up: delete the endpoint

In [23]:
# Delete the endpoint created by deploy() now that the predictions are done.
endpoint_name = text_classifier.endpoint
sess.delete_endpoint(endpoint_name)