In [49]:
import os
import sagemaker
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_header, strip_newsgroup_quoting, strip_newsgroup_footer

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')['data']
newsgroups_test = fetch_20newsgroups(subset='test')['data']

NUM_TOPICS = 30
NUM_NEIGHBORS = 10
BUCKET = 'sagemaker-xl'
PREFIX = '20newsgroups'

In [3]:
# preprocss the data
for i in range(len(newsgroups_train)):
    newsgroups_train[i] = strip_newsgroup_header(newsgroups_train[i])
    newsgroups_train[i] = strip_newsgroup_quoting(newsgroups_train[i])
    newsgroups_train[i] = strip_newsgroup_footer(newsgroups_train[i])

In [4]:
newsgroups_train[1]

"A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks."

In [5]:
!pip install nltk

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [6]:
import nltk
import re

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
token_pattern = re.compile(r"(?u)\b\w\w+\b")

In [8]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if len(t) >= 2 and re.match("[a-z].*", t)
                and re.match(token_pattern, t)]

In [9]:
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
vocab_size = 2000

print('Tokenizing and counting, this may take a few minutes...')

start_time = time.time()
vectorizer = CountVectorizer(input='content', analyzer='word', stop_words='english',
                             tokenizer=LemmaTokenizer(), max_features=vocab_size, max_df=0.95, min_df=2)

vectors = vectorizer.fit_transform(newsgroups_train)
vocab_list = vectorizer.get_feature_names()

print('vocab size:', len(vocab_list))

Tokenizing and counting, this may take a few minutes...


  'stop_words.' % sorted(inconsistent))


vocab size: 2000


In [11]:
# random shuffle
idx = np.arange(vectors.shape[0])
newidx = np.random.permutation(idx)  # the label fed into the KNN model

# store the permutations
vectors = vectors[newidx]

print('Done. Time elapsed: {:.2f}s'.format(time.time() - start_time))

Done. Time elapsed: 99.45s


In [12]:
# convert vectors to a sparse representation
import scipy.sparse as sparse

vectors = sparse.csr_matrix(vectors, dtype=np.float32)

print(type(vectors), vectors.dtype)

<class 'scipy.sparse.csr.csr_matrix'> float32


In [13]:
n_train = int(0.8*vectors.shape[0])

train_vectors = vectors[:n_train, :]
val_vectors = vectors[n_train:, :]

print(train_vectors.shape,val_vectors.shape)

(9051, 2000) (2263, 2000)


In [14]:
from sagemaker import get_execution_role

role = get_execution_role()

bucket = BUCKET
prefix = PREFIX

train_prefix = os.path.join(prefix, 'train')
val_prefix = os.path.join(prefix, 'val')
output_prefix = os.path.join(prefix, 'output')

In [15]:
s3_train_data = os.path.join('s3://', bucket, train_prefix)
s3_val_data = os.path.join('s3://', bucket, val_prefix)
output_path = os.path.join('s3://', bucket, output_prefix)

print('Training set location', s3_train_data)
print('Validation set location', s3_val_data)
print('Trained model will be saved at', output_path)

Training set location s3://sagemaker-xl/20newsgroups/train
Validation set location s3://sagemaker-xl/20newsgroups/val
Trained model will be saved at s3://sagemaker-xl/20newsgroups/output


In [19]:
# convert raw vectors into RecordIO format
# and use the n_parts parameter
def split_convert_upload(sparray, bucket, prefix, fname_template='data_part{}.pbr', n_parts=2):
    import io
    import boto3
    import sagemaker.amazon.common as smac
    
    chunk_size = sparray.shape[0] // n_parts
    
    for i in range(n_parts):
        # calculate start and end indices
        start = i * chunk_size
        end = (i + 1) * chunk_size
        
        if i + 1 == n_parts:
            end = sparray.shape[0]
        
        # convert to record protobuf
        buf = io.BytesIO()
        smac.write_spmatrix_to_sparse_tensor(array=sparray[start:end], file=buf, labels=None)
        buf.seek(0)
        
        # upload to amazon s3
        fname = os.path.join(prefix, fname_template.format(i))
        boto3.resource('s3').Bucket(bucket).Object(fname).upload_fileobj(buf)
        
        print('Uploaded data to s3://{}'.format(os.path.join(bucket, fname)))

split_convert_upload(train_vectors, bucket=bucket, prefix=train_prefix, fname_template='train_part{}.pbr', n_parts=8)
split_convert_upload(val_vectors, bucket=bucket, prefix=val_prefix, fname_template='val_part{}.pbr', n_parts=1)

Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part0.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part1.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part2.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part3.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part4.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part5.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part6.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/train/train_part7.pbr
Uploaded data to s3://sagemaker-xl/20newsgroups/val/val_part0.pbr


In [20]:
import boto3

from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(boto3.Session().region_name, 'ntm')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [22]:
sess = sagemaker.Session()
ntm = sagemaker.estimator.Estimator(container, role, train_instance_count=2,
                                   train_instance_type='ml.c4.xlarge',
                                   output_path=output_path, sagemaker_session=sess)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [23]:
ntm.set_hyperparameters(num_topics=NUM_TOPICS, feature_dim=vocab_size, mini_batch_size=128, 
                        epochs=100, num_patience_epochs=5, tolerance=0.001)

In [25]:
from sagemaker.session import s3_input

s3_train = s3_input(s3_train_data, distribution='ShardedByS3Key')
ntm.fit({'train': s3_train, 'test': s3_val_data})

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2021-02-15 06:42:25 Starting - Starting the training job...
2021-02-15 06:42:47 Starting - Launching requested ML instancesProfilerReport-1613371344: InProgress
......
2021-02-15 06:43:53 Starting - Preparing the instances for training.........
2021-02-15 06:45:19 Downloading - Downloading input data...
2021-02-15 06:45:50 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[02/15/2021 06:45:47 INFO 140391001810112] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'num_patience_epochs': u'3', u'clip_gradient': u'Inf', u'encoder_layers': u'auto', u'optimizer': u'adadelta', u'_kvstore': u'auto_gpu', u'rescale_gradient': u'1.0', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'learning_rate': u'0.01', u'_data_format': u'record', u'sub_sample': u'

In [36]:
## deploy the topic model
## option A: deploy the model with SageMaker hosting services
ntm_predictor = ntm.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-------------!

In [38]:
from sagemaker.predictor import csv_serializer, json_deserializer

# ntm_predictor.content_type = 'text/csv'
ntm_predictor.serializer = csv_serializer
ntm_predictor.deserializer = json_deserializer

In [40]:
# extract the topic vectors for the training data which will be used in KNN
predictions = []
for item in np.array(vectors.todense()):
    np.shape(item)
    results = ntm_predictor.predict(item)
    predictions.append(np.array([prediction['topic_weights'] for prediction in results['predictions']]))
    
predictions = np.array([np.ndarray.flatten(x) for x in predictions])
topicvec = train_labels[newidx]
topicnames = [categories[x] for x in topicvec]

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_seri

KeyboardInterrupt: 

In [31]:
## option B: deploy with batch transform
np.savetxt('trainvectors.csv',
           vectors.todense(),
           delimiter=',',
           fmt='%i')
batch_prefix = '20newsgroups/batch'

train_s3 = sess.upload_data('trainvectors.csv', 
                            bucket=bucket, 
                            key_prefix='{}/train'.format(batch_prefix))

print(train_s3)

batch_output_path = 's3://{}/{}/test'.format(bucket, batch_prefix)

ntm_transformer = ntm.transformer(instance_count=1,
                                  instance_type ='ml.m4.xlarge',
                                  output_path=batch_output_path
                                 )
ntm_transformer.transform(train_s3, content_type='text/csv', split_type='Line')
ntm_transformer.wait()

s3://sagemaker-xl/20newsgroups/batch/train/trainvectors.csv
..................................[32m2021-02-15T07:50:27.554:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[02/15/2021 07:50:27 INFO 140559675376832] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[02/15/2021 07:50:27 INFO 140559675376832] loading entry points[0m
[35mDocker entrypoint called with argument(s): serve[0m
[35mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[35m[02/15/2021 07:50:27 INFO 140559675376832] loaded entry point class algorithm.serve.server_config:config_api[0m
[35m[02/15/2021 07:50:27 INFO 140559675376832] loading entry points[0m
[34m[02/15/2021 07:50:27 INFO 140559675376832] loaded request iterator text/csv[0

In [32]:
# once the transform is done, download the outputs back to local notebook instance for inspection
!aws s3 cp --recursive $ntm_transformer.output_path ./
!head -c 5000 trainvectors.csv.out

download: s3://sagemaker-xl/20newsgroups/batch/test/trainvectors.csv.out to ./trainvectors.csv.out
{"topic_weights":[0.0176811125,0.0177120008,0.0198236313,0.0155713204,0.0370621458,0.019373972,0.0329566523,0.0248791873,0.0209578648,0.0368159488,0.0210807733,0.0163670015,0.0150072491,0.305621922,0.0184352715,0.0254585743,0.0254659783,0.0621491335,0.0402800441,0.0228296835,0.0182546265,0.0164231956,0.0294983257,0.0167901646,0.0195879862,0.0253007896,0.0249296222,0.0119190179,0.022343887,0.0194228888]}
{"topic_weights":[0.0217818506,0.0221054517,0.0209346376,0.0147795798,0.0288178343,0.0195078794,0.043546889,0.0219893213,0.0236818846,0.0440922901,0.0279528257,0.1586860567,0.0159528498,0.0142511055,0.0281842519,0.0314711705,0.0351098254,0.047913488,0.0610321723,0.0294166282,0.0194408093,0.0185179617,0.0251341276,0.0185227469,0.0201560762,0.0327243693,0.036396794,0.067391634,0.0287253447,0.0217822287]}
{"topic_weights":[0.0296324547,0.0311008636,0.0316695571,0.0197660085,0.0358408168,0.026

In [33]:
from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=5000)
tsne_results = tsne.fit_transform(predictions)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

tsnedf = pd.DataFrame()
tsnedf['tsne-2d-one'] = tsne_results[:,0]
tsnedf['tsne-2d-two'] = tsne_results[:,1]
tsnedf['Topic'] = topicnames

plt.figure(figsize=(25,25))

sns.lmplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue='Topic',
    palette=sns.color_palette("hls", NUM_TOPICS),
    data=tsnedf,
    legend="full",
    fit_reg=False
)

plt.axis('Off')
plt.show()

NameError: name 'predictions' is not defined

In [34]:
labels = newidx 
labeldict = dict(zip(newidx,idx))

In [41]:
import io
import sagemaker.amazon.common as smac


print('train_features shape = ', predictions.shape)
print('train_labels shape = ', labels.shape)
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, predictions, labels)
buf.seek(0)

bucket = BUCKET
prefix = PREFIX
key = 'knn/train'
fname = os.path.join(prefix, key)
print(fname)
boto3.resource('s3').Bucket(bucket).Object(fname).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

AttributeError: 'list' object has no attribute 'shape'

In [42]:
def trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path, s3_test_data=None):
    """
    Create an Estimator from the given hyperparams, fit to training data, 
    and return a deployed predictor
    
    """
    # set up the estimator
    knn = sagemaker.estimator.Estimator(get_image_uri(boto3.Session().region_name, "knn"),
        get_execution_role(),
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        output_path=output_path,
        sagemaker_session=sagemaker.Session())
    knn.set_hyperparameters(**hyperparams)
    
    # train a model. fit_input contains the locations of the train and test data
    fit_input = {'train': s3_train_data}
    knn.fit(fit_input)
    return knn

hyperparams = {
    'feature_dim': predictions.shape[1],
    'k': NUM_NEIGHBORS,
    'sample_size': predictions.shape[0],
    'predictor_type': 'classifier' ,
    'index_metric':'COSINE'
}
output_path = 's3://' + bucket + '/' + prefix + '/knn/output'
knn_estimator = trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path)


AttributeError: 'list' object has no attribute 'shape'

In [43]:
def predictor_from_estimator(knn_estimator, estimator_name, instance_type, endpoint_name=None): 
    knn_predictor = knn_estimator.deploy(initial_instance_count=1, instance_type=instance_type,
                                        endpoint_name=endpoint_name,
                                        accept="application/jsonlines; verbose=true")
    knn_predictor.content_type = 'text/csv'
    knn_predictor.serializer = csv_serializer
    knn_predictor.deserializer = json_deserializer
    
    return knn_predictor

import time

instance_type = 'ml.m4.xlarge'
model_name = 'knn_%s'% instance_type
endpoint_name = 'knn-ml-m4-xlarge-%s'% (str(time.time()).replace('.','-'))

print('setting up the endpoint..')

knn_predictor = predictor_from_estimator(knn_estimator, model_name, instance_type, endpoint_name=endpoint_name)

setting up the endpoint..


NameError: name 'knn_estimator' is not defined

In [50]:
def preprocess_input(text):
    text = strip_newsgroup_header(text)
    text = strip_newsgroup_quoting(text)
    text = strip_newsgroup_footer(text)
    return text    
    
test_data_prep = []
for i in range(len(newsgroups_test)):
    test_data_prep.append(preprocess_input(newsgroups_test[i]))
test_vectors = vectorizer.fit_transform(test_data_prep)

test_vectors = np.array(test_vectors.todense())
test_topics = []
for vec in test_vectors:
    test_result = ntm_predictor.predict(vec)
    test_topics.append(test_result['predictions'][0]['topic_weights'])

topic_predictions = []
for topic in test_topics:
    result = knn_predictor.predict(topic)
    cur_predictions = np.array([int(result['labels'][i]) for i in range(len(result['labels']))])
    topic_predictions.append(cur_predictions[::-1][:10])       

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint ntm-2021-02-15-07-55-45-667 of account 900019131056 not found.

In [51]:
# set your own k.
def plot_topic_distribution(topic_num, k = 5):
    
    closest_topics = [predictions[labeldict[x]] for x in topic_predictions[topic_num][:k]]
    closest_topics.append(np.array(test_topics[topic_num]))
    closest_topics = np.array(closest_topics)
    df = pd.DataFrame(closest_topics.T)
    df.rename(columns ={k:"Test Document Distribution"}, inplace=True)
    fs = 12
    df.plot(kind='bar', figsize=(16,4), fontsize=fs)
    plt.ylabel('Topic assignment', fontsize=fs+2)
    plt.xlabel('Topic ID', fontsize=fs+2)
    plt.show()

In [52]:
plot_topic_distribution(18)

IndexError: list index out of range

In [53]:
plot_topic_distribution(25)
# plot_topic_distribution(5000)

IndexError: list index out of range