In [1]:
import pandas as pd 
import numpy as np

import re

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
stemmer = PorterStemmer()
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
!pip install sagemaker==1.72.0

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/mxnet_p36/bin/python -m pip install --upgrade pip' command.[0m


## Load data

In [3]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [4]:
def clean_text(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'ve", "", string)
    string = re.sub(r"n\'t", "", string)
    string = re.sub(r"\'re", "", string)
    string = re.sub(r"\'d", "", string)
    string = re.sub(r"\'ll", "", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", "", string)
    string = re.sub(r"'", "", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"[0-9]\w+|[0-9]","", string)
    string = re.sub(r"\s{2,}", " ", string)
    
    rm_links = re.sub(r"http\S+", " ", string)
    rm_pn = re.sub(r'[^\w\s]+', ' ', rm_links)
    rm_sc = re.sub('\?|\.|\!|\/|\;|\:', ' ', rm_pn)
    tk = word_tokenize(rm_pn)
    clean_text = [word.lower() for word in tk if word.lower() not in stop_words]
    reformed_text = ' '.join(clean_text)
    return reformed_text

def review_to_words(review):
    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    text = clean_text(text)
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = ' '.join([PorterStemmer().stem(w) for w in words]) # stem
    
    return words

In [5]:
train_data["clean_text"] = train_data["text"].apply(lambda row:review_to_words(row))
test_data["clean_text"] = test_data["text"].apply(lambda row:review_to_words(row))

## use TfidfTransformer to generate feature for test and train set

In [6]:
pipe = Pipeline([('count', CountVectorizer()),
                 ('tfid', TfidfTransformer())])

## split data into train and evaluation set
train, evaluation = train_test_split(train_data, test_size=0.2)

train_feature = pipe.fit_transform(train["clean_text"]).toarray()
test_feature = pipe.transform(test_data["clean_text"] ).toarray()
evaluation_feature = pipe.transform(evaluation["clean_text"]).toarray()


print(train_feature.shape)
print(test_feature.shape)

(6090, 13643)
(3263, 13643)


## Save data to csv file and upload

In [7]:
import sagemaker
from sagemaker import get_execution_role

session = sagemaker.Session() # Store the current SageMaker session
role = get_execution_role()


In [8]:
import os

data_dir = '../data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

## save data to csv file
train_df = pd.concat([train["target"],pd.DataFrame(train_feature)], axis=1)
train_df.to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
evaluation_df = pd.concat([evaluation["target"],pd.DataFrame(evaluation_feature)], axis=1)
evaluation_df.columns = train_df.columns
evaluation_df.to_csv(os.path.join(data_dir, 'evaluation.csv'), header=False, index=False)

pd.DataFrame(test_feature).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

# upload to s3
prefix = 'predict-disaster'
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
evaluation_location = session.upload_data(os.path.join(data_dir, 'evaluation.csv'), key_prefix=prefix)
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)

## Train the model with XGBoost

In [9]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(session.boto_region_name, 'xgboost', '1.0-1')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [10]:
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)


xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [11]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:f1', # The metric used to compare trained models.
                                               objective_type = 'Maximize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 6, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

In [12]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=evaluation_location, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [13]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})
xgb_hyperparameter_tuner.wait()

...........................................................................................................................................!


## Retrain model with best parameter and predict data on test set

In [15]:
#xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())
#xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)


xgb.set_hyperparameters(max_depth=3,
                        eta=0.4,
                        gamma=6.5,
                        min_child_weight=6,
                        subsample=0.78,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

train_feature = pipe.fit_transform(train_data["clean_text"]).toarray()
train_df = pd.concat([train_data["target"],pd.DataFrame(train_feature)], axis=1)
train_df.to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
xgb.fit({'train': s3_input_train})

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-08-01 15:39:04 Starting - Starting the training job...
2021-08-01 15:39:08 Starting - Launching requested ML instances......
2021-08-01 15:40:34 Starting - Preparing the instances for training............
2021-08-01 15:42:08 Downloading - Downloading input data......
2021-08-01 15:43:23 Training - Downloading the training image...
2021-08-01 15:43:45 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[15:43:54] 7613x15774 matrix with 12008

In [16]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


................................[34m[2021-08-01:15:52:23:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-01:15:52:23:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-01:15:52:23:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

 

UnexpectedStatusException: Error for Transform job sagemaker-xgboost-2021-08-01-15-47-13-554: Failed. Reason: ClientError: See job logs for more information

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

prediction = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
prediction = [round(num) for num in prediction.squeeze().values]

predict = pd.DataFrame(prediction)
predict["id"] = test_data["id"].values
predict["target"] = prediction
predict[["id","target"]].to_csv("predict_boostrap_hyptertuning.csv", index=False)

## Evaluation

The result is submitted to Kaggle, the score is 0.782

## Manually Test

In [21]:
manual_df = pd.read_csv("manual_test.csv",header=None)
manual_df.columns = ["text"]
manual_df.head()

Unnamed: 0,text
0,The weather is nice
1,The storm is coming
2,There is no storm


In [24]:
manual_df["clean_text"] = manual_df["text"].apply(lambda row:review_to_words(row))
manual_test_feature = pipe.transform(manual_df["clean_text"] ).toarray()
pd.DataFrame(manual_test_feature).to_csv(os.path.join(data_dir, 'manul_test.csv'), header=False, index=False)
manul_test_location = session.upload_data(os.path.join(data_dir, 'manul_test.csv'), key_prefix=prefix)

In [26]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(manul_test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: sagemaker-xgboost-2021-08-01-15-39-04-500


..................................[34m[2021-08-01:16:06:38:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-01:16:06:38:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-01:16:06:38:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[35m[2021-08-01:16:06:38:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2021-08-01:16:06:38:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2021-08-01:16:06:38:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;
[0m
[35mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/guni

In [27]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

prediction = pd.read_csv(os.path.join(data_dir, 'manul_test.csv.out'), header=None)
prediction = [round(num) for num in prediction.squeeze().values]

print(prediction)

download: s3://sagemaker-us-east-1-099005516989/sagemaker-xgboost-2021-08-01-16-01-11-021/manul_test.csv.out to ../data/manul_test.csv.out
[0, 1, 1]
