# Data Processing Notebook

# Import Libraries and Data

In [1]:
# import libraries
import pandas as pd
from transformers import RobertaTokenizer

In [2]:
# Load the customer reviews CSV; index_col = 0 uses the file's first column as the row index
df = pd.read_csv('data/Womens Clothing E-Commerce Reviews.csv', index_col = 0)

In [3]:
# Inspect the column names of the raw dataframe
df.columns

Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

## Sample Dataset

In [4]:
# Randomly sample up to 1000 rows from the dataframe.
# A fixed random_state makes the draw reproducible after a kernel restart, and
# min(...) keeps the cell from raising when the frame has fewer than 1000 rows.
sample_df = df.sample(n = min(1000, len(df)), random_state = 42)
# Remove sampled rows that have a missing value in any column
# (so the result can hold fewer rows than were sampled)
sample_df = sample_df.dropna()

In [5]:
# Renumber the sampled rows from 0; drop = True discards the old index
# instead of keeping it as a column
sample_df = sample_df.reset_index(drop = True)

In [6]:
# Inspect the column names of the sampled dataframe
sample_df.columns

Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

## Process the sampled data

In [7]:
# Pre-trained checkpoint name and the tokenizer loaded for it
PRE_TRAINED_MODEL = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL, do_lower_case = True)

# Every review is padded or truncated to this many tokens
max_seq_length = 128

# Raw review texts to encode for the RoBERTa tokenizer
reviews = sample_df['Review Text'].values

# Encoding options shared by every review
encode_options = dict(
    add_special_tokens = True,
    max_length = max_seq_length,
    return_token_type_ids = False,
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt',
    truncation = True
)

# One entry per review, in the same order as `reviews`
input_ids = []
attention_mask = []

for review in reviews:
    # After the loop, `encode_plus` still holds the encoding of the last review
    encode_plus = tokenizer.encode_plus(review, **encode_options)
    input_ids.append(encode_plus['input_ids'])
    attention_mask.append(encode_plus['attention_mask'])

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:
# Show the encoding of the last review: a dictionary holding its input ids and its attention mask
encode_plus

{'input_ids': tensor([[    0,   133,  1468,    16,   372, 12846,   300,   106,    11,   258,
          8089,     8,   439,    19,    10,  6764,  1836,    13,    65,     4,
           961, 33391,   106,   328,     2,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [9]:
# Flatten each review's input-id tensor into a plain Python list,
# keeping the reviews in their original order
input_ids_df = [encoded_ids.flatten().tolist() for encoded_ids in input_ids]

In [10]:
# Store the flattened input ids as a new column, then preview the first row
sample_df['input_ids'] = input_ids_df
sample_df.head(1)

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,input_ids
0,1033,45,Love them,"Sits slightly below waist, i bought size 30p, ...",5,1,1,General Petite,Bottoms,Jeans,"[0, 104, 2629, 2829, 874, 13977, 6, 939, 2162,..."


In [11]:
# Convert ratings into sentiment
def convert_to_sentiment(rating):
    """
    Map a numerical rating onto a sentiment label

    Input:
        rating: A numeric rating between 1 and 5
    Output:
        int: -1 (negative) for ratings 1 and 2, 0 (neutral) for rating 3,
             1 (positive) for ratings 4 and 5; None for any other value
    """
    # Lookup table from rating to sentiment; ratings not listed yield None
    sentiment_by_rating = {1: -1, 2: -1, 3: 0, 4: 1, 5: 1}
    return sentiment_by_rating.get(rating)
    
# Derive the sentiment label of every review from its rating
sample_df['sentiment'] = sample_df['Rating'].apply(convert_to_sentiment)

In [12]:
# convert sentiments to label ids
def convert_sentiment_labelid(sentiment):
    """
    Map a sentiment label onto its integer label ID

    Input:
        sentiment: Sentiment label (-1 for negative, 0 for neutral, 1 for positive)
    Output:
        int: Integer ID (0 for negative, 1 for neutral, 2 for positive);
             None for any other value
    """
    # Lookup table from sentiment to label ID; sentiments not listed yield None
    label_id_by_sentiment = {-1: 0, 0: 1, 1: 2}
    return label_id_by_sentiment.get(sentiment)
    
# Derive the integer label ID of every review from its sentiment
sample_df['label_ids'] = sample_df['sentiment'].apply(convert_sentiment_labelid)

In [13]:
# Preview the first row, now including the 'sentiment' and 'label_ids' columns
sample_df.head(1)

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,input_ids,sentiment,label_ids
0,1033,45,Love them,"Sits slightly below waist, i bought size 30p, ...",5,1,1,General Petite,Bottoms,Jeans,"[0, 104, 2629, 2829, 874, 13977, 6, 939, 2162,...",1,2


In [14]:
# Move the current row index into a regular column (named 'index')
sample_df = sample_df.reset_index()

In [15]:
# Preview the first row, now including the 'index' column
sample_df.head(1)

Unnamed: 0,index,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,input_ids,sentiment,label_ids
0,0,1033,45,Love them,"Sits slightly below waist, i bought size 30p, ...",5,1,1,General Petite,Bottoms,Jeans,"[0, 104, 2629, 2829, 874, 13977, 6, 939, 2162,...",1,2


In [16]:
# Give the former index and the review text their final column names
column_renames = {'index': 'review_id', 'Review Text': 'review_body'}
sample_df = sample_df.rename(columns = column_renames)

In [17]:
# Keep only the listed columns, then renumber the rows from 0
kept_columns = ['review_id', 'sentiment', 'label_ids', 'input_ids', 'review_body']
sample_df = sample_df[kept_columns].reset_index(drop = True)

In [18]:
# Preview the first row of the reduced dataframe
sample_df.head(1)

Unnamed: 0,review_id,sentiment,label_ids,input_ids,review_body
0,0,1,2,"[0, 104, 2629, 2829, 874, 13977, 6, 939, 2162,...","Sits slightly below waist, i bought size 30p, ..."


In [19]:
# Balance the dataset by down-sampling every sentiment class to the size of
# the smallest class
df_sample_unbalanced = sample_df.groupby('sentiment')
# The target size is identical for every group, so compute it once
smallest_class_size = int(df_sample_unbalanced.size().min())
# GroupBy.sample draws the rows per group and keeps every column, including
# 'sentiment', which the stratified split needs later. It avoids groupby.apply,
# which returns a (sentiment, row) MultiIndex and whose handling of the grouping
# column is deprecated in newer pandas releases. The fixed random_state makes
# the balanced sample reproducible.
df_sample_balanced = (
    df_sample_unbalanced
    .sample(n = smallest_class_size, random_state = 42)
    .reset_index(drop = True)
)
df_sample = df_sample_balanced

In [20]:
# Add date feature
import time
from datetime import datetime, timezone

# The trailing 'Z' in the format marks the value as UTC, so read the current
# time in UTC rather than in the machine's local timezone
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# add the timestamp to the sampled dataframe as a new column named 'date'
df_sample['date'] = timestamp

In [21]:
# Split data into train, test, validation
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows, stratified by sentiment; the fixed random_state
# makes the split reproducible after a kernel restart
df_train, df_holdout = train_test_split(df_sample, test_size = 0.1, stratify = df_sample['sentiment'], random_state = 42)
# Split the holdout in half (again stratified) into validation and test sets
df_validation, df_test = train_test_split(df_holdout, test_size = 0.5, stratify = df_holdout['sentiment'], random_state = 42)

In [22]:
# Renumber the rows of every split from 0, discarding the previous index
df_train, df_validation, df_test = (
    split.reset_index(drop = True)
    for split in (df_train, df_validation, df_test)
)

In [26]:
# Inspect the column dtypes of the training split
df_train.dtypes

review_id       int64
sentiment       int64
label_ids       int64
input_ids      object
review_body    object
date           object
dtype: object

In [27]:
# Convert object types to string
def object_to_string(dataframe):
    """
    Convert every object-typed column of a pandas dataframe to the string data type

    The dataframe is modified in place and the same object is returned.

    Input:
        dataframe: the pandas dataframe we are working with
    Output:
        dataframe: the same pandas dataframe, with its object columns now string typed
    """
    # Pick out the object-typed columns first; all other columns are left untouched
    object_columns = [name for name, dtype in dataframe.dtypes.items() if dtype == 'object']
    for name in object_columns:
        # Go through 'str' first so that non-string values (e.g. lists) are
        # rendered as text before the cast to the 'string' dtype
        as_text = dataframe[name].astype('str')
        dataframe[name] = as_text.astype('string')
    return dataframe

# Convert the object columns of every split to the string dtype
df_train, df_validation, df_test = (
    object_to_string(split)
    for split in (df_train, df_validation, df_test)
)

In [28]:
# Inspect the column dtypes of the training split after the conversion
df_train.dtypes

review_id       int64
sentiment       int64
label_ids       int64
input_ids      string
review_body    string
date           string
dtype: object

# SageMaker Processing Job

In [39]:
import sagemaker
import boto3

# SageMaker session used by the cells below, and its default S3 bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Region name of the default Boto3 session
region = boto3.Session().region_name

# IAM execution role as reported by the SageMaker SDK
role = sagemaker.get_execution_role()

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [46]:
# Instance type and instance count for the SageMaker processing job
processing_instance_type = 'ml.c5.xlarge'
processing_instance_count = 1

# Fractions of the data used for the training, validation and test sets
train_split_percentage, validation_split_percentage, test_split_percentage = 0.90, 0.05, 0.05

# Flag telling the processing script to balance the dataset
balance_dataset = True

# Maximum sequence length
max_seq_length = 128

In [47]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Maximum amount of time to allow for the processing job, in seconds
max_runtime_seconds = 7200

# Processor configured with the role, instance settings and region set above
processor = SKLearnProcessor(
    framework_version = '0.23-1',
    role = role,
    instance_type = processing_instance_type,
    instance_count = processing_instance_count,
    env = {'AWS_DEFAULT_REGION': region},
    max_runtime_in_seconds = max_runtime_seconds
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


The code below runs a SageMaker processing job that executes the 'prepare-data.py' script with the specified input data, output data, and arguments.

The input data is stored in the default s3 bucket that we initialized earlier.

The output data is stored in the 'sentiment-train', 'sentiment-validation' and 'sentiment-test' folders, respectively.

In [48]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# S3 prefix in the default bucket that holds the raw CSV input
raw_input_data_s3_uri = 's3://{}/data/csv/'.format(bucket)

# Raw data as made available to the job inside the processing container
raw_data_input = ProcessingInput(source = raw_input_data_s3_uri,
                                 destination = '/opt/ml/processing/input/data/',
                                 s3_data_distribution_type = 'ShardedByS3Key')

# (output name, container path) for each of the three data splits
output_locations = [
    ('sentiment-train', '/opt/ml/processing/output/sentiment/train'),
    ('sentiment-validation', '/opt/ml/processing/output/sentiment/validation'),
    ('sentiment-test', '/opt/ml/processing/output/sentiment/test'),
]
split_outputs = [
    ProcessingOutput(output_name = name, source = container_path, s3_upload_mode = 'EndOfJob')
    for name, container_path in output_locations
]

# Command-line arguments handed to the script; every value is passed as a string
job_arguments = [
    '--train-split-percentage', str(train_split_percentage),
    '--validation-split-percentage', str(validation_split_percentage),
    '--test-split-percentage', str(test_split_percentage),
    '--balance-dataset', str(balance_dataset),
    '--max-seq-length', str(max_seq_length),
]

# NOTE(review): wait = False returns without waiting for the job, so logs = True
# presumably has no effect here -- confirm against the SageMaker SDK docs
processor.run(code = 'src/prepare-data.py',
              inputs = [raw_data_input],
              outputs = split_outputs,
              arguments = job_arguments,
              logs = True,
              wait = False)

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2023-01-06-17-33-49-347



Job Name:  sagemaker-scikit-learn-2023-01-06-17-33-49-347
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-003294323742/data/csv/', 'LocalPath': '/opt/ml/processing/input/data/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/input/code/prepare-data.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'sentiment-train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-train', 'LocalPath': '/opt/ml/processing/output/sentiment/train', 'S3UploadMode': 'EndOfJob'}}, {'Out

In [49]:
# Describe the latest processing job and read its name
latest_job_description = processor.jobs[-1].describe()
scikit_processing_job_name = latest_job_description['ProcessingJobName']

print('Processing job name: {}'.format(scikit_processing_job_name))

Processing job name: sagemaker-scikit-learn-2023-01-06-17-33-49-347


In [50]:
# Describe the latest processing job and read its current status
latest_job_description = processor.jobs[-1].describe()
scikit_processing_job_status = latest_job_description['ProcessingJobStatus']
print('Processing job status: {}'.format(scikit_processing_job_status))

Processing job status: InProgress


In [51]:
%%time

# Re-attach to the processing job by its name, reusing the existing SageMaker session
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    processing_job_name=scikit_processing_job_name,
    sagemaker_session=sagemaker_session
)

# Wait for the job; logs=False is passed, and the recorded output shows only progress dots
running_processor.wait(logs=False)

...........................................................................!CPU times: user 326 ms, sys: 30.3 ms, total: 356 ms
Wall time: 6min 16s


In [52]:
processing_job_description = running_processor.describe()

# Map the name of every job output to the S3 URI it was uploaded to
output_config = processing_job_description['ProcessingOutputConfig']
s3_uri_by_output_name = {
    output['OutputName']: output['S3Output']['S3Uri']
    for output in output_config['Outputs']
}

# Look each output up by name, so a missing output fails right here with a
# KeyError naming it, instead of leaving a variable undefined until it is printed
processed_train_data_s3_uri = s3_uri_by_output_name['sentiment-train']
processed_validation_data_s3_uri = s3_uri_by_output_name['sentiment-validation']
processed_test_data_s3_uri = s3_uri_by_output_name['sentiment-test']

print(processed_train_data_s3_uri)
print(processed_validation_data_s3_uri)
print(processed_test_data_s3_uri)

s3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-train
s3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-validation
s3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-test


In [53]:
!aws s3 ls $processed_train_data_s3_uri/

2023-01-06 17:39:59    4895996 train.csv


In [54]:
!aws s3 ls $processed_validation_data_s3_uri/

2023-01-06 17:40:00     278047 validation.csv


In [55]:
!aws s3 ls $processed_test_data_s3_uri/


2023-01-06 17:40:00     274177 test.csv


In [56]:
!aws s3 cp $processed_train_data_s3_uri/train.csv ./balanced/sentiment-train/
!aws s3 cp $processed_validation_data_s3_uri/validation.csv ./balanced/sentiment-validation/
!aws s3 cp $processed_test_data_s3_uri/test.csv ./balanced/sentiment-test/

download: s3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-train/train.csv to balanced/sentiment-train/train.csv
download: s3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-validation/validation.csv to balanced/sentiment-validation/validation.csv
download: s3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-test/test.csv to balanced/sentiment-test/test.csv
