In [2]:
!pip install --upgrade pip
!pip install "sagemaker>=2.48.0" "transformers" "datasets[s3]" "tensorflow" "torch" --upgrade

[0m

In [3]:
import sagemaker
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

ModuleNotFoundError: No module named 'datasets'

In [3]:
# Pretrained checkpoint shared by the tokenizer and the classification model.
checkpoint = "bert-base-uncased"

# Tokenizer, plus a collator built on it that returns TensorFlow tensors.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Sequence-classification model loaded from the same checkpoint. Its
# classification head is newly initialized, so it must be fine-tuned before
# being used for predictions.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Load the raw reviews and derive a binary sentiment label from the star
# rating: 1-2 stars -> 0, 3-5 stars -> 1.
#
# Rows without review text are dropped. Rows whose rating does not map to a
# label (missing or outside 1-5) are dropped as well, so the label column holds
# no NaN and stays an integer column instead of being promoted to float.
# The index is reset so Dataset.from_pandas does not carry the gapped index
# along as an extra `__index_level_0__` column.
df = (
    pd.read_csv('s3://sagemaker-studio-3pwpe105sq2/review.csv')
    .dropna(axis=0, subset=['reviewText'])
    .assign(sentiment=lambda reviews: reviews['overall'].map({1: 0, 2: 0, 3: 1, 4: 1, 5: 1}))
    .dropna(axis=0, subset=['sentiment'])
    .astype({'sentiment': 'int64'})
    .loc[:, ['sentiment', 'reviewText']]
    .reset_index(drop=True)
)

In [5]:
# Wrap the frame in a Hugging Face Dataset and hold out 10% of the rows as the
# "test" split. The seed is fixed so the same rows land in the same split on
# every run (Restart & Run All reproduces the result).
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [6]:
def tokenization(review):
    """Tokenize the ``reviewText`` entry of a (batch of) review(s).

    Args:
        review: Mapping with a ``"reviewText"`` entry holding the text(s) to
            encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on that text with
        ``truncation=True``.
    """
    texts = review["reviewText"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [7]:
# Run the tokenizer over every split, passing examples to it in batches.
tok_data = dataset.map(function=tokenization, batched=True)

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [8]:
# Turn the tokenized training split into a TensorFlow dataset in batches of 32,
# using the padding collator to assemble each batch.
# NOTE(review): shuffle=False keeps the batch order fixed across epochs --
# confirm this is intended for training data.
tf_train_data = tok_data["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["sentiment"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# IAM role the training job runs under.
role = sagemaker.get_execution_role()

# run_glue.py (v4.17.0) only accepts custom data when BOTH a train and a
# validation file are given, and it expects a `label` column in each file.
# The raw review.csv has neither, so export the prepared splits to S3 and hand
# them to the job as input channels; SageMaker copies each channel to
# /opt/ml/input/data/<channel>/ inside the training container.
s3_data_prefix = 's3://sagemaker-studio-3pwpe105sq2/sentiment-data'
container_data_dir = '/opt/ml/input/data'
input_channels = {}
for channel_name, split_name in (('train', 'train'), ('validation', 'test')):
    split_df = (
        dataset[split_name]
        .to_pandas()
        .rename(columns={'sentiment': 'label'})
        .dropna(subset=['label', 'reviewText'])
        # integer labels make run_glue.py treat the task as classification
        .astype({'label': 'int64'})
        .loc[:, ['label', 'reviewText']]
    )
    split_df.to_csv(f'{s3_data_prefix}/{channel_name}/{channel_name}.csv', index=False)
    input_channels[channel_name] = f'{s3_data_prefix}/{channel_name}'

hyperparameters = {
    'model_name_or_path': 'bert-base-uncased',
    'output_dir': '/opt/ml/model',
    'do_train': True,
    # container-local paths of the files delivered through the input channels
    'train_file': f'{container_data_dir}/train/train.csv',
    'validation_file': f'{container_data_dir}/validation/validation.csv',
    'num_train_epochs': 5,
    # add your remaining hyperparameters
    # more info here https://github.com/huggingface/transformers/tree/v4.17.0/examples/pytorch/text-classification
}

# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git', 'branch': 'v4.17.0'}

# creates Hugging Face estimator
huggingface_estimator = HuggingFace(
    entry_point='run_glue.py',
    source_dir='./examples/pytorch/text-classification',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    git_config=git_config,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
    hyperparameters=hyperparameters,
)

# starting the train job with the train/validation channels attached
huggingface_estimator.fit(input_channels)