In [1]:
import wandb, os

## Datasets Library
> We will prepare the data for the transformers library

In [2]:
from datasets import load_dataset, Features, ClassLabel, Value

In [3]:
wandb.init(project="aws_demo", job_type="get_data")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m (use `wandb login --relogin` to force relogin)


we can grab the preprocessed dataset direcly from wandb:

In [4]:
dataset_path  = wandb.use_artifact("capecape/aws_demo/splitted_dataset:latest").download()

In [5]:
dataset_path

'./artifacts/splitted_dataset:v0'

In [6]:
split_at = wandb.Artifact("splitted_dataset", type="dataset")

In [7]:
labels = ["negative", "positive"]
stock_features = Features({'Text': Value('string'), 
                           'labels': ClassLabel(names=labels)})

In [8]:
dataset = load_dataset('csv', data_files={"train": os.path.join(dataset_path,"train.csv"), 
                                          "test": os.path.join(dataset_path, "test.csv")}, 
                       delimiter=',', 
                       features=stock_features)

Using custom data configuration default-d8dfb5a2e7af0ec3
Reusing dataset csv (/home/paperspace/.cache/huggingface/datasets/csv/default-d8dfb5a2e7af0ec3/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/2 [00:00<?, ?it/s]

we get a `DatasetDict` object containing our split

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'labels'],
        num_rows: 5212
    })
    test: Dataset({
        features: ['Text', 'labels'],
        num_rows: 579
    })
})

# A Simple bert

In [10]:
import numpy as np
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [11]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length')

def tokenize_function(examples):
    return tokenizer(examples["Text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/paperspace/.cache/huggingface/datasets/csv/default-d8dfb5a2e7af0ec3/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-055f664b758555b4.arrow
Loading cached processed dataset at /home/paperspace/.cache/huggingface/datasets/csv/default-d8dfb5a2e7af0ec3/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-946972fb275e0871.arrow


In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5212
    })
    test: Dataset({
        features: ['Text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 579
    })
})

In [13]:
default_training_args = {
    'per_device_train_batch_size': 32,
    'per_device_eval_batch_size': 32,
    'num_train_epochs': 1,
    'learning_rate': 2e-5,
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'save_total_limit': 2,
    'logging_strategy': 'steps',
    'logging_first_step': True,
    'logging_steps': 5,
    'report_to': 'wandb',
    'fp16':True
}

In [14]:
recall_metric = load_metric("recall")
f1_metric = load_metric('f1')
accuracy_metric = load_metric('accuracy')
precision_metric = load_metric('precision')

In [15]:
from wandb.sdk.integration_utils.data_logging import ValidationDataLogger

validation_inputs = tokenized_datasets['test'].remove_columns(['labels', 'attention_mask', 'input_ids', 'token_type_ids'])
validation_targets = [tokenized_datasets['test'].features['labels'].int2str(x) for x in dataset['test']['labels']]

validation_logger = ValidationDataLogger(inputs = validation_inputs[:],targets = validation_targets)

In [16]:
def compute_metrics(eval_pred):
    "Get a bunch of metrics and log predictions to wandb"
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')['recall']
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')['f1']
    accuracy = accuracy_metric.compute(predictions=predictions,references=labels)['accuracy']
    precision = precision_metric.compute(predictions=predictions,references=labels,average='macro')['precision']
    
    # convert predictions from class (0, 1) to label (Negative, Positive)
    prediction_labels = [tokenized_datasets['test'].features['labels'].int2str(x.item()) for x in predictions]
    
    # log predictions
    validation_logger.log_predictions(prediction_labels)

    return {
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
        'precision': precision
    }

In [17]:
def get_trainer(model, output_dir, tokenizer, data_collator, training_args, train, test):
    "Prepare the hf Trainer"
    training_args = TrainingArguments(
        output_dir=output_dir,
        **training_args
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train,
        eval_dataset=test,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    return trainer

Let's log predictions at the end of each epoch

In [18]:
def train(train_args=default_training_args):
    trainer = get_trainer(
    output_dir=f'training_dir',
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    training_args=train_args,
    train=tokenized_datasets['train'],
    test=tokenized_datasets["test"])
    
    trainer.train()

In [19]:
train()

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text.
***** Running training *****
  Num examples = 5212
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 163
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Recall,F1,Accuracy,Precision
1,0.4569,0.502601,0.733078,0.735254,0.753022,0.738051


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text.
***** Running Evaluation *****
  Num examples = 579
  Batch size = 32
Saving model checkpoint to training_dir/checkpoint-163
Configuration saved in training_dir/checkpoint-163/config.json
Model weights saved in training_dir/checkpoint-163/pytorch_model.bin
tokenizer config file saved in training_dir/checkpoint-163/tokenizer_config.json
Special tokens file saved in training_dir/checkpoint-163/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [20]:
wandb.finish()

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇█████

0,1
eval/accuracy,0.75302
eval/f1,0.73525
eval/loss,0.5026
eval/precision,0.73805
eval/recall,0.73308
eval/runtime,3.3019
eval/samples_per_second,175.353
eval/steps_per_second,5.754
train/epoch,1.0
train/global_step,163.0


## Sweeps

In [24]:
import math

In [32]:
sweep_config = {
    'method': 'bayes'
    }

For `bayes`ian Sweeps,
you also need to tell us a bit about your `metric`.
We need to know its `name`, so we can find it in the model outputs
and we need to know whether your `goal` is to `minimize` it
(e.g. if it's the squared error)
or to `maximize` it
(e.g. if it's the accuracy).

In [33]:
metric = {
    'name': 'eval/loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

Once you've picked a `method` to try out new values of the hyperparameters,
you need to define what those `parameters` are.

Most of the time, this step is straightforward:
you just give the `parameter` a name
and specify a list of legal `values`
of the parameter.

For example, when we choose the `optimizer` for our network,
there's only a finite number of options.
Here we stick with the two most popular choices, `adam` and `sgd`.
Even for hyperparameters that have potentially infinite options,
it usually only makes sense to try out
a few select `values`,
as we do here with the hidden `layer_size` and `dropout`.

In [34]:
parameters_dict = {
    'learning_rate': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0,
        'max': 0.1
      },
    'batch_size': {
        # integers between 32 and 256
        # with evenly-distributed logarithms 
        'distribution': 'q_log_uniform',
        'q': 1,
        'min': math.log(4),
        'max': math.log(32),
      },
    'epochs': {
        "values": [4,6,8,10]
    }
}
sweep_config['parameters'] = parameters_dict

In [35]:
sweep_id = wandb.sweep(sweep_config, project="aws_demo")

Create sweep with ID: txa10kwk
Sweep URL: https://wandb.ai/capecape/aws_demo/sweeps/txa10kwk


In [36]:
sweep_config

{'method': 'bayes',
 'metric': {'name': 'eval/loss', 'goal': 'minimize'},
 'parameters': {'learning_rate': {'distribution': 'uniform',
   'min': 0,
   'max': 0.1},
  'batch_size': {'distribution': 'q_log_uniform',
   'q': 1,
   'min': 1.3862943611198906,
   'max': 3.4657359027997265},
  'epochs': {'values': [4, 6, 8, 10]}}}

In [37]:
def train_sweep(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        
        default_training_args["learning_rate"] = config.learning_rate
        default_training_args['per_device_train_batch_size'] = config.batch_size
        default_training_args['per_device_eval_batch_size'] = config.batch_size
        default_training_args["num_train_epochs"] = config.epochs
        
        train(default_training_args)

In [38]:
wandb.agent(sweep_id, train_sweep, count=5)

[34m[1mwandb[0m: Agent Starting Run: jd81cg58 with config:
[34m[1mwandb[0m: 	batch_size: 14
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.06550276443957684
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


PyTorch: setting up devices
Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text.
***** Running training *****
  Num examples = 5212
  Num Epochs = 8
  Instantaneous batch size per device = 14
  Total train batch size (w. parallel, distributed & accumulation) = 14
  Gradient Accumulation steps = 1
  Total optimization steps = 2984
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Recall,F1,Accuracy,Precision
1,0.0,,0.5,0.275344,0.379965,0.189983


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text.
***** Running Evaluation *****
  Num examples = 579
  Batch size = 14
Saving model checkpoint to training_dir/checkpoint-373
Configuration saved in training_dir/checkpoint-373/config.json
Model weights saved in training_dir/checkpoint-373/pytorch_model.bin
tokenizer config file saved in training_dir/checkpoint-373/tokenizer_config.json
Special tokens file saved in training_dir/checkpoint-373/special_tokens_map.json
Deleting older checkpoint [training_dir/checkpoint-652] due to args.save_total_limit
