In [1]:
import os

# Select which GPUs this process may use. Set this before TensorFlow or
# PyTorch is loaded.
# To use several GPUs, join their IDs with commas,
# e.g. >>> os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,3'
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [2]:
import torch
# Check whether PyTorch can see at least one CUDA-capable GPU
torch.cuda.is_available()

True

In [3]:
# Name of the pretrained checkpoint to fine-tune.
# Available models are listed at https://huggingface.co/models
MODEL_NAME = "distilbert-base-uncased"

In [115]:
import os
from datasets import load_dataset

In [197]:
# Read the training CSV; the loader exposes it under the 'train' split.
dataset = load_dataset(
    "csv",
    data_files=os.path.join("data", "partytrain.csv"),
)
print(dataset)
print(dataset["train"][0])

Using custom data configuration default-073128ab64a99937


Downloading and preparing dataset csv/default to /home/xiangyu/.cache/huggingface/datasets/csv/default-073128ab64a99937/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/xiangyu/.cache/huggingface/datasets/csv/default-073128ab64a99937/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'sense'],
        num_rows: 50
    })
})
{'text': 'a naked [MASK], also known as nude [MASK], is a [MASK] where the participants are required to be nude.', 'sense': 1}


In [198]:
from transformers import AutoTokenizer # For tokenization

# Load the tokenizer that belongs to the checkpoint named by MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/xiangyu/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.14.1",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /home/xiangyu/.cache/huggingface/transformers/0e1bbfda7f63a

In [199]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# First, declare a new encoder that returns dense arrays.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# releases dropped the old name, so try the new keyword first and fall back
# to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output = False)
except TypeError:
    encoder = OneHotEncoder(sparse = False)
# Then, let the encoder learn all categories in the given dataset.
# Keep in mind that the `fit` functions in sklearn only make the encoder learn from the data; they do not transform it yet.
# The labels are reshaped into a single column because the encoder expects 2-D input.
encoder = encoder.fit(np.reshape(dataset['train']['sense'], (-1, 1)))

In [200]:
# Number of distinct sense labels the encoder learned during `fit`
LABEL_COUNT = len(encoder.categories_[0])
print(LABEL_COUNT)

3


In [201]:
# Show the label values the encoder learned
print(encoder.categories_)

[array([1, 2, 3])]


In [202]:
def preprocess(dataslice):
    """Tokenize a batch of sentences and attach one-hot encoded labels.

    Meant to be passed to `dataset.map(..., batched=True)`.

    Input: a batch of your dataset
        Example: { 'text': ['sentence1', 'sentence2', ...],
                   'sense': [label1, label2, ...] }

    Output: a batch of processed dataset
        Example: { 'input_ids': ...,
                   'attention_mask': ...,
                   'labels': ... }
    """
    # The encoder was fitted on a single-column 2-D input, so wrap every
    # label in its own one-element row before transforming.
    sense_rows = [[sense] for sense in dataslice["sense"]]

    # truncation=True keeps over-long sentences within the model's input
    # limit and matches how sentences are tokenized at inference time.
    output = tokenizer(dataslice["text"], truncation=True)
    output["labels"] = encoder.transform(sense_rows)
    return output

In [203]:
# Run `preprocess` over every split; batched=True hands it many rows at a
# time so it can be faster.
processed_data = dataset.map(preprocess, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [204]:
# Inspect the processed dataset and its first example
print(processed_data)
processed_data['train'][0]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'sense', 'text'],
        num_rows: 50
    })
})


{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'input_ids': [101,
  1037,
  6248,
  103,
  1010,
  2036,
  2124,
  2004,
  15287,
  103,
  1010,
  2003,
  1037,
  103,
  2073,
  1996,
  6818,
  2024,
  3223,
  2000,
  2022,
  15287,
  1012,
  102],
 'labels': [1.0, 0.0, 0.0],
 'sense': 1,
 'text': 'a naked [MASK], also known as nude [MASK], is a [MASK] where the participants are required to be nude.'}

In [205]:
from transformers import DataCollatorWithPadding

# Declare a collator that pads the examples of each batch during training.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [206]:
# Change to TFAutoModelForSequenceClassification if you're using tensorflow
from transformers import AutoModelForSequenceClassification

# Load the checkpoint named by MODEL_NAME (the same name the tokenizer was
# loaded from) so the model and tokenizer cannot drift apart, and size the
# classification head to the number of sense labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           num_labels = LABEL_COUNT)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/xiangyu/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.14.1",
  "vocab_size": 30522
}

lo

In [218]:
# [ TODO ] Choose the validation data size via `test_size`.
# A fixed seed makes the shuffled split identical on every run, so results
# can be reproduced and compared between runs.
train_val_dataset = processed_data['train'].train_test_split(test_size = 0.1, seed = 42)

In [219]:
# Take a look at the split data
print(train_val_dataset)

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'sense', 'text'],
        num_rows: 45
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'sense', 'text'],
        num_rows: 5
    })
})


In [220]:
# Change to TFTrainingArguments, TFTrainer if you're using tensorflow
from transformers import TrainingArguments, Trainer

In [223]:
# [ TODO ] Set and tune your training properties
LEARNING_RATE = 1e-5
BATCH_SIZE = 8  # per device, used for both training and evaluation
EPOCH = 300

# Checkpoints produced during training are written below `output_dir`.
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCH,
    # You can also set other parameters here
)

# Now hand the model, the data splits and the settings to a trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_dataset["train"],
    eval_dataset=train_val_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # You can also set other parameters
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [224]:
# Run the fine-tuning loop with the arguments given to `trainer`
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, sense.
***** Running training *****
  Num examples = 45
  Num Epochs = 300
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss
500,0.0753


Saving model checkpoint to model/checkpoint-500
Configuration saved in model/checkpoint-500/config.json
Model weights saved in model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in model/checkpoint-500/tokenizer_config.json
Special tokens file saved in model/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=600, training_loss=0.06620031992594401, metrics={'train_runtime': 93.2946, 'train_samples_per_second': 144.703, 'train_steps_per_second': 6.431, 'total_flos': 252878823004770.0, 'train_loss': 0.06620031992594401, 'epoch': 300.0})

In [225]:
# Save the fine-tuned model under model/finetuned so it can be reloaded later
model.save_pretrained(os.path.join('model', 'finetuned'))

Configuration saved in model/finetuned/config.json
Model weights saved in model/finetuned/pytorch_model.bin


In [226]:
# Same here: change to the TFxxxxxx class if you are using tensorflow
from transformers import AutoModelForSequenceClassification

# Reload the fine-tuned model from the directory it was saved to
mymodel = AutoModelForSequenceClassification.from_pretrained(os.path.join('model', 'finetuned'))

loading configuration file model/finetuned/config.json
Model config DistilBertConfig {
  "_name_or_path": "model/finetuned",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "multi_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.14.1",
  "vocab_size": 30522
}

loading weights file model/finetuned/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All 

In [227]:
# Sample sentences for a quick sanity check of the fine-tuned model.
# NOTE(review): the number above each sentence appears to be its expected
# sense label -- confirm against the training data.
examples = [
    # 1
    "A naked [MASK], also known as nude [MASK], is a [MASK] where the participants are required to be nude.",
    # 2
    "he represents venstre, a danish centre-right [MASK].",
    # 3
    "in law, an allegation is a claim of a fact by a [MASK] in a pleading, charge, or defense.",
]

In [228]:
# Tokenize the sentences into padded, truncated tensors of token ids
input = tokenizer(examples, truncation=True, padding=True, return_tensors="pt") # change return_tensors if you're using tensorflow
# Get the output. This is inference only, so disable autograd: no
# computation graph is recorded for the forward pass.
with torch.no_grad():
    logits = mymodel(**input).logits
logits


tensor([[ 3.9308, -4.0287, -3.9494],
        [-4.4017,  4.2521, -4.2589],
        [-2.8793, -3.0254,  3.0018]], grad_fn=<AddmmBackward>)

In [229]:
# Or `from tensorflow import nn` and `nn.softmax`
from torch import nn

# Turn each row of logits into probabilities that sum to 1 over the classes
predicts = nn.functional.softmax(logits, dim = -1)
predicts

tensor([[9.9927e-01, 3.4908e-04, 3.7786e-04],
        [1.7439e-04, 9.9962e-01, 2.0116e-04],
        [2.7774e-03, 2.3999e-03, 9.9482e-01]], grad_fn=<SoftmaxBackward>)

In [230]:
def logits2labels(predicts):
    """Map per-class scores back to the original sense labels.

    Args:
        predicts: 2-D torch tensor with one row per sentence and one column
            per class (logits or probabilities; only the position of the
            row maximum is used).

    Returns:
        The array produced by `encoder.inverse_transform`: one row per
        sentence holding the label whose column had the highest score.
    """
    _, labels = predicts.max(dim=1)
    onehot = nn.functional.one_hot(labels, LABEL_COUNT)
    # scikit-learn works on NumPy arrays; move the tensor to host memory
    # first so this also works when the predictions live on a GPU.
    return encoder.inverse_transform(onehot.detach().cpu().numpy())

In [232]:
# Load the held-out test CSV; the loader exposes it under the 'train' split
testset = load_dataset('csv', data_files = os.path.join('data', 'partytest.csv'))

# preprocess
input = tokenizer(testset["train"]["text"], truncation=True, padding=True, return_tensors="pt")
# get predictions; autograd is disabled because this is inference only and
# recording a graph for the whole test set would only waste memory
with torch.no_grad():
    logits = mymodel(**input).logits
predicts = nn.functional.softmax(logits, dim = -1)


Using custom data configuration default-a7bfde7663847af0


Downloading and preparing dataset csv/default to /home/xiangyu/.cache/huggingface/datasets/csv/default-a7bfde7663847af0/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/xiangyu/.cache/huggingface/datasets/csv/default-a7bfde7663847af0/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [233]:
# Convert the class probabilities of the test set into sense labels
predict_label = logits2labels(predicts)

In [234]:
# Count how many predicted senses equal the gold senses.
# `predict_label` holds one single-element row per sentence, so take the
# first column to compare plain scalars instead of relying on the truth
# value of a one-element array.
total = 0
count = 0
for P, G in zip(predict_label[:, 0], testset["train"]["sense"]):
    if P == G:
        count += 1
    total += 1

In [235]:
# Accuracy: fraction of test sentences whose predicted sense equals the gold sense
count/total

0.893455098934551