In [1]:
from datasets import load_dataset
dataset = load_dataset('imdb')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [3]:
# Tokenize the dataset
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over ``examples['text']`` with truncation enabled."""
    texts = examples['text']
    return tokenizer(texts, truncation=True)

tokenized_datasets = dataset.map(tokenize_function,batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# the following is used to reduce the size of the dataset
# Keep a shuffled (seed=42) 1000-example subset of the train and test splits.
small_train_dataset = (
    tokenized_datasets['train']
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets['test']
    .shuffle(seed=42)
    .select(range(1000))
)

In [5]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model="bert-base-cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Hint:** The following cells show that we have to reduce the number of words in a review in order to reduce the number of `input_ids`; only then is the input short enough for the model to process.

In [None]:
# Classify the first 500 whitespace-separated words of evaluation review 14.
# NOTE(review): this cell has no recorded output; presumably the 500-word input
# tokenizes to more input_ids than the model accepts and the call fails — confirm by re-running.
classifier(' '.join(small_eval_dataset['text'][14].split()[:500]))

In [132]:
# Keeping only the first 300 words brings the number of input_ids below the
# pre-trained model's input limit, so the model can make a prediction for this text.
print(len(tokenizer(' '.join(small_eval_dataset['text'][14].split()[:300]))['input_ids']))

444


In [133]:
classifier(' '.join(small_eval_dataset['text'][14].split()[:300]))

[{'label': 'LABEL_1', 'score': 0.5735170841217041}]

**Hint:** The following cells show that, after tokenization, the number of token ids is larger than the number of words. This means that even when a text contains fewer than 512 words, the number of corresponding tokens may still exceed 512 (the input limit of the model).

In [134]:
input_ids=tokenizer(' '.join(small_eval_dataset['text'][100].split()))['input_ids']
tokens=tokenizer.convert_ids_to_tokens(input_ids)

In [135]:
words=small_eval_dataset['text'][100].split()

In [136]:
print(len(words))
print(len(input_ids))

123
167


In [None]:
tokens

In [None]:
words

In [44]:
from sklearn.metrics import classification_report

def metrics_report(classifier, data, max_words=250):
    """Print a scikit-learn classification report for ``classifier`` on ``data``.

    Each text is cut to its first ``max_words`` whitespace-separated words
    before being classified, as a rough way of keeping the tokenized input
    short enough for the model.

    Args:
        classifier: Callable that takes a string and returns a sequence whose
            first item is a mapping with a ``'label'`` key (for example a
            ``transformers`` text-classification pipeline).
        data: Dataset exposing ``'text'`` and ``'label'`` columns through
            ``data[column_name]``; gold labels are expected to be 0/1.
        max_words: Number of leading words kept from each text. Defaults to
            250, the value that used to be hard-coded.

    Returns:
        None. The report is printed.
    """
    # Read each column once up front. Indexing ``data['text'][i]`` inside the
    # loop can materialize the whole column again on every iteration.
    texts = data['text']
    labels_gold = list(data['label'])

    labels_pre = []
    for text in texts:
        text_truncated = ' '.join(text.split()[:max_words])
        pre = classifier(text_truncated)[0]['label']
        # Any predicted label other than 'LABEL_0' is counted as class 1.
        labels_pre.append(0 if pre == 'LABEL_0' else 1)

    print(classification_report(labels_gold, labels_pre))
    
    

In [45]:
metrics_report(classifier,small_eval_dataset)

              precision    recall  f1-score   support

           0       0.51      1.00      0.68       512
           1       0.00      0.00      0.00       488

    accuracy                           0.51      1000
   macro avg       0.26      0.50      0.34      1000
weighted avg       0.26      0.51      0.35      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Training Part

In [6]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the pre-trained BERT model with a 2-label classification head; it is fine-tuned in the cells below
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Accuracy metric computed at each evaluation during training
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn an evaluation ``(logits, labels)`` pair into the score reported by ``metric``.

    The predicted class of each example is the argmax over the last axis of
    the logits.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="bert-base-cased_for_sentiment_analysis", evaluation_strategy="epoch")

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [11]:
trainer.train()

  0%|          | 0/375 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.36987385153770447, 'eval_accuracy': 0.847, 'eval_runtime': 559.6459, 'eval_samples_per_second': 1.787, 'eval_steps_per_second': 0.223, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.47788000106811523, 'eval_accuracy': 0.859, 'eval_runtime': 559.3847, 'eval_samples_per_second': 1.788, 'eval_steps_per_second': 0.223, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.6286229491233826, 'eval_accuracy': 0.874, 'eval_runtime': 591.306, 'eval_samples_per_second': 1.691, 'eval_steps_per_second': 0.211, 'epoch': 3.0}
{'train_runtime': 6088.5894, 'train_samples_per_second': 0.493, 'train_steps_per_second': 0.062, 'train_loss': 0.3095181681315104, 'epoch': 3.0}


TrainOutput(global_step=375, training_loss=0.3095181681315104, metrics={'train_runtime': 6088.5894, 'train_samples_per_second': 0.493, 'train_steps_per_second': 0.062, 'train_loss': 0.3095181681315104, 'epoch': 3.0})

In [12]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/jinya425/bert-base-cased_for_sentiment_analysis/tree/main/'

In [14]:
from transformers import pipeline
classifier_2 = pipeline("sentiment-analysis", model="jinya425/bert-base-cased_for_sentiment_analysis")

Downloading config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [46]:
metrics_report(classifier_2,small_eval_dataset)

              precision    recall  f1-score   support

           0       0.88      0.85      0.87       512
           1       0.85      0.88      0.87       488

    accuracy                           0.87      1000
   macro avg       0.87      0.87      0.87      1000
weighted avg       0.87      0.87      0.87      1000



In [43]:
classifier(small_eval_dataset['text'][6])[0]['label']

'LABEL_0'