In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer
# from huggingface_hub import notebook_login




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = "mistralai/Mistral-7B-v0.1" #bn22/Mistral-7B-Instruct-v0.1-sharded


In [3]:
dataset_name, new_model = "gathnex/Gath_baize", "LEO_mistral_7b"

In [4]:
# Loading a Gath_baize dataset
dataset = load_dataset(dataset_name, split="train")
dataset["chat_sample"][0]

'The conversation between Human and AI assisatance named Gathnex [INST] Generate a headline given a content block.\nThe Sony Playstation 5 is the latest version of the console. It has improved graphics and faster processing power.\n[/INST] Experience Amazing Graphics and Speed with the New Sony Playstation 5'

In [5]:
# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)


In [6]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.28s/it]


In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.padding_side = 'right'
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [8]:
wandb.login(key = "3ad71fc847a5ed11d5647b0e8bf03d499f60e94a")
run = wandb.init(
    # set the wandb project where this run will be logged
    project="Local Mistral7B finetuning",
    job_type="training", 
    anonymous="allow",    
    # track hyperparameters and run metadata
    config={
    "learning_rate": 2e-4,
    "architecture": "LLM",
    "dataset": "gathnex/Gath_baize",
    "epochs": 1,
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mtisljaricleo[0m ([33mllm-team-321[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/tisljaricleo/.netrc


In [9]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )

model = get_peft_model(model, peft_config)

In [10]:
# Training Arguments
# Hyperparameters should beadjusted based on the hardware you using
training_arguments = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 1,
    per_device_train_batch_size= 4,
    auto_find_batch_size =True,
    gradient_accumulation_steps= 2,
    optim = "paged_adamw_8bit",
    save_steps= 5000,
    logging_steps= 30,
    learning_rate= 5e-5,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.3,
    group_by_length= True,
    lr_scheduler_type= "constant",
    report_to="wandb",
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="chat_sample",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



In [11]:
trainer.train()
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
model.config.use_cache = True
model.eval()



Step,Training Loss
30,0.8656
60,0.7686
90,0.7579
120,0.8175
150,0.84
180,1.0886
210,0.9378
240,0.7502
270,0.7413
300,0.7313


Checkpoint destination directory ./results/checkpoint-5000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-10000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /mistralai/Mistral-7B-v0.1/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fbe83535690>: Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 92218f76-d726-4b25-bca2-3356d0571592)')

In [None]:
wandb.finish()

# OLD 

In [None]:
!which python

wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wand

In [4]:
import os

In [5]:
# os.environ["CUDA_VISIBLE_DEVICES"] = 0

In [6]:
# Imports
import numpy as np
from datasets import load_dataset, load_metric
from tqdm import tqdm
import torch
import re

from transformers import (
    Trainer,
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer,
TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))

True
1
0
<torch.cuda.device object at 0x7f656a8e1060>
NVIDIA GeForce RTX 2080 SUPER


In [8]:
# Settings
# model_checkpoint = "bert-base-uncased"
model_checkpoint = "distilbert-base-uncased"
# max_length = 128  # For the tokenizer.

# Data preparation

In [24]:
dataset = load_dataset("imdb", trust_remote_code=True)

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [26]:
# split_datasets = dataset["test"].train_test_split(train_size=0.7, seed=20)

In [27]:
# split_datasets

In [28]:
# dataset["test"] = split_datasets["train"]
# dataset["validation"] = split_datasets["test"]
dataset.pop("unsupervised")

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})

In [29]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [30]:
dataset["train"][1]

{'text': '"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, 

In [31]:
dataset["train"][225]

 'label': 0}

In [32]:
for i in range(0, 3):
    print(dataset["test"][i])

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [33]:
cleared_text = re.sub('<[^<]+?>', '', dataset["train"][1]["text"])
cleared_text

'"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the s

In [34]:
def clear_text(text: str) -> str:
    return re.sub('<[^<]+?>', '', text)

# Tokenizer

In [35]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [36]:
i = 0

def preprocess_function(examples):    
    inputs = [clear_text(ex) for ex in examples["text"]]
    
    model_inputs = tokenizer(
        inputs, truncation=True
    )

    # model_inputs["labels"] = examples["label"]
    
    return model_inputs

In [37]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:03<00:00, 6589.37 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:03<00:00, 6582.32 examples/s]


In [38]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})

# Base model

In [39]:
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2) 

In [40]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [41]:
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [42]:
# batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
# batch.keys()

In [43]:
# batch["labels"]

In [44]:
# tokenized_datasets["train"]

In [45]:
# tokenized_datasets["validation"]

In [46]:
# def postprocess_text(preds: list, labels: list) -> tuple:
#     """Performs post processing on the prediction text and labels"""

#     preds = [pred.strip() for pred in preds]
#     labels = [[label.strip()] for label in labels]

#     return preds, labels


# def compute_metrics(eval_preds: tuple) -> dict:
#     """computes bleu score and other performance metrics """

#     metric = load_metric("sacrebleu")
#     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

#     preds, labels = eval_preds

#     if isinstance(preds, tuple):
#         preds = preds[0]

#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # Some simple post-processing
#     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

#     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
#     result = {"bleu": result["score"]}

#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

#     result["gen_len"] = np.mean(prediction_lens)
#     result = {k: round(v, 4) for k, v in result.items()}

#     return result

In [47]:
# !pip install evaluate

In [48]:
# MY OLD

# training_args = TrainingArguments(
#     f"{model_checkpoint}-finetuned-CLASSIFICATION-0",
#     evaluation_strategy="epoch",
#     learning_rate=2e-4,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     weight_decay=0.02,
#     save_total_limit=3,
#     num_train_epochs=3,
# )
# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["test"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )

In [49]:
import numpy as np

import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [50]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [51]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer




# model = AutoModelForSequenceClassification.from_pretrained(
#     "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
# )

In [52]:
model_checkpoint

'distilbert-base-uncased'

In [53]:
finetuned_ids = [0, 1, 2, 3, 4]
learning_rates = [2e-3, 2e-4, 2e-5, 2e-7, 2e-9]
n_epochs = [2, 3, 3, 5, 7]

In [54]:
for f_id, lr, n_e in zip(finetuned_ids, learning_rates, n_epochs):
    
    print(f"ID: {f_id}, LEARNING_RATE: {lr}, N_EPOCHS: {n_e}")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
    )    

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    training_args = TrainingArguments(
        output_dir=f"{model_checkpoint}-finetuned-CLASSIFICATION-{f_id}",
        learning_rate=lr,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=n_e,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()

ID: 0, LEARNING_RATE: 0.002, N_EPOCHS: 2


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6934,0.693216,0.5
2,0.6932,0.693148,0.5


Checkpoint destination directory distilbert-base-uncased-finetuned-CLASSIFICATION-0/checkpoint-3125 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-finetuned-CLASSIFICATION-0/checkpoint-6250 already exists and is non-empty.Saving will proceed but saved results may be invalid.


ID: 1, LEARNING_RATE: 0.0002, N_EPOCHS: 3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6961,0.693902,0.5
2,0.6937,0.694379,0.5
3,0.6941,0.69315,0.5


Checkpoint destination directory distilbert-base-uncased-finetuned-CLASSIFICATION-1/checkpoint-3125 already exists and is non-empty.Saving will proceed but saved results may be invalid.


ID: 2, LEARNING_RATE: 2e-05, N_EPOCHS: 3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2785,0.233552,0.92308
2,0.184,0.278044,0.9302
3,0.0941,0.328357,0.93416


ID: 3, LEARNING_RATE: 2e-07, N_EPOCHS: 5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4862,0.428747,0.86512
2,0.3355,0.304453,0.88096
3,0.3101,0.286176,0.88628
4,0.3,0.283234,0.88808
5,0.2777,0.282083,0.88992


ID: 4, LEARNING_RATE: 2e-09, N_EPOCHS: 7


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6949,0.694164,0.49728
2,0.6943,0.694039,0.4988
3,0.6941,0.693954,0.49984
4,0.6941,0.6939,0.50004
5,0.6959,0.693871,0.5002
6,0.6952,0.693861,0.50024
7,0.6939,0.693859,0.50024


In [None]:
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-finetuned-CLASSIFICATION-0",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.704,0.700318
2,0.7043,0.693656
3,0.6975,0.693224


TrainOutput(global_step=9375, training_loss=0.7043526139322916, metrics={'train_runtime': 4295.8673, 'train_samples_per_second': 17.459, 'train_steps_per_second': 2.182, 'total_flos': 1.9733329152e+16, 'train_loss': 0.7043526139322916, 'epoch': 3.0})

# Using the model

In [78]:
text = "This was great movie!"

In [79]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

my_checkpoint = "distilbert-base-uncased-finetuned-CLASSIFICATION-2/checkpoint-9375"

tokenizer = AutoTokenizer.from_pretrained(my_checkpoint)
inputs = tokenizer(text, return_tensors="pt")

model = AutoModelForSequenceClassification.from_pretrained(my_checkpoint)
with torch.no_grad():
    logits = model(**inputs).logits

In [80]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

In [71]:
text

"This movie was so unrelentingly bad, I could hardly believe I was watching it. The directing, editing, production, and script all seemed as though they had been done by junior high school students who don't know all that much about movies. There was no narrative flow that made any sort of sense. Big emotional moments and climaxes (like one early on between Heath Ledger and Naomi Watts) and character relationships (like one hinted at at the very beginning) come completely out of no where and are not set up like they would have been in a more elegantly and effectively made film. The characters are sadly underdeveloped, making it difficult for us to have any sort of connection with them. The acting, surprisingly, is not entirely bad, but the terrible writing cancels out the relatively convincing performances. The film plays like a particularly bad T.V. western/epic, and sadly diminishes the fascinating (true) story that it attempts to tell. I have read a lot of reviews that defend the fi

In [67]:
dataset["train"][56]

{'text': "This movie was so unrelentingly bad, I could hardly believe I was watching it. The directing, editing, production, and script all seemed as though they had been done by junior high school students who don't know all that much about movies. There was no narrative flow that made any sort of sense. Big emotional moments and climaxes (like one early on between Heath Ledger and Naomi Watts) and character relationships (like one hinted at at the very beginning) come completely out of no where and are not set up like they would have been in a more elegantly and effectively made film. The characters are sadly underdeveloped, making it difficult for us to have any sort of connection with them. The acting, surprisingly, is not entirely bad, but the terrible writing cancels out the relatively convincing performances. The film plays like a particularly bad T.V. western/epic, and sadly diminishes the fascinating (true) story that it attempts to tell. I have read a lot of reviews that defe

In [32]:
from transformers import BertLMHeadModel

In [33]:
my_model_checkpoint = "bert-base-uncased-finetuned-CLASSIFICATION-0/checkpoint-9000"

my_model = BertLMHeadModel.from_pretrained(my_model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(my_model_checkpoint, return_tensors="pt")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased-finetuned-CLASSIFICATION-0/checkpoint-9000 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
my_model(["Hi!"])

TypeError: list indices must be integers or slices, not tuple

In [38]:
text = "Hello, my name is Leo!"
tokenized_text = tokenizer(text, return_tensors="pt")
result = my_model.generate(**tokenized_text)
tokenizer.decode(result[0], skip_special_tokens=True)

'hello, my name is leo!ellingellingellingellingellingellingellingellingellingellingelling'

## Using the model on a larger corpus

In [49]:
test_sentences = [
    "Lions are known as the kings of the jungle due to their majestic appearance.",
    "Elephants are the largest land mammals on Earth, known for their long trunks and big ears.",
    "Dolphins are highly intelligent marine mammals that often display playful behavior.",
    "Kangaroos are marsupials native to Australia and are known for their powerful hind legs and pouches.",
    "Penguins are flightless birds that spend most of their lives in the water and are excellent swimmers.",
    "Giraffes have long necks that allow them to reach high leaves in trees, making them the tallest animals on land.",
    "Butterflies undergo a remarkable transformation from caterpillars to beautiful, colorful insects.",
    "Cheetahs are the fastest land animals, capable of reaching speeds up to 60 miles per hour.",
    "Whales are the largest animals on Earth, with some species growing to over 100 feet in length.",
    "Honeybees play a vital role in pollinating plants and are known for their complex hive structures."
]    

In [129]:
all_translations = []

for sentence in tqdm(test_sentences):
    tokenized_sentence = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    raw_translation = my_model.generate(**tokenized_sentence)
    translation = tokenizer.decode(raw_translation[0], skip_special_tokens=True)
    all_translations.append(translation.replace("« & #160;", "").replace("& #160; »", "").replace("& #160;", "").replace("  ", " "))

all_translations

100%|████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.93it/s]


["Les Lunes sont connues comme étant la valeur de l'effet de leur présence d'accolade.",
 'Les élements sont les plus grands moment de vie sur la Terre, connus pour leurs longues pauses et sa grande myrinthe.',
 'Les Dolphins sont un comportement très intelligent et on peut y voir souvent le comportement playable.',
 "Kangaroos est un marasicien natif à l'Australie et sont connus pour leur puissant hen-tête hen-hung and yes.",
 "Les Penguins sont des volumineux qui passent de la plupart de leurs vies dans l'eau et sont d'excellentes averses.",
 "Les combrés de Giraffe ont des colonnes longues qui permettent d'atteindre les grands sauts d'arborescence, en leur rendant les plus grands utility sur les immeubles.",
 "L' grâce à des franges qui s'évaluent d'une transformation de l'épingle de lune enroulée pour rendre agréables, colorées. Name_BAR_plasma contain white spaces contain white spaces and non latin1 characters.",
 "Cheetahs sont les YSTAbout Devices, capable d'atteindre des vitess

## Using the model on a sentences batch

In [139]:
tokenized_batch = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True)
translated_batch = my_model.generate(**tokenized_batch)
tokenizer.batch_decode(translated_batch, skip_special_tokens=True)

["Les Lunes sont connues comme étant la valeur de l'effet de leur présence d'accolade.",
 'Les élements sont les plus grands moment de vie sur la Terre, connus pour leurs longues pauses et sa grande myrinthe.',
 'Les Dolphins sont un comportement très intelligent et on peut y voir souvent le comportement playable.',
 "Kangaroos est un marasicien natif à l'Australie et sont connus pour leur puissant hen-tête hen-hung and yes.",
 "Les Penguins sont des volumineux qui passent de la plupart de leurs vies dans l'eau et sont d'excellentes averses.",
 "Les combrés de Giraffe ont des colonnes longues qui permettent d'atteindre les grands sauts d'arborescence, en leur rendant les plus grands utility sur les immeubles.",
 "L' grâce à des franges qui s'évaluent d'une transformation de l'épingle de lune enroulée pour rendre agréables, colorées. Name_BAR_plasma contain white spaces contain white spaces and non latin1 characters.",
 "Cheetahs sont les YSTAbout Devices, capable d'atteindre des vitess

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model")
classifier(text)