# Arabic Fine Tuning Test
- dataset extracted from: [Twitter Corpus](https://www.kaggle.com/datasets/mksaad/arabic-sentiment-twitter-corpus)
- Cleaning logic inspired from [arabic-sentiment-twitter-corpus](https://www.kaggle.com/datasets/mksaad/arabic-sentiment-twitter-corpus)

## Load datasets

In [None]:
! pip install numpy
! pip install pandas
! pip install transformers
! pip install peft
! pip install torch
! pip install evaluate
! pip install emoji
! pip install scikit-learn

In [43]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd
import csv
import unicodedata
import re
import emoji

In [44]:
# Load training subset
train_neg = pd.read_csv("dataset/train_Arabic_tweets_negative.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
train_neg.rename(columns={0:'label', 1:'tweet'}, inplace=True)
train_neg['label'] = 0

train_pos = pd.read_csv("dataset/train_Arabic_tweets_positive.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
train_pos.rename(columns={0:'label', 1:'tweet'}, inplace=True)
train_pos['label'] = 1

train_df = pd.concat([train_neg, train_pos], axis=0).reset_index(drop=True)

# Load test subset
test_pos = pd.read_csv("dataset/test_Arabic_tweets_positive.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
test_pos.rename(columns={0:'label', 1:'tweet'}, inplace=True)
test_pos['label']=1

test_neg = pd.read_csv("dataset/test_Arabic_tweets_negative.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
test_neg.rename(columns={0:'label', 1:'tweet'}, inplace=True)
test_neg['label']=0

test_df = pd.concat([test_neg, test_pos], axis=0).reset_index(drop=True)

In [45]:
# create new dataset
dataset = DatasetDict({'train':Dataset.from_dict({'label':train_df['label'],'tweet':train_df['tweet']}),
                            'test':Dataset.from_dict({'label':test_df['label'],'tweet':test_df['tweet']})})
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'tweet'],
        num_rows: 47000
    })
    test: Dataset({
        features: ['label', 'tweet'],
        num_rows: 11751
    })
})

In [46]:
# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5080638297872341

## Model
- https://huggingface.co/arabi-elidrisi/ArabicDistilBERT

In [47]:
model_checkpoint = 'asafaya/bert-mini-arabic'

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-mini-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# display architecture
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, e

## preprocess data

In [49]:
def remove_emojis(sent):
    text =  emoji.demojize(sent)
    text= re.sub(r'(:[!_\-\w]+:)', '', text)
    return text

def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """

    # Normalize unicode encoding
    text = unicodedata.normalize('NFC', text)
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    #Remove URLs
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '<URL>', text)
    
    # Remove emojis
    text = remove_emojis(text)

    return text

# dataset['train']["tweet"] = [text_preprocessing(text) for text in dataset['train']["tweet"]]
# dataset['test']["tweet"] = [text_preprocessing(text) for text in dataset['test']["tweet"]]

In [50]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# create tokenize function
def tokenize_df(df):
    # extract text
    texts = [text_preprocessing(text) for text in df["tweet"]]

    #tokenize and truncate text
    tokenizer.truncation_side = "right"
    tokenized_inputs = tokenizer(
        texts,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_df, batched=True)

# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 47000/47000 [00:03<00:00, 11828.94 examples/s]
Map: 100%|██████████| 11751/11751 [00:00<00:00, 14178.71 examples/s]


In [51]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'tweet', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 47000
    })
    test: Dataset({
        features: ['label', 'tweet', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11751
    })
})

## evaluation

In [52]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

## Apply untrained model to text

In [53]:
# define list of examples
text_list = ["هذا رائع", "ليس جيداً بالمطلق", "يمكن تحسين الوضع.", "لنتأمل أن يتحسن", "لا أعرف بالضبط"]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
هذا رائع - Negative
ليس جيداً بالمطلق - Negative
يمكن تحسين الوضع. - Negative
لنتأمل أن يتحسن - Negative
لا أعرف بالضبط - Negative


## Train model

In [54]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['query'])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 8,706 || all params: 11,558,148 || trainable%: 0.07532348608098806


In [55]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [56]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [58]:
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

 16%|█▌        | 18216/117500 [19:39:32<107:08:58,  3.89s/it]
  0%|          | 499/117500 [00:18<1:08:55, 28.29it/s]
  0%|          | 505/117500 [00:18<1:10:06, 27.81it/s]  

{'loss': 0.6469, 'learning_rate': 0.000995744680851064, 'epoch': 0.04}


  1%|          | 1000/117500 [00:36<1:09:19, 28.01it/s]
  1%|          | 1003/117500 [00:36<1:10:13, 27.65it/s] 

{'loss': 0.6802, 'learning_rate': 0.0009914893617021276, 'epoch': 0.09}


  1%|▏         | 1498/117500 [00:53<1:09:45, 27.71it/s]
  1%|▏         | 1504/117500 [00:54<1:11:25, 27.07it/s] 

{'loss': 0.6475, 'learning_rate': 0.0009872340425531915, 'epoch': 0.13}


  2%|▏         | 1999/117500 [01:11<1:07:39, 28.45it/s]
  2%|▏         | 2005/117500 [01:12<1:09:12, 27.82it/s] 

{'loss': 0.6508, 'learning_rate': 0.0009829787234042554, 'epoch': 0.17}


  2%|▏         | 2500/117500 [01:29<1:10:09, 27.32it/s]
  2%|▏         | 2503/117500 [01:29<1:11:23, 26.84it/s] 

{'loss': 0.6683, 'learning_rate': 0.0009787234042553192, 'epoch': 0.21}


  3%|▎         | 2998/117500 [01:47<1:06:31, 28.69it/s]
  3%|▎         | 3004/117500 [01:47<1:08:13, 27.97it/s] 

{'loss': 0.648, 'learning_rate': 0.0009744680851063829, 'epoch': 0.26}


  3%|▎         | 3499/117500 [02:05<1:17:38, 24.47it/s]
  3%|▎         | 3502/117500 [02:06<1:20:11, 23.69it/s] 

{'loss': 0.6616, 'learning_rate': 0.0009702127659574468, 'epoch': 0.3}


  3%|▎         | 4000/117500 [02:24<1:11:27, 26.47it/s]
  3%|▎         | 4003/117500 [02:24<1:13:59, 25.57it/s] 

{'loss': 0.6679, 'learning_rate': 0.0009659574468085106, 'epoch': 0.34}


  4%|▍         | 4498/117500 [02:42<1:09:27, 27.12it/s]
  4%|▍         | 4504/117500 [02:42<1:11:15, 26.43it/s] 

{'loss': 0.6375, 'learning_rate': 0.0009617021276595745, 'epoch': 0.38}


  4%|▍         | 4999/117500 [03:00<1:05:15, 28.74it/s]
  4%|▍         | 5005/117500 [03:01<1:06:19, 28.27it/s] 

{'loss': 0.6666, 'learning_rate': 0.0009574468085106384, 'epoch': 0.43}


  5%|▍         | 5500/117500 [03:19<1:10:38, 26.43it/s]
  5%|▍         | 5503/117500 [03:19<1:12:37, 25.70it/s] 

{'loss': 0.6561, 'learning_rate': 0.0009531914893617022, 'epoch': 0.47}


  5%|▌         | 5998/117500 [03:38<1:05:39, 28.30it/s]
  5%|▌         | 6004/117500 [03:38<1:07:58, 27.34it/s] 

{'loss': 0.65, 'learning_rate': 0.000948936170212766, 'epoch': 0.51}


  6%|▌         | 6499/117500 [03:56<1:09:39, 26.56it/s]
  6%|▌         | 6505/117500 [03:56<1:10:42, 26.16it/s] 

{'loss': 0.6673, 'learning_rate': 0.0009446808510638298, 'epoch': 0.55}


  6%|▌         | 7000/117500 [04:14<1:05:25, 28.15it/s]
  6%|▌         | 7003/117500 [04:15<1:07:52, 27.13it/s] 

{'loss': 0.651, 'learning_rate': 0.0009404255319148937, 'epoch': 0.6}


  6%|▋         | 7498/117500 [04:32<1:04:53, 28.25it/s]
  6%|▋         | 7504/117500 [04:32<1:05:39, 27.92it/s] 

{'loss': 0.6771, 'learning_rate': 0.0009361702127659575, 'epoch': 0.64}


  7%|▋         | 7999/117500 [04:50<1:04:50, 28.14it/s]
  7%|▋         | 8005/117500 [04:50<1:07:25, 27.07it/s] 

{'loss': 0.6611, 'learning_rate': 0.0009319148936170214, 'epoch': 0.68}


  7%|▋         | 8500/117500 [05:08<1:03:27, 28.62it/s]
  7%|▋         | 8503/117500 [05:08<1:08:36, 26.48it/s] 

{'loss': 0.6498, 'learning_rate': 0.0009276595744680851, 'epoch': 0.72}


  8%|▊         | 8998/117500 [05:26<1:02:46, 28.81it/s]
  8%|▊         | 9004/117500 [05:26<1:03:57, 28.27it/s] 

{'loss': 0.6524, 'learning_rate': 0.0009234042553191489, 'epoch': 0.77}


  8%|▊         | 9499/117500 [05:43<1:01:51, 29.10it/s]
  8%|▊         | 9505/117500 [05:44<1:05:18, 27.56it/s] 

{'loss': 0.6568, 'learning_rate': 0.0009191489361702128, 'epoch': 0.81}


  9%|▊         | 10000/117500 [06:01<1:01:48, 28.99it/s]
  9%|▊         | 10003/117500 [06:01<1:05:33, 27.33it/s]

{'loss': 0.6792, 'learning_rate': 0.0009148936170212766, 'epoch': 0.85}


  9%|▉         | 10498/117500 [06:19<1:07:07, 26.57it/s]
  9%|▉         | 10504/117500 [06:19<1:07:04, 26.58it/s]

{'loss': 0.678, 'learning_rate': 0.0009106382978723405, 'epoch': 0.89}


  9%|▉         | 10999/117500 [06:37<1:03:46, 27.84it/s]
  9%|▉         | 11005/117500 [06:37<1:04:37, 27.46it/s]

{'loss': 0.6953, 'learning_rate': 0.0009063829787234043, 'epoch': 0.94}


 10%|▉         | 11500/117500 [06:54<1:03:34, 27.79it/s]
 10%|▉         | 11503/117500 [06:55<1:06:33, 26.54it/s]

{'loss': 0.6586, 'learning_rate': 0.000902127659574468, 'epoch': 0.98}


 10%|▉         | 11749/117500 [07:03<1:03:16, 27.86it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 10%|█         | 11750/117500 [07:14<1:03:16, 27.86it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.6295496225357056, 'eval_accuracy': {'accuracy': 0.6555186792613394}, 'eval_runtime': 40.0048, 'eval_samples_per_second': 293.739, 'eval_steps_per_second': 73.441, 'epoch': 1.0}


 10%|█         | 12000/117500 [07:53<1:05:21, 26.90it/s]  
 10%|█         | 12003/117500 [07:53<1:05:57, 26.66it/s]

{'loss': 0.653, 'learning_rate': 0.0008978723404255319, 'epoch': 1.02}


 11%|█         | 12498/117500 [08:11<1:00:59, 28.70it/s]
 11%|█         | 12504/117500 [08:11<1:01:43, 28.35it/s]

{'loss': 0.6559, 'learning_rate': 0.0008936170212765957, 'epoch': 1.06}


 11%|█         | 12999/117500 [08:28<1:01:19, 28.40it/s]
 11%|█         | 13002/117500 [08:28<1:05:34, 26.56it/s]

{'loss': 0.6442, 'learning_rate': 0.0008893617021276596, 'epoch': 1.11}


 11%|█▏        | 13500/117500 [08:46<1:01:14, 28.30it/s]
 11%|█▏        | 13503/117500 [08:46<1:02:43, 27.63it/s]

{'loss': 0.6589, 'learning_rate': 0.0008851063829787234, 'epoch': 1.15}


 12%|█▏        | 13998/117500 [09:04<1:01:25, 28.08it/s]
 12%|█▏        | 14004/117500 [09:04<1:02:34, 27.57it/s]

{'loss': 0.6662, 'learning_rate': 0.0008808510638297873, 'epoch': 1.19}


 12%|█▏        | 14499/117500 [09:21<59:53, 28.66it/s]  
 12%|█▏        | 14505/117500 [09:22<1:00:50, 28.22it/s]

{'loss': 0.6777, 'learning_rate': 0.0008765957446808511, 'epoch': 1.23}


 13%|█▎        | 15000/117500 [09:40<59:39, 28.63it/s]  
 13%|█▎        | 15003/117500 [09:40<1:01:24, 27.82it/s]

{'loss': 0.6681, 'learning_rate': 0.0008723404255319149, 'epoch': 1.28}


 13%|█▎        | 15498/117500 [09:57<59:18, 28.67it/s]  
 13%|█▎        | 15504/117500 [09:58<1:00:28, 28.11it/s]

{'loss': 0.679, 'learning_rate': 0.0008680851063829788, 'epoch': 1.32}


 14%|█▎        | 15999/117500 [10:15<59:34, 28.40it/s]  
 14%|█▎        | 16005/117500 [10:16<1:01:13, 27.63it/s]

{'loss': 0.67, 'learning_rate': 0.0008638297872340426, 'epoch': 1.36}


 14%|█▍        | 16500/117500 [10:33<58:54, 28.57it/s]  
 14%|█▍        | 16503/117500 [10:33<59:25, 28.32it/s]  

{'loss': 0.6631, 'learning_rate': 0.0008595744680851064, 'epoch': 1.4}


 14%|█▍        | 16998/117500 [10:51<58:04, 28.84it/s]  
 14%|█▍        | 17001/117500 [10:51<1:00:27, 27.70it/s]

{'loss': 0.6844, 'learning_rate': 0.0008553191489361703, 'epoch': 1.45}


 15%|█▍        | 17499/117500 [11:09<57:45, 28.86it/s]  
 15%|█▍        | 17505/117500 [11:09<1:00:17, 27.64it/s]

{'loss': 0.6616, 'learning_rate': 0.000851063829787234, 'epoch': 1.49}


 15%|█▌        | 18000/117500 [11:26<57:42, 28.74it/s]  
 15%|█▌        | 18003/117500 [11:26<59:43, 27.77it/s]  

{'loss': 0.6824, 'learning_rate': 0.0008468085106382979, 'epoch': 1.53}


 16%|█▌        | 18498/117500 [11:44<57:13, 28.84it/s]  
 16%|█▌        | 18504/117500 [11:44<58:55, 28.00it/s]  

{'loss': 0.6606, 'learning_rate': 0.0008425531914893617, 'epoch': 1.57}


 16%|█▌        | 18999/117500 [12:01<58:03, 28.27it/s]  
 16%|█▌        | 19005/117500 [12:02<58:02, 28.29it/s]  

{'loss': 0.679, 'learning_rate': 0.0008382978723404256, 'epoch': 1.62}


 17%|█▋        | 19500/117500 [12:19<56:55, 28.70it/s]  
 17%|█▋        | 19503/117500 [12:20<58:47, 27.78it/s]  

{'loss': 0.6607, 'learning_rate': 0.0008340425531914894, 'epoch': 1.66}


 17%|█▋        | 19998/117500 [12:37<56:10, 28.93it/s]  
 17%|█▋        | 20004/117500 [12:37<57:27, 28.28it/s]  

{'loss': 0.6717, 'learning_rate': 0.0008297872340425531, 'epoch': 1.7}


 17%|█▋        | 20499/117500 [12:55<56:51, 28.43it/s]  
 17%|█▋        | 20505/117500 [12:55<58:02, 27.85it/s]  

{'loss': 0.6517, 'learning_rate': 0.000825531914893617, 'epoch': 1.74}


 18%|█▊        | 21000/117500 [13:13<56:34, 28.43it/s]  
 18%|█▊        | 21003/117500 [13:13<58:16, 27.60it/s]  

{'loss': 0.6683, 'learning_rate': 0.0008212765957446808, 'epoch': 1.79}


 18%|█▊        | 21498/117500 [13:30<55:52, 28.64it/s]
 18%|█▊        | 21504/117500 [13:30<58:07, 27.53it/s]  

{'loss': 0.6684, 'learning_rate': 0.0008170212765957447, 'epoch': 1.83}


 19%|█▊        | 21999/117500 [13:48<55:51, 28.50it/s]  
 19%|█▊        | 22005/117500 [13:48<57:16, 27.79it/s]  

{'loss': 0.6642, 'learning_rate': 0.0008127659574468085, 'epoch': 1.87}


 19%|█▉        | 22500/117500 [14:06<1:01:01, 25.95it/s]
 19%|█▉        | 22503/117500 [14:06<1:01:59, 25.54it/s]

{'loss': 0.6711, 'learning_rate': 0.0008085106382978723, 'epoch': 1.91}


 20%|█▉        | 22998/117500 [14:24<54:43, 28.78it/s]  
 20%|█▉        | 23004/117500 [14:24<56:44, 27.75it/s]  

{'loss': 0.6786, 'learning_rate': 0.0008042553191489363, 'epoch': 1.96}


 20%|█▉        | 23499/117500 [14:42<54:21, 28.82it/s]  
 20%|██        | 23500/117500 [14:42<54:21, 28.82it/s]  

{'loss': 0.6821, 'learning_rate': 0.0008, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 20%|██        | 23500/117500 [14:54<54:21, 28.82it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.6262732744216919, 'eval_accuracy': {'accuracy': 0.6488809463024423}, 'eval_runtime': 38.1499, 'eval_samples_per_second': 308.022, 'eval_steps_per_second': 77.012, 'epoch': 2.0}


 20%|██        | 23999/117500 [15:38<54:38, 28.52it/s]    
 20%|██        | 24005/117500 [15:38<55:45, 27.94it/s]  

{'loss': 0.6656, 'learning_rate': 0.0007957446808510639, 'epoch': 2.04}


 21%|██        | 24500/117500 [15:56<55:06, 28.13it/s]  
 21%|██        | 24503/117500 [15:56<56:13, 27.56it/s]  

{'loss': 0.6611, 'learning_rate': 0.0007914893617021277, 'epoch': 2.09}


 21%|██▏       | 24998/117500 [16:14<55:42, 27.68it/s]  
 21%|██▏       | 25004/117500 [16:14<57:35, 26.76it/s]  

{'loss': 0.6665, 'learning_rate': 0.0007872340425531915, 'epoch': 2.13}


 22%|██▏       | 25499/117500 [16:31<53:25, 28.70it/s]  
 22%|██▏       | 25505/117500 [16:32<54:38, 28.06it/s]  

{'loss': 0.6865, 'learning_rate': 0.0007829787234042554, 'epoch': 2.17}


 22%|██▏       | 26000/117500 [16:49<53:39, 28.42it/s]  
 22%|██▏       | 26003/117500 [16:49<54:59, 27.73it/s]  

{'loss': 0.6613, 'learning_rate': 0.0007787234042553192, 'epoch': 2.21}


 23%|██▎       | 26498/117500 [17:07<59:05, 25.67it/s]  
 23%|██▎       | 26504/117500 [17:08<58:25, 25.96it/s]  

{'loss': 0.6784, 'learning_rate': 0.000774468085106383, 'epoch': 2.26}


 23%|██▎       | 26999/117500 [17:26<54:39, 27.60it/s]  
 23%|██▎       | 27005/117500 [17:26<55:11, 27.33it/s]  

{'loss': 0.6861, 'learning_rate': 0.0007702127659574468, 'epoch': 2.3}


 23%|██▎       | 27500/117500 [17:44<53:23, 28.09it/s]  
 23%|██▎       | 27503/117500 [17:44<54:06, 27.72it/s]  

{'loss': 0.6536, 'learning_rate': 0.0007659574468085106, 'epoch': 2.34}


 24%|██▍       | 27998/117500 [18:02<51:55, 28.73it/s]  
 24%|██▍       | 28004/117500 [18:02<53:50, 27.70it/s]  

{'loss': 0.668, 'learning_rate': 0.0007617021276595745, 'epoch': 2.38}


 24%|██▍       | 28499/117500 [18:20<51:43, 28.68it/s]  
 24%|██▍       | 28505/117500 [18:20<53:30, 27.72it/s]  

{'loss': 0.6767, 'learning_rate': 0.0007574468085106383, 'epoch': 2.43}


 25%|██▍       | 29000/117500 [18:38<52:11, 28.26it/s]  
 25%|██▍       | 29003/117500 [18:38<54:34, 27.03it/s]  

{'loss': 0.6666, 'learning_rate': 0.0007531914893617022, 'epoch': 2.47}


 25%|██▌       | 29498/117500 [18:56<54:09, 27.08it/s]  
 25%|██▌       | 29504/117500 [18:56<55:33, 26.40it/s]  

{'loss': 0.6621, 'learning_rate': 0.0007489361702127659, 'epoch': 2.51}


 26%|██▌       | 29999/117500 [19:14<51:08, 28.51it/s]  
 26%|██▌       | 30005/117500 [19:15<52:37, 27.71it/s]  

{'loss': 0.6722, 'learning_rate': 0.0007446808510638298, 'epoch': 2.55}


 26%|██▌       | 30500/117500 [19:32<54:19, 26.69it/s]  
 26%|██▌       | 30503/117500 [19:32<55:14, 26.25it/s]  

{'loss': 0.6721, 'learning_rate': 0.0007404255319148936, 'epoch': 2.6}


 26%|██▋       | 30998/117500 [19:50<50:11, 28.72it/s]  
 26%|██▋       | 31004/117500 [19:50<53:24, 26.99it/s]  

{'loss': 0.6842, 'learning_rate': 0.0007361702127659574, 'epoch': 2.64}


 27%|██▋       | 31499/117500 [20:08<53:45, 26.67it/s]  
 27%|██▋       | 31502/117500 [20:09<57:14, 25.04it/s]  

{'loss': 0.6949, 'learning_rate': 0.0007319148936170213, 'epoch': 2.68}


 27%|██▋       | 32000/117500 [20:26<50:16, 28.35it/s]
 27%|██▋       | 32003/117500 [20:26<52:39, 27.06it/s]  

{'loss': 0.6827, 'learning_rate': 0.0007276595744680852, 'epoch': 2.72}


 28%|██▊       | 32499/117500 [20:44<48:55, 28.96it/s]
 28%|██▊       | 32505/117500 [20:44<50:11, 28.22it/s]  

{'loss': 0.6818, 'learning_rate': 0.000723404255319149, 'epoch': 2.77}


 28%|██▊       | 33000/117500 [21:02<48:57, 28.76it/s]
 28%|██▊       | 33003/117500 [21:02<49:41, 28.34it/s]  

{'loss': 0.6827, 'learning_rate': 0.0007191489361702128, 'epoch': 2.81}


 29%|██▊       | 33498/117500 [21:20<51:12, 27.34it/s]  
 29%|██▊       | 33504/117500 [21:20<52:39, 26.58it/s]  

{'loss': 0.6938, 'learning_rate': 0.0007148936170212766, 'epoch': 2.85}


 29%|██▉       | 33999/117500 [21:37<48:31, 28.68it/s]
 29%|██▉       | 34005/117500 [21:38<49:32, 28.08it/s]  

{'loss': 0.6794, 'learning_rate': 0.0007106382978723405, 'epoch': 2.89}


 29%|██▉       | 34500/117500 [21:55<48:06, 28.76it/s]
 29%|██▉       | 34503/117500 [21:55<49:08, 28.15it/s]  

{'loss': 0.6823, 'learning_rate': 0.0007063829787234043, 'epoch': 2.94}


 30%|██▉       | 34998/117500 [22:13<48:46, 28.19it/s]
 30%|██▉       | 35004/117500 [22:13<50:40, 27.13it/s]  

{'loss': 0.6817, 'learning_rate': 0.0007021276595744682, 'epoch': 2.98}


 30%|███       | 35250/117500 [22:22<48:52, 28.05it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 30%|███       | 35250/117500 [22:34<48:52, 28.05it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.7248451709747314, 'eval_accuracy': {'accuracy': 0.591098629903838}, 'eval_runtime': 37.3897, 'eval_samples_per_second': 314.285, 'eval_steps_per_second': 78.578, 'epoch': 3.0}


 30%|███       | 35500/117500 [23:09<57:45, 23.66it/s]    
 30%|███       | 35503/117500 [23:09<56:18, 24.27it/s]  

{'loss': 0.6896, 'learning_rate': 0.0006978723404255319, 'epoch': 3.02}


 31%|███       | 35998/117500 [23:27<49:17, 27.55it/s]
 31%|███       | 36004/117500 [23:27<50:01, 27.15it/s]  

{'loss': 0.6825, 'learning_rate': 0.0006936170212765957, 'epoch': 3.06}


 31%|███       | 36499/117500 [23:45<47:21, 28.50it/s]
 31%|███       | 36505/117500 [23:45<48:39, 27.74it/s]  

{'loss': 0.6809, 'learning_rate': 0.0006893617021276596, 'epoch': 3.11}


 31%|███▏      | 37000/117500 [24:03<48:37, 27.59it/s]  
 31%|███▏      | 37003/117500 [24:03<49:43, 26.98it/s]  

{'loss': 0.6839, 'learning_rate': 0.0006851063829787234, 'epoch': 3.15}


 32%|███▏      | 37498/117500 [24:21<47:10, 28.26it/s]
 32%|███▏      | 37504/117500 [24:21<47:57, 27.80it/s]  

{'loss': 0.6886, 'learning_rate': 0.0006808510638297873, 'epoch': 3.19}


 32%|███▏      | 37999/117500 [24:38<46:14, 28.65it/s]
 32%|███▏      | 38005/117500 [24:39<47:17, 28.02it/s]  

{'loss': 0.6823, 'learning_rate': 0.000676595744680851, 'epoch': 3.23}


 33%|███▎      | 38500/117500 [24:56<46:33, 28.28it/s]
 33%|███▎      | 38503/117500 [24:56<47:44, 27.58it/s]  

{'loss': 0.6949, 'learning_rate': 0.0006723404255319148, 'epoch': 3.28}


 33%|███▎      | 38998/117500 [25:14<46:46, 27.97it/s]
 33%|███▎      | 39004/117500 [25:14<48:07, 27.19it/s]  

{'loss': 0.6802, 'learning_rate': 0.0006680851063829787, 'epoch': 3.32}


 34%|███▎      | 39499/117500 [25:32<44:57, 28.91it/s]
 34%|███▎      | 39502/117500 [25:32<50:00, 26.00it/s]  

{'loss': 0.6869, 'learning_rate': 0.0006638297872340425, 'epoch': 3.36}


 34%|███▍      | 40000/117500 [25:49<44:49, 28.82it/s]
 34%|███▍      | 40003/117500 [25:50<46:05, 28.02it/s]  

{'loss': 0.6896, 'learning_rate': 0.0006595744680851064, 'epoch': 3.4}


 34%|███▍      | 40498/117500 [26:07<45:57, 27.93it/s]
 34%|███▍      | 40504/117500 [26:07<47:32, 27.00it/s]  

{'loss': 0.6888, 'learning_rate': 0.0006553191489361702, 'epoch': 3.45}


 35%|███▍      | 40999/117500 [26:25<44:29, 28.66it/s]
 35%|███▍      | 41005/117500 [26:25<46:32, 27.40it/s]  

{'loss': 0.6797, 'learning_rate': 0.0006510638297872342, 'epoch': 3.49}


 35%|███▌      | 41500/117500 [26:46<46:26, 27.28it/s]  
 35%|███▌      | 41503/117500 [26:47<47:33, 26.64it/s]  

{'loss': 0.6733, 'learning_rate': 0.0006468085106382979, 'epoch': 3.53}


 36%|███▌      | 41998/117500 [27:05<46:42, 26.94it/s]
 36%|███▌      | 42004/117500 [27:05<47:24, 26.54it/s]  

{'loss': 0.6907, 'learning_rate': 0.0006425531914893617, 'epoch': 3.57}


 36%|███▌      | 42499/117500 [27:23<44:37, 28.01it/s]
 36%|███▌      | 42502/117500 [27:23<49:11, 25.41it/s]  

{'loss': 0.6669, 'learning_rate': 0.0006382978723404256, 'epoch': 3.62}


 37%|███▋      | 43000/117500 [27:41<44:30, 27.90it/s]
 37%|███▋      | 43003/117500 [27:41<45:38, 27.20it/s]  

{'loss': 0.6796, 'learning_rate': 0.0006340425531914894, 'epoch': 3.66}


 37%|███▋      | 43498/117500 [27:58<44:40, 27.61it/s]
 37%|███▋      | 43504/117500 [27:59<45:42, 26.98it/s]  

{'loss': 0.7029, 'learning_rate': 0.0006297872340425533, 'epoch': 3.7}


 37%|███▋      | 43999/117500 [28:17<44:09, 27.75it/s]
 37%|███▋      | 44005/117500 [28:17<44:47, 27.35it/s]  

{'loss': 0.6846, 'learning_rate': 0.000625531914893617, 'epoch': 3.74}


 38%|███▊      | 44500/117500 [28:35<42:27, 28.66it/s]
 38%|███▊      | 44503/117500 [28:35<45:44, 26.60it/s]  

{'loss': 0.6853, 'learning_rate': 0.0006212765957446808, 'epoch': 3.79}


 38%|███▊      | 44998/117500 [28:53<42:34, 28.38it/s]
 38%|███▊      | 45004/117500 [28:53<43:27, 27.80it/s]  

{'loss': 0.663, 'learning_rate': 0.0006170212765957447, 'epoch': 3.83}


 39%|███▊      | 45499/117500 [29:11<42:11, 28.44it/s]
 39%|███▊      | 45505/117500 [29:11<43:03, 27.86it/s]  

{'loss': 0.6763, 'learning_rate': 0.0006127659574468085, 'epoch': 3.87}


 39%|███▉      | 46000/117500 [29:29<42:30, 28.03it/s]
 39%|███▉      | 46003/117500 [29:29<45:48, 26.01it/s]  

{'loss': 0.6772, 'learning_rate': 0.0006085106382978724, 'epoch': 3.91}


 40%|███▉      | 46498/117500 [29:46<41:49, 28.29it/s]
 40%|███▉      | 46504/117500 [29:47<41:56, 28.22it/s]  

{'loss': 0.6613, 'learning_rate': 0.0006042553191489362, 'epoch': 3.96}


 40%|███▉      | 46999/117500 [30:04<41:14, 28.49it/s]
 40%|████      | 47000/117500 [30:04<41:14, 28.49it/s]  

{'loss': 0.673, 'learning_rate': 0.0006, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 40%|████      | 47000/117500 [30:14<41:14, 28.49it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.6427028179168701, 'eval_accuracy': {'accuracy': 0.6364564717896349}, 'eval_runtime': 37.481, 'eval_samples_per_second': 313.519, 'eval_steps_per_second': 78.386, 'epoch': 4.0}


 40%|████      | 47499/117500 [31:00<42:12, 27.64it/s]   
 40%|████      | 47505/117500 [31:00<43:07, 27.05it/s]  

{'loss': 0.6652, 'learning_rate': 0.0005957446808510638, 'epoch': 4.04}


 41%|████      | 48000/117500 [31:18<42:04, 27.53it/s]
 41%|████      | 48003/117500 [31:18<42:48, 27.06it/s]  

{'loss': 0.657, 'learning_rate': 0.0005914893617021276, 'epoch': 4.09}


 41%|████▏     | 48498/117500 [31:35<40:39, 28.28it/s]
 41%|████▏     | 48504/117500 [31:36<41:50, 27.48it/s]  

{'loss': 0.6705, 'learning_rate': 0.0005872340425531915, 'epoch': 4.13}


 42%|████▏     | 48999/117500 [31:53<40:13, 28.38it/s]
 42%|████▏     | 49005/117500 [31:53<41:20, 27.61it/s]  

{'loss': 0.6583, 'learning_rate': 0.0005829787234042553, 'epoch': 4.17}


 42%|████▏     | 49500/117500 [32:11<39:56, 28.37it/s]
 42%|████▏     | 49503/117500 [32:11<41:15, 27.47it/s]  

{'loss': 0.6424, 'learning_rate': 0.0005787234042553191, 'epoch': 4.21}


 43%|████▎     | 49998/117500 [32:29<39:46, 28.28it/s]  
 43%|████▎     | 50004/117500 [32:29<40:54, 27.50it/s]  

{'loss': 0.6685, 'learning_rate': 0.0005744680851063831, 'epoch': 4.26}


 43%|████▎     | 50499/117500 [32:47<39:05, 28.57it/s]
 43%|████▎     | 50505/117500 [32:48<41:21, 27.00it/s]  

{'loss': 0.6542, 'learning_rate': 0.0005702127659574468, 'epoch': 4.3}


 43%|████▎     | 51000/117500 [33:05<39:17, 28.21it/s]
 43%|████▎     | 51003/117500 [33:05<40:28, 27.39it/s]  

{'loss': 0.6762, 'learning_rate': 0.0005659574468085107, 'epoch': 4.34}


 44%|████▍     | 51498/117500 [33:23<38:55, 28.26it/s]
 44%|████▍     | 51504/117500 [33:23<40:08, 27.40it/s]  

{'loss': 0.6629, 'learning_rate': 0.0005617021276595745, 'epoch': 4.38}


 44%|████▍     | 51999/117500 [33:41<38:08, 28.63it/s]
 44%|████▍     | 52005/117500 [33:41<39:57, 27.32it/s]  

{'loss': 0.6637, 'learning_rate': 0.0005574468085106383, 'epoch': 4.43}


 45%|████▍     | 52500/117500 [33:59<39:28, 27.45it/s]
 45%|████▍     | 52503/117500 [33:59<40:29, 26.76it/s]  

{'loss': 0.6566, 'learning_rate': 0.0005531914893617022, 'epoch': 4.47}


 45%|████▌     | 52998/117500 [34:17<38:33, 27.88it/s]
 45%|████▌     | 53004/117500 [34:18<39:30, 27.21it/s]  

{'loss': 0.6749, 'learning_rate': 0.000548936170212766, 'epoch': 4.51}


 46%|████▌     | 53499/117500 [34:36<37:54, 28.13it/s]
 46%|████▌     | 53505/117500 [34:36<38:18, 27.85it/s]  

{'loss': 0.6562, 'learning_rate': 0.0005446808510638298, 'epoch': 4.55}


 46%|████▌     | 54000/117500 [34:54<36:51, 28.72it/s]
 46%|████▌     | 54003/117500 [34:54<38:49, 27.26it/s]  

{'loss': 0.6659, 'learning_rate': 0.0005404255319148936, 'epoch': 4.6}


 46%|████▋     | 54498/117500 [35:11<36:31, 28.75it/s]
 46%|████▋     | 54504/117500 [35:12<39:41, 26.46it/s]  

{'loss': 0.6541, 'learning_rate': 0.0005361702127659575, 'epoch': 4.64}


 47%|████▋     | 54999/117500 [35:29<37:28, 27.80it/s]
 47%|████▋     | 55005/117500 [35:30<39:03, 26.67it/s]  

{'loss': 0.6714, 'learning_rate': 0.0005319148936170213, 'epoch': 4.68}


 47%|████▋     | 55500/117500 [35:47<36:00, 28.70it/s]
 47%|████▋     | 55503/117500 [35:47<38:19, 26.96it/s]  

{'loss': 0.656, 'learning_rate': 0.0005276595744680851, 'epoch': 4.72}


 48%|████▊     | 55999/117500 [36:05<37:29, 27.34it/s]
 48%|████▊     | 56005/117500 [36:05<37:41, 27.19it/s]  

{'loss': 0.6343, 'learning_rate': 0.000523404255319149, 'epoch': 4.77}


 48%|████▊     | 56500/117500 [36:23<35:14, 28.84it/s]
 48%|████▊     | 56503/117500 [36:23<36:42, 27.70it/s]  

{'loss': 0.6717, 'learning_rate': 0.0005191489361702127, 'epoch': 4.81}


 49%|████▊     | 56998/117500 [36:40<35:18, 28.56it/s]
 49%|████▊     | 57004/117500 [36:41<35:54, 28.08it/s]  

{'loss': 0.6781, 'learning_rate': 0.0005148936170212766, 'epoch': 4.85}


 49%|████▉     | 57499/117500 [36:58<34:52, 28.67it/s]
 49%|████▉     | 57505/117500 [36:58<36:18, 27.54it/s]  

{'loss': 0.641, 'learning_rate': 0.0005106382978723404, 'epoch': 4.89}


 49%|████▉     | 58000/117500 [37:16<39:20, 25.21it/s]
 49%|████▉     | 58003/117500 [37:16<39:44, 24.95it/s]  

{'loss': 0.6687, 'learning_rate': 0.0005063829787234042, 'epoch': 4.94}


 50%|████▉     | 58498/117500 [37:34<34:24, 28.58it/s]
 50%|████▉     | 58504/117500 [37:34<35:39, 27.57it/s]  

{'loss': 0.6743, 'learning_rate': 0.0005021276595744681, 'epoch': 4.98}


 50%|█████     | 58750/117500 [37:43<35:12, 27.80it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 50%|█████     | 58750/117500 [37:54<35:12, 27.80it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.6294941902160645, 'eval_accuracy': {'accuracy': 0.6469236660709727}, 'eval_runtime': 37.5722, 'eval_samples_per_second': 312.758, 'eval_steps_per_second': 78.196, 'epoch': 5.0}


 50%|█████     | 59000/117500 [38:30<34:17, 28.43it/s]   
 50%|█████     | 59003/117500 [38:30<37:35, 25.94it/s]  

{'loss': 0.6519, 'learning_rate': 0.000497872340425532, 'epoch': 5.02}


 51%|█████     | 59498/117500 [38:47<36:31, 26.47it/s]
 51%|█████     | 59504/117500 [38:48<36:21, 26.59it/s]  

{'loss': 0.6588, 'learning_rate': 0.0004936170212765957, 'epoch': 5.06}


 51%|█████     | 59999/117500 [39:06<36:25, 26.31it/s]
 51%|█████     | 60005/117500 [39:06<36:58, 25.92it/s]  

{'loss': 0.6552, 'learning_rate': 0.0004893617021276596, 'epoch': 5.11}


 51%|█████▏    | 60500/117500 [39:25<33:25, 28.42it/s]
 51%|█████▏    | 60503/117500 [39:25<34:50, 27.27it/s]  

{'loss': 0.6619, 'learning_rate': 0.0004851063829787234, 'epoch': 5.15}


 52%|█████▏    | 60998/117500 [39:42<32:48, 28.71it/s]
 52%|█████▏    | 61004/117500 [39:42<33:50, 27.83it/s]  

{'loss': 0.6572, 'learning_rate': 0.00048085106382978723, 'epoch': 5.19}


 52%|█████▏    | 61499/117500 [40:00<32:29, 28.73it/s]
 52%|█████▏    | 61505/117500 [40:00<34:00, 27.44it/s]  

{'loss': 0.6456, 'learning_rate': 0.0004765957446808511, 'epoch': 5.23}


 53%|█████▎    | 62000/117500 [40:18<32:56, 28.08it/s]
 53%|█████▎    | 62003/117500 [40:18<33:46, 27.38it/s]  

{'loss': 0.6442, 'learning_rate': 0.0004723404255319149, 'epoch': 5.28}


 53%|█████▎    | 62498/117500 [40:36<32:36, 28.11it/s]
 53%|█████▎    | 62504/117500 [40:36<33:04, 27.72it/s]  

{'loss': 0.6554, 'learning_rate': 0.00046808510638297874, 'epoch': 5.32}


 54%|█████▎    | 62999/117500 [40:53<31:46, 28.58it/s]
 54%|█████▎    | 63005/117500 [40:54<32:41, 27.78it/s]  

{'loss': 0.6495, 'learning_rate': 0.00046382978723404257, 'epoch': 5.36}


 54%|█████▍    | 63500/117500 [41:12<32:00, 28.12it/s]
 54%|█████▍    | 63503/117500 [41:12<32:44, 27.49it/s]  

{'loss': 0.645, 'learning_rate': 0.0004595744680851064, 'epoch': 5.4}


 54%|█████▍    | 63998/117500 [41:30<30:56, 28.82it/s]
 54%|█████▍    | 64004/117500 [41:30<31:50, 28.00it/s]  

{'loss': 0.6659, 'learning_rate': 0.00045531914893617024, 'epoch': 5.45}


 55%|█████▍    | 64499/117500 [41:48<35:07, 25.15it/s]
 55%|█████▍    | 64505/117500 [41:48<34:23, 25.68it/s]  

{'loss': 0.6479, 'learning_rate': 0.000451063829787234, 'epoch': 5.49}


 55%|█████▌    | 65000/117500 [42:06<31:48, 27.52it/s]
 55%|█████▌    | 65003/117500 [42:06<32:21, 27.04it/s]  

{'loss': 0.6442, 'learning_rate': 0.00044680851063829785, 'epoch': 5.53}


 56%|█████▌    | 65498/117500 [42:23<30:38, 28.28it/s]
 56%|█████▌    | 65504/117500 [42:24<31:35, 27.43it/s]  

{'loss': 0.6559, 'learning_rate': 0.0004425531914893617, 'epoch': 5.57}


 56%|█████▌    | 65999/117500 [42:41<29:52, 28.73it/s]
 56%|█████▌    | 66005/117500 [42:42<30:49, 27.84it/s]  

{'loss': 0.6283, 'learning_rate': 0.00043829787234042557, 'epoch': 5.62}


 57%|█████▋    | 66500/117500 [42:59<29:40, 28.65it/s]
 57%|█████▋    | 66503/117500 [42:59<30:39, 27.72it/s]  

{'loss': 0.6608, 'learning_rate': 0.0004340425531914894, 'epoch': 5.66}


 57%|█████▋    | 66998/117500 [43:17<29:24, 28.62it/s]
 57%|█████▋    | 67004/117500 [43:17<30:37, 27.48it/s]  

{'loss': 0.6433, 'learning_rate': 0.0004297872340425532, 'epoch': 5.7}


 57%|█████▋    | 67500/117500 [43:35<29:39, 28.10it/s]
 57%|█████▋    | 67503/117500 [43:35<30:05, 27.70it/s]  

{'loss': 0.6498, 'learning_rate': 0.000425531914893617, 'epoch': 5.74}


 58%|█████▊    | 67998/117500 [43:53<29:12, 28.24it/s]
 58%|█████▊    | 68004/117500 [43:53<29:50, 27.64it/s]  

{'loss': 0.6536, 'learning_rate': 0.00042127659574468085, 'epoch': 5.79}


 58%|█████▊    | 68499/117500 [44:11<28:20, 28.81it/s]
 58%|█████▊    | 68505/117500 [44:11<28:52, 28.29it/s]  

{'loss': 0.6431, 'learning_rate': 0.0004170212765957447, 'epoch': 5.83}


 59%|█████▊    | 69000/117500 [44:29<28:12, 28.65it/s]
 59%|█████▊    | 69003/117500 [44:29<28:59, 27.87it/s]  

{'loss': 0.6386, 'learning_rate': 0.0004127659574468085, 'epoch': 5.87}


 59%|█████▉    | 69498/117500 [44:47<28:01, 28.54it/s]
 59%|█████▉    | 69504/117500 [44:47<28:34, 27.99it/s]  

{'loss': 0.6532, 'learning_rate': 0.00040851063829787235, 'epoch': 5.91}


 60%|█████▉    | 70000/117500 [45:05<29:01, 27.27it/s]
 60%|█████▉    | 70003/117500 [45:05<29:51, 26.52it/s]  

{'loss': 0.6388, 'learning_rate': 0.00040425531914893613, 'epoch': 5.96}


 60%|█████▉    | 70498/117500 [45:23<27:22, 28.62it/s]
 60%|██████    | 70500/117500 [45:23<27:22, 28.62it/s]  

{'loss': 0.6447, 'learning_rate': 0.0004, 'epoch': 6.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 60%|██████    | 70500/117500 [45:35<27:22, 28.62it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.6198561191558838, 'eval_accuracy': {'accuracy': 0.6636881967492129}, 'eval_runtime': 37.6672, 'eval_samples_per_second': 311.969, 'eval_steps_per_second': 77.999, 'epoch': 6.0}


 60%|██████    | 70999/117500 [46:18<26:59, 28.72it/s]   
 60%|██████    | 71002/117500 [46:19<29:34, 26.20it/s]  

{'loss': 0.6394, 'learning_rate': 0.00039574468085106385, 'epoch': 6.04}


 61%|██████    | 71500/117500 [46:36<27:27, 27.93it/s]
 61%|██████    | 71503/117500 [46:36<28:26, 26.95it/s]  

{'loss': 0.6419, 'learning_rate': 0.0003914893617021277, 'epoch': 6.09}


 61%|██████▏   | 71998/117500 [46:54<26:16, 28.86it/s]
 61%|██████▏   | 72004/117500 [46:54<27:34, 27.50it/s]  

{'loss': 0.6501, 'learning_rate': 0.0003872340425531915, 'epoch': 6.13}


 62%|██████▏   | 72499/117500 [47:12<26:55, 27.85it/s]
 62%|██████▏   | 72505/117500 [47:13<27:02, 27.74it/s]  

{'loss': 0.6505, 'learning_rate': 0.0003829787234042553, 'epoch': 6.17}


 62%|██████▏   | 73000/117500 [47:30<27:09, 27.31it/s]
 62%|██████▏   | 73003/117500 [47:30<27:35, 26.88it/s]  

{'loss': 0.6412, 'learning_rate': 0.00037872340425531913, 'epoch': 6.21}


 63%|██████▎   | 73498/117500 [47:48<25:34, 28.67it/s]
 63%|██████▎   | 73504/117500 [47:49<26:05, 28.10it/s]  

{'loss': 0.6405, 'learning_rate': 0.00037446808510638297, 'epoch': 6.26}


 63%|██████▎   | 73999/117500 [48:06<27:03, 26.80it/s]
 63%|██████▎   | 74002/117500 [48:06<28:36, 25.34it/s]  

{'loss': 0.6468, 'learning_rate': 0.0003702127659574468, 'epoch': 6.3}


 63%|██████▎   | 74500/117500 [48:25<25:20, 28.29it/s]
 63%|██████▎   | 74503/117500 [48:25<27:06, 26.43it/s]  

{'loss': 0.6488, 'learning_rate': 0.00036595744680851063, 'epoch': 6.34}


 64%|██████▍   | 74998/117500 [48:42<27:19, 25.92it/s]
 64%|██████▍   | 75004/117500 [48:43<27:04, 26.17it/s]  

{'loss': 0.6619, 'learning_rate': 0.0003617021276595745, 'epoch': 6.38}


 64%|██████▍   | 75499/117500 [49:01<25:07, 27.86it/s]
 64%|██████▍   | 75505/117500 [49:01<25:24, 27.55it/s]  

{'loss': 0.6444, 'learning_rate': 0.0003574468085106383, 'epoch': 6.43}


 65%|██████▍   | 76000/117500 [49:19<24:59, 27.68it/s]
 65%|██████▍   | 76003/117500 [49:19<25:51, 26.75it/s]  

{'loss': 0.6655, 'learning_rate': 0.00035319148936170213, 'epoch': 6.47}


 65%|██████▌   | 76498/117500 [49:36<24:15, 28.17it/s]
 65%|██████▌   | 76504/117500 [49:37<25:41, 26.60it/s]    

{'loss': 0.6685, 'learning_rate': 0.00034893617021276597, 'epoch': 6.51}


 66%|██████▌   | 76999/117500 [49:54<23:49, 28.33it/s]
 66%|██████▌   | 77005/117500 [49:55<25:01, 26.97it/s]    

{'loss': 0.6411, 'learning_rate': 0.0003446808510638298, 'epoch': 6.55}


 66%|██████▌   | 77500/117500 [50:12<24:19, 27.40it/s]
 66%|██████▌   | 77503/117500 [50:13<24:28, 27.23it/s]    

{'loss': 0.6521, 'learning_rate': 0.00034042553191489364, 'epoch': 6.6}


 66%|██████▋   | 77998/117500 [50:30<23:03, 28.55it/s]
 66%|██████▋   | 78004/117500 [50:30<24:06, 27.31it/s]    

{'loss': 0.6402, 'learning_rate': 0.0003361702127659574, 'epoch': 6.64}


 67%|██████▋   | 78499/117500 [50:48<22:59, 28.27it/s]
 67%|██████▋   | 78505/117500 [50:48<23:37, 27.51it/s]    

{'loss': 0.6355, 'learning_rate': 0.00033191489361702125, 'epoch': 6.68}


 67%|██████▋   | 79000/117500 [51:06<23:52, 26.88it/s]
 67%|██████▋   | 79003/117500 [51:06<24:56, 25.72it/s]    

{'loss': 0.6468, 'learning_rate': 0.0003276595744680851, 'epoch': 6.72}


 68%|██████▊   | 79498/117500 [51:24<22:55, 27.63it/s]
 68%|██████▊   | 79504/117500 [51:24<23:24, 27.05it/s]    

{'loss': 0.657, 'learning_rate': 0.00032340425531914897, 'epoch': 6.77}


 68%|██████▊   | 79999/117500 [51:42<22:39, 27.58it/s]
 68%|██████▊   | 80005/117500 [51:42<22:55, 27.25it/s]    

{'loss': 0.6542, 'learning_rate': 0.0003191489361702128, 'epoch': 6.81}


 69%|██████▊   | 80500/117500 [52:00<21:44, 28.35it/s]
 69%|██████▊   | 80503/117500 [52:00<22:13, 27.75it/s]    

{'loss': 0.6354, 'learning_rate': 0.00031489361702127664, 'epoch': 6.85}


 69%|██████▉   | 80998/117500 [52:18<21:44, 27.99it/s]
 69%|██████▉   | 81004/117500 [52:18<22:44, 26.74it/s]    

{'loss': 0.6414, 'learning_rate': 0.0003106382978723404, 'epoch': 6.89}


 69%|██████▉   | 81499/117500 [52:36<23:20, 25.71it/s]
 69%|██████▉   | 81502/117500 [52:36<23:54, 25.09it/s]    

{'loss': 0.6478, 'learning_rate': 0.00030638297872340425, 'epoch': 6.94}


 70%|██████▉   | 82000/117500 [52:54<20:51, 28.37it/s]
 70%|██████▉   | 82003/117500 [52:54<21:35, 27.40it/s]    

{'loss': 0.6376, 'learning_rate': 0.0003021276595744681, 'epoch': 6.98}


 70%|██████▉   | 82249/117500 [53:03<21:39, 27.13it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 70%|███████   | 82250/117500 [53:15<21:39, 27.13it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.6186705827713013, 'eval_accuracy': {'accuracy': 0.6619862139392392}, 'eval_runtime': 37.986, 'eval_samples_per_second': 309.351, 'eval_steps_per_second': 77.344, 'epoch': 7.0}


 70%|███████   | 82500/117500 [53:50<20:23, 28.61it/s]   
 70%|███████   | 82503/117500 [53:50<20:58, 27.81it/s]    

{'loss': 0.6463, 'learning_rate': 0.0002978723404255319, 'epoch': 7.02}


 71%|███████   | 82998/117500 [54:08<21:03, 27.30it/s]
 71%|███████   | 83004/117500 [54:08<21:48, 26.36it/s]    

{'loss': 0.6435, 'learning_rate': 0.00029361702127659575, 'epoch': 7.06}


 71%|███████   | 83499/117500 [54:26<20:06, 28.18it/s]
 71%|███████   | 83505/117500 [54:26<20:24, 27.76it/s]    

{'loss': 0.6204, 'learning_rate': 0.00028936170212765953, 'epoch': 7.11}


 71%|███████▏  | 84000/117500 [54:43<19:51, 28.12it/s]
 71%|███████▏  | 84003/117500 [54:43<20:20, 27.45it/s]    

{'loss': 0.6597, 'learning_rate': 0.0002851063829787234, 'epoch': 7.15}


 72%|███████▏  | 84498/117500 [55:01<20:02, 27.44it/s]
 72%|███████▏  | 84504/117500 [55:01<20:36, 26.69it/s]    

{'loss': 0.6204, 'learning_rate': 0.00028085106382978725, 'epoch': 7.19}


 72%|███████▏  | 84999/117500 [55:19<18:45, 28.87it/s]
 72%|███████▏  | 85005/117500 [55:19<19:44, 27.44it/s]    

{'loss': 0.6411, 'learning_rate': 0.0002765957446808511, 'epoch': 7.23}


 73%|███████▎  | 85500/117500 [55:37<18:34, 28.71it/s]
 73%|███████▎  | 85503/117500 [55:37<20:08, 26.48it/s]    

{'loss': 0.643, 'learning_rate': 0.0002723404255319149, 'epoch': 7.28}


 73%|███████▎  | 85998/117500 [55:55<18:59, 27.65it/s]
 73%|███████▎  | 86004/117500 [55:55<20:49, 25.20it/s]    

{'loss': 0.6312, 'learning_rate': 0.00026808510638297875, 'epoch': 7.32}


 74%|███████▎  | 86499/117500 [56:13<18:08, 28.48it/s]
 74%|███████▎  | 86502/117500 [56:13<19:32, 26.43it/s]    

{'loss': 0.6559, 'learning_rate': 0.00026382978723404253, 'epoch': 7.36}


 74%|███████▍  | 87000/117500 [56:30<17:40, 28.75it/s]
 74%|███████▍  | 87003/117500 [56:30<18:16, 27.81it/s]    

{'loss': 0.6195, 'learning_rate': 0.00025957446808510637, 'epoch': 7.4}


 74%|███████▍  | 87498/117500 [56:48<17:31, 28.54it/s]
 74%|███████▍  | 87504/117500 [56:48<17:50, 28.01it/s]    

{'loss': 0.6547, 'learning_rate': 0.0002553191489361702, 'epoch': 7.45}


 75%|███████▍  | 87999/117500 [57:06<17:16, 28.46it/s]
 75%|███████▍  | 88005/117500 [57:06<18:07, 27.12it/s]    

{'loss': 0.6536, 'learning_rate': 0.00025106382978723403, 'epoch': 7.49}


 75%|███████▌  | 88500/117500 [57:24<17:27, 27.67it/s]
 75%|███████▌  | 88503/117500 [57:24<18:57, 25.49it/s]    

{'loss': 0.6328, 'learning_rate': 0.00024680851063829787, 'epoch': 7.53}


 76%|███████▌  | 88998/117500 [57:42<17:35, 27.00it/s]
 76%|███████▌  | 89004/117500 [57:42<18:58, 25.03it/s]    

{'loss': 0.636, 'learning_rate': 0.0002425531914893617, 'epoch': 7.57}


 76%|███████▌  | 89499/117500 [58:00<16:11, 28.84it/s]
 76%|███████▌  | 89505/117500 [58:00<17:11, 27.14it/s]    

{'loss': 0.6245, 'learning_rate': 0.00023829787234042556, 'epoch': 7.62}


 77%|███████▋  | 90000/117500 [58:18<16:11, 28.31it/s]
 77%|███████▋  | 90003/117500 [58:18<16:38, 27.55it/s]    

{'loss': 0.6429, 'learning_rate': 0.00023404255319148937, 'epoch': 7.66}


 77%|███████▋  | 90498/117500 [58:36<15:54, 28.30it/s]
 77%|███████▋  | 90504/117500 [58:37<16:12, 27.75it/s]    

{'loss': 0.6348, 'learning_rate': 0.0002297872340425532, 'epoch': 7.7}


 77%|███████▋  | 90999/117500 [58:54<15:31, 28.46it/s]
 77%|███████▋  | 91005/117500 [58:55<15:50, 27.86it/s]    

{'loss': 0.6255, 'learning_rate': 0.000225531914893617, 'epoch': 7.74}


 78%|███████▊  | 91500/117500 [59:12<15:15, 28.40it/s]
 78%|███████▊  | 91503/117500 [59:12<15:39, 27.69it/s]    

{'loss': 0.6366, 'learning_rate': 0.00022127659574468084, 'epoch': 7.79}


 78%|███████▊  | 91998/117500 [59:30<14:48, 28.72it/s]
 78%|███████▊  | 92004/117500 [59:30<15:21, 27.67it/s]    

{'loss': 0.6496, 'learning_rate': 0.0002170212765957447, 'epoch': 7.83}


 79%|███████▊  | 92499/117500 [59:48<14:48, 28.14it/s]
 79%|███████▊  | 92502/117500 [59:48<15:56, 26.13it/s]    

{'loss': 0.6471, 'learning_rate': 0.0002127659574468085, 'epoch': 7.87}


 79%|███████▉  | 93000/117500 [1:00:06<14:26, 28.28it/s]
 79%|███████▉  | 93003/117500 [1:00:06<14:57, 27.31it/s]  

{'loss': 0.6394, 'learning_rate': 0.00020851063829787234, 'epoch': 7.91}


 80%|███████▉  | 93498/117500 [1:00:24<14:10, 28.22it/s]
 80%|███████▉  | 93504/117500 [1:00:24<14:38, 27.31it/s]  

{'loss': 0.6489, 'learning_rate': 0.00020425531914893618, 'epoch': 7.96}


 80%|███████▉  | 93999/117500 [1:00:42<13:53, 28.18it/s]
 80%|████████  | 94000/117500 [1:00:42<13:53, 28.18it/s]  

{'loss': 0.652, 'learning_rate': 0.0002, 'epoch': 8.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 80%|████████  | 94000/117500 [1:00:55<13:53, 28.18it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.6089786887168884, 'eval_accuracy': {'accuracy': 0.676708365245511}, 'eval_runtime': 37.7441, 'eval_samples_per_second': 311.334, 'eval_steps_per_second': 77.84, 'epoch': 8.0}


 80%|████████  | 94499/117500 [1:01:38<13:30, 28.38it/s]   
 80%|████████  | 94505/117500 [1:01:38<13:42, 27.95it/s]  

{'loss': 0.6399, 'learning_rate': 0.00019574468085106384, 'epoch': 8.04}


 81%|████████  | 95000/117500 [1:01:56<13:19, 28.14it/s]
 81%|████████  | 95003/117500 [1:01:56<13:40, 27.41it/s]  

{'loss': 0.6425, 'learning_rate': 0.00019148936170212765, 'epoch': 8.09}


 81%|████████▏ | 95498/117500 [1:02:14<12:57, 28.29it/s]
 81%|████████▏ | 95504/117500 [1:02:14<13:13, 27.71it/s]  

{'loss': 0.6381, 'learning_rate': 0.00018723404255319148, 'epoch': 8.13}


 82%|████████▏ | 96000/117500 [1:02:32<12:46, 28.06it/s]
 82%|████████▏ | 96003/117500 [1:02:33<13:22, 26.77it/s]  

{'loss': 0.6504, 'learning_rate': 0.00018297872340425532, 'epoch': 8.17}


 82%|████████▏ | 96498/117500 [1:02:50<12:12, 28.66it/s]
 82%|████████▏ | 96504/117500 [1:02:51<12:24, 28.20it/s]  

{'loss': 0.6452, 'learning_rate': 0.00017872340425531915, 'epoch': 8.21}


 83%|████████▎ | 96999/117500 [1:03:08<11:56, 28.62it/s]
 83%|████████▎ | 97005/117500 [1:03:08<12:16, 27.85it/s]  

{'loss': 0.6392, 'learning_rate': 0.00017446808510638298, 'epoch': 8.26}


 83%|████████▎ | 97499/117500 [1:03:26<12:14, 27.25it/s]
 83%|████████▎ | 97505/117500 [1:03:26<12:31, 26.61it/s]  

{'loss': 0.6316, 'learning_rate': 0.00017021276595744682, 'epoch': 8.3}


 83%|████████▎ | 98000/117500 [1:03:45<11:24, 28.50it/s]
 83%|████████▎ | 98003/117500 [1:03:45<12:21, 26.29it/s]  

{'loss': 0.6302, 'learning_rate': 0.00016595744680851062, 'epoch': 8.34}


 84%|████████▍ | 98498/117500 [1:04:03<11:17, 28.03it/s]
 84%|████████▍ | 98504/117500 [1:04:03<11:40, 27.13it/s]  

{'loss': 0.6172, 'learning_rate': 0.00016170212765957449, 'epoch': 8.38}


 84%|████████▍ | 98999/117500 [1:04:21<11:19, 27.22it/s]
 84%|████████▍ | 99005/117500 [1:04:21<11:17, 27.32it/s]  

{'loss': 0.6115, 'learning_rate': 0.00015744680851063832, 'epoch': 8.43}


 85%|████████▍ | 99500/117500 [1:04:39<10:33, 28.40it/s]
 85%|████████▍ | 99503/117500 [1:04:39<10:55, 27.47it/s]  

{'loss': 0.6356, 'learning_rate': 0.00015319148936170213, 'epoch': 8.47}


 85%|████████▌ | 99998/117500 [1:04:57<10:20, 28.20it/s]
 85%|████████▌ | 100004/117500 [1:04:57<10:36, 27.48it/s] 

{'loss': 0.6174, 'learning_rate': 0.00014893617021276596, 'epoch': 8.51}


 86%|████████▌ | 100499/117500 [1:05:15<09:59, 28.37it/s]
 86%|████████▌ | 100505/117500 [1:05:15<10:12, 27.74it/s] 

{'loss': 0.6286, 'learning_rate': 0.00014468085106382977, 'epoch': 8.55}


 86%|████████▌ | 101000/117500 [1:05:32<09:37, 28.59it/s]
 86%|████████▌ | 101003/117500 [1:05:32<09:53, 27.78it/s] 

{'loss': 0.6415, 'learning_rate': 0.00014042553191489363, 'epoch': 8.6}


 86%|████████▋ | 101498/117500 [1:05:50<09:21, 28.51it/s]
 86%|████████▋ | 101504/117500 [1:05:50<09:37, 27.68it/s] 

{'loss': 0.643, 'learning_rate': 0.00013617021276595746, 'epoch': 8.64}


 87%|████████▋ | 101999/117500 [1:06:08<09:05, 28.40it/s]
 87%|████████▋ | 102002/117500 [1:06:08<09:54, 26.06it/s] 

{'loss': 0.6368, 'learning_rate': 0.00013191489361702127, 'epoch': 8.68}


 87%|████████▋ | 102500/117500 [1:06:26<08:49, 28.35it/s]
 87%|████████▋ | 102503/117500 [1:06:26<09:04, 27.55it/s] 

{'loss': 0.6156, 'learning_rate': 0.0001276595744680851, 'epoch': 8.72}


 88%|████████▊ | 102998/117500 [1:06:43<08:35, 28.11it/s]
 88%|████████▊ | 103004/117500 [1:06:44<08:47, 27.47it/s] 

{'loss': 0.6291, 'learning_rate': 0.00012340425531914893, 'epoch': 8.77}


 88%|████████▊ | 103499/117500 [1:07:02<08:14, 28.29it/s]
 88%|████████▊ | 103505/117500 [1:07:02<08:33, 27.23it/s] 

{'loss': 0.6365, 'learning_rate': 0.00011914893617021278, 'epoch': 8.81}


 89%|████████▊ | 104000/117500 [1:07:20<07:57, 28.28it/s]
 89%|████████▊ | 104003/117500 [1:07:20<08:02, 27.96it/s] 

{'loss': 0.6484, 'learning_rate': 0.0001148936170212766, 'epoch': 8.85}


 89%|████████▉ | 104498/117500 [1:07:38<07:46, 27.88it/s]
 89%|████████▉ | 104504/117500 [1:07:38<08:18, 26.06it/s] 

{'loss': 0.6257, 'learning_rate': 0.00011063829787234042, 'epoch': 8.89}


 89%|████████▉ | 104999/117500 [1:07:56<07:28, 27.86it/s]
 89%|████████▉ | 105005/117500 [1:07:56<07:40, 27.15it/s] 

{'loss': 0.6387, 'learning_rate': 0.00010638297872340425, 'epoch': 8.94}


 90%|████████▉ | 105500/117500 [1:08:14<07:14, 27.64it/s]
 90%|████████▉ | 105503/117500 [1:08:14<07:20, 27.22it/s] 

{'loss': 0.6342, 'learning_rate': 0.00010212765957446809, 'epoch': 8.98}


 90%|████████▉ | 105749/117500 [1:08:23<06:55, 28.25it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 90%|█████████ | 105750/117500 [1:08:35<06:55, 28.25it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.6131450533866882, 'eval_accuracy': {'accuracy': 0.6802825291464556}, 'eval_runtime': 37.85, 'eval_samples_per_second': 310.463, 'eval_steps_per_second': 77.622, 'epoch': 9.0}


 90%|█████████ | 106000/117500 [1:09:10<06:49, 28.12it/s]   
 90%|█████████ | 106003/117500 [1:09:10<07:00, 27.35it/s] 

{'loss': 0.6261, 'learning_rate': 9.787234042553192e-05, 'epoch': 9.02}


 91%|█████████ | 106498/117500 [1:09:27<06:29, 28.27it/s]
 91%|█████████ | 106504/117500 [1:09:28<06:59, 26.22it/s] 

{'loss': 0.6424, 'learning_rate': 9.361702127659574e-05, 'epoch': 9.06}


 91%|█████████ | 106999/117500 [1:09:46<06:17, 27.80it/s]
 91%|█████████ | 107005/117500 [1:09:46<06:23, 27.39it/s] 

{'loss': 0.6279, 'learning_rate': 8.936170212765958e-05, 'epoch': 9.11}


 91%|█████████▏| 107500/117500 [1:10:04<06:24, 26.00it/s]
 91%|█████████▏| 107503/117500 [1:10:04<06:42, 24.87it/s] 

{'loss': 0.6304, 'learning_rate': 8.510638297872341e-05, 'epoch': 9.15}


 92%|█████████▏| 107998/117500 [1:10:22<05:34, 28.42it/s]
 92%|█████████▏| 108004/117500 [1:10:23<05:42, 27.70it/s] 

{'loss': 0.6283, 'learning_rate': 8.085106382978724e-05, 'epoch': 9.19}


 92%|█████████▏| 108499/117500 [1:10:40<05:17, 28.38it/s]
 92%|█████████▏| 108505/117500 [1:10:41<05:22, 27.86it/s] 

{'loss': 0.6495, 'learning_rate': 7.659574468085106e-05, 'epoch': 9.23}


 93%|█████████▎| 109000/117500 [1:10:58<04:57, 28.53it/s]
 93%|█████████▎| 109003/117500 [1:10:59<05:11, 27.24it/s] 

{'loss': 0.6263, 'learning_rate': 7.234042553191488e-05, 'epoch': 9.28}


 93%|█████████▎| 109498/117500 [1:11:16<04:40, 28.54it/s]
 93%|█████████▎| 109504/117500 [1:11:16<04:51, 27.45it/s] 

{'loss': 0.6256, 'learning_rate': 6.808510638297873e-05, 'epoch': 9.32}


 94%|█████████▎| 109999/117500 [1:11:35<04:31, 27.60it/s]
 94%|█████████▎| 110005/117500 [1:11:35<04:33, 27.38it/s] 

{'loss': 0.6272, 'learning_rate': 6.382978723404255e-05, 'epoch': 9.36}


 94%|█████████▍| 110500/117500 [1:11:53<04:03, 28.75it/s]
 94%|█████████▍| 110503/117500 [1:11:53<04:13, 27.55it/s] 

{'loss': 0.6368, 'learning_rate': 5.957446808510639e-05, 'epoch': 9.4}


 94%|█████████▍| 110998/117500 [1:12:11<03:54, 27.78it/s]
 94%|█████████▍| 111004/117500 [1:12:11<03:58, 27.26it/s] 

{'loss': 0.6034, 'learning_rate': 5.531914893617021e-05, 'epoch': 9.45}


 95%|█████████▍| 111499/117500 [1:12:29<03:36, 27.68it/s]
 95%|█████████▍| 111505/117500 [1:12:29<03:43, 26.81it/s] 

{'loss': 0.644, 'learning_rate': 5.1063829787234044e-05, 'epoch': 9.49}


 95%|█████████▌| 112000/117500 [1:12:47<03:15, 28.17it/s]
 95%|█████████▌| 112003/117500 [1:12:47<03:23, 27.02it/s] 

{'loss': 0.6329, 'learning_rate': 4.680851063829787e-05, 'epoch': 9.53}


 96%|█████████▌| 112498/117500 [1:13:05<02:57, 28.15it/s]
 96%|█████████▌| 112504/117500 [1:13:05<03:06, 26.73it/s] 

{'loss': 0.618, 'learning_rate': 4.2553191489361704e-05, 'epoch': 9.57}


 96%|█████████▌| 112999/117500 [1:13:23<02:56, 25.51it/s]
 96%|█████████▌| 113005/117500 [1:13:23<02:53, 25.84it/s] 

{'loss': 0.6171, 'learning_rate': 3.829787234042553e-05, 'epoch': 9.62}


 97%|█████████▋| 113500/117500 [1:13:41<02:20, 28.51it/s]
 97%|█████████▋| 113503/117500 [1:13:41<02:29, 26.76it/s] 

{'loss': 0.636, 'learning_rate': 3.4042553191489365e-05, 'epoch': 9.66}


 97%|█████████▋| 113998/117500 [1:13:59<02:03, 28.31it/s]
 97%|█████████▋| 114004/117500 [1:13:59<02:09, 26.94it/s] 

{'loss': 0.6319, 'learning_rate': 2.9787234042553195e-05, 'epoch': 9.7}


 97%|█████████▋| 114499/117500 [1:14:17<01:46, 28.27it/s]
 97%|█████████▋| 114505/117500 [1:14:17<01:46, 28.11it/s] 

{'loss': 0.6301, 'learning_rate': 2.5531914893617022e-05, 'epoch': 9.74}


 98%|█████████▊| 115000/117500 [1:14:35<01:28, 28.25it/s]
 98%|█████████▊| 115003/117500 [1:14:35<01:30, 27.47it/s] 

{'loss': 0.6165, 'learning_rate': 2.1276595744680852e-05, 'epoch': 9.79}


 98%|█████████▊| 115498/117500 [1:14:53<01:10, 28.56it/s]
 98%|█████████▊| 115504/117500 [1:14:53<01:11, 27.96it/s] 

{'loss': 0.639, 'learning_rate': 1.7021276595744682e-05, 'epoch': 9.83}


 99%|█████████▊| 115999/117500 [1:15:11<00:52, 28.35it/s]
 99%|█████████▊| 116005/117500 [1:15:11<00:53, 27.70it/s] 

{'loss': 0.633, 'learning_rate': 1.2765957446808511e-05, 'epoch': 9.87}


 99%|█████████▉| 116500/117500 [1:15:29<00:34, 28.69it/s]
 99%|█████████▉| 116503/117500 [1:15:29<00:35, 27.81it/s] 

{'loss': 0.6125, 'learning_rate': 8.510638297872341e-06, 'epoch': 9.91}


100%|█████████▉| 116998/117500 [1:15:46<00:17, 28.59it/s]
100%|█████████▉| 117004/117500 [1:15:46<00:18, 27.45it/s] 

{'loss': 0.6279, 'learning_rate': 4.255319148936171e-06, 'epoch': 9.96}


100%|█████████▉| 117499/117500 [1:16:04<00:00, 26.44it/s]
100%|██████████| 117500/117500 [1:16:04<00:00, 26.44it/s] 

{'loss': 0.6256, 'learning_rate': 0.0, 'epoch': 10.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 117500/117500 [1:16:15<00:00, 26.44it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.6084820032119751, 'eval_accuracy': {'accuracy': 0.6867500638243553}, 'eval_runtime': 38.3153, 'eval_samples_per_second': 306.692, 'eval_steps_per_second': 76.68, 'epoch': 10.0}



100%|██████████| 117500/117500 [1:16:43<00:00, 25.52it/s] 

{'train_runtime': 4603.5921, 'train_samples_per_second': 102.094, 'train_steps_per_second': 25.524, 'train_loss': 0.6547205428752494, 'epoch': 10.0}





TrainOutput(global_step=117500, training_loss=0.6547205428752494, metrics={'train_runtime': 4603.5921, 'train_samples_per_second': 102.094, 'train_steps_per_second': 25.524, 'train_loss': 0.6547205428752494, 'epoch': 10.0})

In [64]:
model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
هذا رائع - Positive
ليس جيداً بالمطلق - Negative
يمكن تحسين الوضع. - Negative
لنتأمل أن يتحسن - Negative
لا أعرف بالضبط - Negative
