# Arabic Fine Tuning Test
- Dataset extracted from: [Twitter Corpus](https://www.kaggle.com/datasets/mksaad/arabic-sentiment-twitter-corpus)
- Cleaning logic inspired by [arabic-sentiment-twitter-corpus](https://www.kaggle.com/datasets/mksaad/arabic-sentiment-twitter-corpus)

## Load datasets

In [None]:
! pip install numpy
! pip install pandas
! pip install transformers
! pip install peft
! pip install torch
! pip install evaluate
! pip install emoji
! pip install scikit-learn

In [43]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd
import csv
import unicodedata
import re
import emoji

In [44]:
def load_tweets(path, label):
    """Read one headerless tab-separated tweet file and tag every row with `label`.

    Column 0 of the file is renamed to 'label' and then overwritten with the
    constant `label`; column 1 is renamed to 'tweet'. Quoting is disabled
    (csv.QUOTE_NONE) so quote characters inside tweets are kept literally.

    @param    path (str): location of the .tsv file.
    @param    label (int): class id written to every row (0 = negative, 1 = positive).
    @return   (pd.DataFrame): the loaded frame with 'label' and 'tweet' columns.
    """
    tweets = pd.read_csv(path, sep="\t", header=None, quoting=csv.QUOTE_NONE)
    return tweets.rename(columns={0: 'label', 1: 'tweet'}).assign(label=label)


# Load training subset
train_neg = load_tweets("dataset/train_Arabic_tweets_negative.tsv", label=0)
train_pos = load_tweets("dataset/train_Arabic_tweets_positive.tsv", label=1)

train_df = pd.concat([train_neg, train_pos], axis=0).reset_index(drop=True)

# Load test subset
test_pos = load_tweets("dataset/test_Arabic_tweets_positive.tsv", label=1)
test_neg = load_tweets("dataset/test_Arabic_tweets_negative.tsv", label=0)

test_df = pd.concat([test_neg, test_pos], axis=0).reset_index(drop=True)

In [45]:
# Wrap both pandas frames in a DatasetDict with a 'train' and a 'test' split,
# keeping only the 'label' and 'tweet' columns.
split_frames = {'train': train_df, 'test': test_df}
dataset = DatasetDict({
    split: Dataset.from_dict({'label': frame['label'], 'tweet': frame['tweet']})
    for split, frame in split_frames.items()
})
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'tweet'],
        num_rows: 47000
    })
    test: Dataset({
        features: ['label', 'tweet'],
        num_rows: 11751
    })
})

In [46]:
# fraction of training examples carrying label=1 (the positive class)
train_labels = np.array(dataset['train']['label'])
train_labels.sum() / len(train_labels)

0.5080638297872341

## Model
- https://huggingface.co/asafaya/bert-mini-arabic

In [47]:
# Hub id of the pretrained checkpoint that gets fine-tuned
model_checkpoint = 'asafaya/bert-mini-arabic'

# label maps: class id -> name, and the inverse derived from it
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# build a sequence-classification model on top of the checkpoint;
# the classification head has one output per entry in the label maps
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-mini-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# display architecture: as the last expression of the cell, the model's
# module-tree repr is rendered as the cell output
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, e

## Preprocess data

In [49]:
def remove_emojis(sent):
    """
    Strip every emoji from a string, leaving all other characters untouched.

    The emoji library removes the emoji directly. Converting emojis to
    ':name:' shortcodes and regex-deleting them would also delete ordinary
    text that happens to sit between two colons (e.g. the ':30:' in '12:30:45').

    @param    sent (str): the text to clean.
    @return   (str): `sent` with each emoji replaced by the empty string.
    """
    return emoji.replace_emoji(sent, replace='')

def text_preprocessing(text):
    """
    Clean a raw tweet before tokenization.

    - Normalize the unicode encoding to NFC
    - Remove entity mentions (eg. '@united'), wherever they appear
    - Correct errors (eg. '&amp;' to '&')
    - Replace every URL with the '<URL>' placeholder
    - Remove emojis
    - Collapse runs of whitespace and trim both ends
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """

    # Normalize unicode encoding
    text = unicodedata.normalize('NFC', text)

    # Remove '@name'. Matching up to the next whitespace *or* the end of the
    # string also catches a mention that is the last token of the tweet.
    text = re.sub(r'@\S*', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Replace URLs with a placeholder. The pattern is not anchored to the start
    # of the string and stops at the first whitespace, so a URL anywhere in the
    # tweet is replaced without swallowing the text that follows it.
    text = re.sub(r'https?://\S+', '<URL>', text)

    # Remove emojis
    text = remove_emojis(text)

    # Normalize whitespace last, so gaps left behind by the removals above
    # are collapsed and no leading/trailing space survives.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# dataset['train']["tweet"] = [text_preprocessing(text) for text in dataset['train']["tweet"]]
# dataset['test']["tweet"] = [text_preprocessing(text) for text in dataset['test']["tweet"]]

In [50]:
# create tokenizer
# NOTE(review): `add_prefix_space` is normally a byte-level BPE tokenizer option;
# confirm it has any effect for this BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists (and grow the embedding table to match)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Over-long tweets keep their beginning. Configured once here instead of being
# reassigned on every batch inside the tokenize function.
tokenizer.truncation_side = "right"

# Longest sequence kept after truncation; matches the 512-entry
# position-embedding table shown in the model architecture.
MAX_LENGTH = 512


# create tokenize function
def tokenize_df(df):
    """
    Clean and tokenize one batch of tweets for `Dataset.map(batched=True)`.

    No padding is applied here: sequences keep their own length and are padded
    per batch by the data collator. For that reason the tokenizer output is
    returned as plain Python lists rather than `return_tensors="np"`, which
    would have to build a ragged array from the variable-length sequences.

    @param    df (mapping): a batch indexed by column name; only 'tweet' is read.
    @return   the tokenizer output for the cleaned tweets, truncated to MAX_LENGTH.
    """
    texts = [text_preprocessing(text) for text in df["tweet"]]
    return tokenizer(texts, truncation=True, max_length=MAX_LENGTH)


# tokenize the train and test splits
tokenized_dataset = dataset.map(tokenize_df, batched=True)

# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 47000/47000 [00:03<00:00, 11828.94 examples/s]
Map: 100%|██████████| 11751/11751 [00:00<00:00, 14178.71 examples/s]


In [51]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'tweet', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 47000
    })
    test: Dataset({
        features: ['label', 'tweet', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11751
    })
})

## Evaluation

In [52]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    """
    Turn the model's raw outputs for an evaluation pass into an accuracy score.

    @param    p: a (predictions, labels) pair; predictions hold one score per
              class for each example, labels hold the reference class ids.
    @return   (dict): {"accuracy": <float>}.
    """
    logits, labels = p
    predictions = np.argmax(logits, axis=1)

    # `accuracy.compute` already returns {"accuracy": <float>}. Returning it
    # as-is keeps the metric a flat number instead of a dict nested inside
    # another "accuracy" key.
    return accuracy.compute(predictions=predictions, references=labels)

## Apply untrained model to text

In [53]:
# define list of examples
text_list = ["هذا رائع", "ليس جيداً بالمطلق", "يمكن تحسين الوضع.", "لنتأمل أن يتحسن", "لا أعرف بالضبط"]

print("Untrained model predictions:")
print("----------------------------")

# inference only: make sure dropout is off and skip gradient bookkeeping
model.eval()
with torch.no_grad():
    for text in text_list:
        # tokenize text, after the same cleaning that is applied to the training tweets
        inputs = tokenizer.encode(text_preprocessing(text), return_tensors="pt")
        # compute logits
        logits = model(inputs).logits
        # convert logits to a class id (argmax over the class dimension)
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])

Untrained model predictions:
----------------------------
هذا رائع - Negative
ليس جيداً بالمطلق - Negative
يمكن تحسين الوضع. - Negative
لنتأمل أن يتحسن - Negative
لا أعرف بالضبط - Negative


## Train model

In [54]:
# LoRA adapter settings for sequence classification: rank-4 adapters attached
# to the modules named 'query' (the attention query projections shown in the
# architecture printout above)
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=['query'],
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
)

# wrap the base model with the adapters and report how few weights will train
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 8,706 || all params: 11,558,148 || trainable%: 0.07532348608098806


In [55]:
# hyperparameters (consumed by the TrainingArguments cell below)
num_epochs = 10   # number of training epochs
batch_size = 4    # per-device batch size, used for both training and evaluation
lr = 1e-3         # learning rate

In [56]:
# define training arguments
# Evaluation and checkpointing both happen once per epoch, which is what lets
# `load_best_model_at_end` restore the best checkpoint after training.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-lora-text-classification",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# create trainer object
# NOTE(review): the test split doubles as the evaluation set, and the training
# arguments enable `load_best_model_at_end`, so the reported test score also
# drives checkpoint selection. Consider holding out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,  # pads the examples of each batch to a common length
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# train model
trainer.train()

In [64]:
# Pick the best available device instead of assuming an Apple-silicon Mac:
# 'mps' when present, otherwise CUDA, otherwise plain CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
model.eval()  # inference only: make sure dropout is off

print("Trained model predictions:")
print("--------------------------")
with torch.no_grad():
    for text in text_list:
        # clean the text exactly like the training tweets, then tokenize it
        # and place the ids on the same device as the model
        inputs = tokenizer.encode(text_preprocessing(text), return_tensors="pt").to(device)

        logits = model(inputs).logits
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])

Trained model predictions:
--------------------------
هذا رائع - Positive
ليس جيداً بالمطلق - Negative
يمكن تحسين الوضع. - Negative
لنتأمل أن يتحسن - Negative
لا أعرف بالضبط - Negative
