# Modeling & Training
## AraBERT

In [None]:
!pip install transformers==4.12.2
!pip install farasapy==0.0.14
!pip install pyarabic==0.6.14
!git clone https://github.com/aub-mind/arabert
!pip install emoji==1.6.1
!pip install sentencepiece==0.1.96

In [None]:
import os
import time
import math
import copy
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import date
from transformers import *
from sklearn.metrics import *
from tqdm import tqdm_notebook as tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import RandomSampler

from arabert.preprocess import ArabertPreprocessor

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, BertTokenizer, Trainer,
                          TrainingArguments)
from transformers.data.processors.utils import InputFeatures

# Process-wide switches read by third-party libraries through the environment.
# Presumably these turn off Weights & Biases reporting and the tokenizers
# thread pool -- confirm against the library versions pinned above.
os.environ.update({
    'WANDB_DISABLED': 'true',
    'TOKENIZERS_PARALLELISM': 'false',
})

In [None]:
# Pick the torch device (`device`) used by the rest of the notebook.
# Author's note: expect an update for supporting TPU training using PyTorch/XLA.

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device('cuda')

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    # Shell out to `nvidia-smi` to show the driver's view of the GPU(s).
    !nvidia-smi

# If not, fall back to the CPU.
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')

In [None]:
class CustomDataset:
    '''Plain container that keeps the two data splits next to their label set.

    The constructor stores its arguments as-is (no copy, no validation) on the
    attributes `train`, `test` and `label_list`.
    '''

    def __init__(self, train, test, label_list):
        '''Remember the splits and the label list.

        Args:
            train: training split; this notebook passes a pandas DataFrame
                holding the text column and the label column.
            test: testing split, same layout as `train`.
            label_list (List[str]): the list of class labels.
        '''
        self.train, self.test, self.label_list = train, test, label_list

In [None]:
try:
    rawData = pd.read_json('arabicDialects.json', encoding='utf-8',
                           typ='series', convert_axes=False)
except Exception as e:
    %run -i fetch.py
    rawData = pd.read_json('arabicDialects.json', encoding='utf-8',
                           typ='series', convert_axes=False)

rawData.index = rawData.index.astype('int64')

datasetIds = pd.read_csv('dialect_dataset.csv', index_col='id')
datasetIds = datasetIds.loc[rawData.index]
datasetIds['text'] = rawData

dialects = sorted(datasetIds.dialect.unique())
label_map = dict(zip(dialects, range(len(dialects))))
inv_label_map = dict(enumerate(dialects))

In [None]:
# Names of the feature column and the target column used from here on.
DATA_COLUMN = 'text'
LABEL_COLUMN = 'dialect'

print('Total length: ', len(rawData))
print(datasetIds[LABEL_COLUMN].value_counts())

# Hold out 10% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(
    datasetIds,
    test_size=.1,
    random_state=42,
)
print('Training length: ', len(train_data))
print('Testing length: ', len(test_data))

dialects_dataset = CustomDataset(
    train=train_data,
    test=test_data,
    label_list=dialects,
)

In [None]:
# Hugging Face Hub id of the checkpoint used throughout the notebook; weights:
# https://huggingface.co/aubmindlab/bert-base-arabertv02/blob/main/pytorch_model.bin
AraBert_v02_Pytorch = 'aubmindlab/bert-base-arabertv02'

# Text preprocessor instantiated for that checkpoint name.
arabic_prep = ArabertPreprocessor(model_name=AraBert_v02_Pytorch)

In [None]:
# Run the AraBERT preprocessor over the text column of both splits.
# Each split is copied before the column is replaced: the frames handed out by
# train_test_split are derived from `datasetIds`, and assigning into them
# directly triggers pandas' SettingWithCopyWarning and mutates
# `train_data` / `test_data` behind the reader's back.
for _split_name in ('train', 'test'):
    _split_frame = getattr(dialects_dataset, _split_name).copy()
    _split_frame[DATA_COLUMN] = _split_frame[DATA_COLUMN].apply(arabic_prep.preprocess)
    setattr(dialects_dataset, _split_name, _split_frame)

In [None]:
# Tokenizer belonging to the chosen checkpoint.
tok = AutoTokenizer.from_pretrained(AraBert_v02_Pytorch)

# Sequence length (in tokens) that inputs are padded / truncated to.
max_len = 90

In [None]:
def _token_counts(sentences):
    '''Return, for each sentence, how many tokens `tok.tokenize` produces.'''
    return [len(pieces) for pieces in map(tok.tokenize, sentences)]


train_len_hist = _token_counts(dialects_dataset.train[DATA_COLUMN])
test_len_hist = _token_counts(dialects_dataset.test[DATA_COLUMN])

In [None]:
def _plot_length_hist(lengths, limit, bin_width=2):
    '''Draw a histogram of token counts with a red vertical line at `limit`.

    Args:
        lengths (List[int]): token count of every sentence in one split.
        limit (int): sequence-length cutoff to mark on the x axis.
        bin_width (int): width of each histogram bin, in tokens.
    '''
    # The bin edges have to reach past both the cutoff and the longest
    # sentence: values outside the outermost edges are not counted by the
    # histogram, so bins that stop below `limit` can never show the sentences
    # the cutoff line is meant to put into perspective.
    top = max(limit, max(lengths, default=0))
    edges = range(0, top + bin_width + 1, bin_width)

    _, ax = plt.subplots()
    ax.hist(lengths, bins=edges)
    ax.axvline(limit, color='red')
    ax.set_xlabel('tokens per sentence')
    ax.set_ylabel('number of sentences')
    plt.show()


print('Training Sentence Lengths: ')
_plot_length_hist(train_len_hist, max_len)

print('Testing Sentence Lengths: ')
_plot_length_hist(test_len_hist, max_len)

In [None]:
print(f'At max len of {max_len}, there are:')

# The length lists were built with tok.tokenize(), which yields the word
# pieces only. When a sentence is encoded, the tokenizer's special tokens are
# added on top and count towards max_length, so a sentence is truncated as
# soon as its pieces exceed max_len minus that overhead.
_piece_budget = max_len - tok.num_special_tokens_to_add(pair=False)

# The ':.2%' format multiplies by 100 itself; the previous ':.2f' followed by
# a literal '%' printed the raw fraction with a percent sign.
trunc_train_seq = sum(n_pieces > _piece_budget for n_pieces in train_len_hist)
print(f'Truncated training sequences: {trunc_train_seq} / {len(train_data)} ~ '
      f'{trunc_train_seq/len(train_data):.2%}')

trunc_test_seq = sum(n_pieces > _piece_budget for n_pieces in test_len_hist)
print(f'Truncated testing sequences: {trunc_test_seq} / {len(test_data)} ~ '
      f'{trunc_test_seq/len(test_data):.2%}')

In [None]:
class ClassificationDataset(Dataset):
    '''Dataset that tokenizes one text per lookup and pairs it with its integer label.'''

    def __init__(self, text, target, model_name, max_len, label_map):
        '''
        Args:
        text (List[str]): List of the texts
        target (List[str]): List of the labels, parallel to `text`
        model_name (str): Name passed to AutoTokenizer.from_pretrained; also
            stored as `tokenizer_name`.
        max_len (int): Maximum sentence length (used as the tokenizer's max_length)
        label_map (Dict[str,int]): A dictionary that maps the class labels to integer
        '''
        # Zero-argument super() binds to this instance; the previous
        # one-argument form created an unbound super object and never reached
        # the parent class initializer.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        '''Number of examples, i.e. the length of `text`.'''
        return len(self.text)

    def __getitem__(self, item):
        '''Tokenize example `item` and return it as an InputFeatures object.

        Raises:
            KeyError: if the example's label is not a key of `label_map`.
        '''
        text = str(self.text[item])
        # Collapse every run of whitespace into a single space.
        text = ' '.join(text.split())

        # Pad / truncate to exactly max_len so all examples have equal length.
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        return InputFeatures(**inputs, label=self.label_map[self.target[item]])

In [None]:
# Dialect name -> integer class id, numbered in the order of `dialects`.
label_map = {dialect: class_id for class_id, dialect in enumerate(dialects)}
print(label_map)


def _build_split_dataset(frame):
    '''Wrap the text and label columns of one split in a ClassificationDataset.'''
    return ClassificationDataset(
        text=frame[DATA_COLUMN].to_list(),
        target=frame[LABEL_COLUMN].to_list(),
        model_name=AraBert_v02_Pytorch,
        max_len=max_len,
        label_map=label_map,
    )


train_dataset = _build_split_dataset(dialects_dataset.train)
test_dataset = _build_split_dataset(dialects_dataset.test)

In [None]:
def model_init():
    '''Load the checkpoint as a sequence classifier with one output per entry of `label_map`.'''
    n_labels = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        AraBert_v02_Pytorch,
        return_dict=True,
        num_labels=n_labels,
    )

def set_seed(seed=42):
    '''Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also sets the cuDNN flags `deterministic=True` and `benchmark=False`.

    Args:
        seed (int): value handed to every seeding function.
    '''
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
def compute_metrics(p): #p should be of type EvalPrediction
    '''Compute macro-averaged F1 and accuracy for one evaluation pass.

    Args:
        p: object exposing `predictions` (per-class scores; the predicted
            class is the argmax along axis 1) and `label_ids` (true class ids).

    Returns:
        dict with the keys 'macro_f1' and 'accuracy'.
    '''
    true_ids = p.label_ids
    predicted_ids = np.argmax(p.predictions, axis=1)
    assert len(predicted_ids) == len(true_ids)

    return {
      'macro_f1': f1_score(true_ids, predicted_ids, average='macro'),
      'accuracy': accuracy_score(true_ids, predicted_ids),
    }

In [None]:
training_args = TrainingArguments(
    output_dir='./train',

    # --- optimisation ---
    learning_rate=2e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0,
    num_train_epochs=1,
    fp16=False,  # enable this when using V100 or T4 GPU # Kaggle runs on P100

    # --- batching ---
    per_device_train_batch_size=64,  # up to 64 on 16GB with max len of 128
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=2,  # use this to scale batch size without needing more memory

    # --- evaluation / checkpointing: once per epoch, keep the best macro-F1 ---
    do_eval=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,

    seed=42,
)

# Seed every RNG with the same value the Trainer is configured with.
set_seed(training_args.seed)

In [None]:
# Assemble the Trainer: a freshly loaded model, the training arguments, both
# datasets and the metric callback. Note that the held-out split doubles as
# the evaluation set here.
trainer = Trainer(
    model=model_init(),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start the training run; everything (model, data, schedule, metrics) is taken
# from the `trainer` object.
trainer.train()

In [None]:
# Integer class id -> dialect name, numbered in the order of `dialects`.
inv_label_map = {class_id: dialect for class_id, dialect in enumerate(dialects)}
print(inv_label_map)

# Record both label mappings on the model config before saving.
model_config = trainer.model.config
model_config.label2id = label_map
model_config.id2label = inv_label_map

# Write the model and the tokenizer side by side into the same directory.
trainer.save_model('output_dir')
train_dataset.tokenizer.save_pretrained('output_dir')

In [None]:
# Pack everything under output_dir/ into a gzip-compressed tarball.
# NOTE(review): "Arabice" in the archive name looks like a typo; it is left
# unchanged because other tooling may rely on the file name -- confirm.
!tar -czvf ArabiceDialect_BERT.tar.gz output_dir/*

In [None]:
# Load the saved model and tokenizer from output_dir/ into a classification
# pipeline that returns the score of every label.
# The device index is chosen at runtime (0 = first GPU, -1 = CPU): a
# hard-coded 0 fails on CPU-only hosts, which this notebook otherwise handles.
pipe = pipeline('sentiment-analysis', model='output_dir/',
                device=0 if torch.cuda.is_available() else -1,
                return_all_scores=True, max_length=max_len,
                truncation=True)

In [None]:
# Classify one sample sentence and list every (score, label) pair, highest
# score first.
preds = pipe('يسطا رحت فين')
ranked_labels = [(entry['score'], entry['label']) for entry in preds[0]]
ranked_labels.sort(reverse=True)
ranked_labels