<h2>Text Classification Training</h2> 

In [1]:
# Main libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from matplotlib import pyplot as plt
import os, re, shutil

In [None]:
# Function to read csv inside zip file for pandas
def extract_df(url):
    # Necessary packages
    import re, requests, shutil, zipfile, os
    import pandas as pd
    # The url where we can fide the file
    url_df = url
    # The name of the zip file
    file_df = re.findall(r'([^\/]+$)', url)[0]
    if '.zip' not in file_df:
        files_dir = file_df+'.zip'
    else:
        files_dir = file_df
    if not os.path.exists(files_dir):
        os.mkdir(files_dir)
    # Command to donwload the file at the given url
    r = requests.get(url_df)
    # Then we open the file
    open(file_df, 'wb').write(r.content)
    # We extract the content of the .zip file
    with zipfile.ZipFile(file_df, 'r') as unzip:
        unzip.extractall(files_dir)
     # we finally read the csv and make some cleaning
    df = pd.read_csv(os.path.join(
        files_dir, [i for i in unzip.namelist() if i.endswith('.csv')][0]))
    # We remove files/dir
    shutil.rmtree(files_dir)
    os.remove(file_df)
    return df

In [4]:
# Main dataframe
df = extract_df('https://data.mendeley.com/public-files/datasets/v524p5dhpj/files/72c2e306-9538-4c74-a28f-558fbe87c382/file_downloaded').rename(columns={'targe':'target'})

# Train test split
texts = df['text']
target = df['target']
text_train, text_test, target_train, target_test = train_test_split(texts, target, test_size=.2, random_state=23)

# # Or with datasets
# dataset = Dataset.from_pandas(df).train_test_split(test_size=.2)

<h4>Build a tokenizer</h5>

<p>Main files for the tokenizer

In [3]:
# # Create dir for the files (for tokenizer creation)
try:
    shutil.rmtree('files')
    os.mkdir('files')
except:
    print('File already exists')

for i, text in enumerate(tqdm(texts)):
    with open(f'files/text_{i}.txt', 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
# Function to get the files for the tokenizer
get_files = lambda directory : [file for dir,_,files in os.walk(directory) for file in files]
files = get_files(r'files/')
# For sorting
files.sort(key=lambda f: int(re.sub('\D', '', f)))
# Append the directory to the files
files = [f'files/{i}' for i in files]

# # # Once done
# # shutil.rmtree('files')

<p>Main tokenizer

In [4]:
# ~ 8 minutes of training
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=files, special_tokens=['<s>','<pad>','</s>','<unk>','<mask>'])

# Save the model
try:
    os.mkdir('3arabert')
except:
    print('Folder already exists')
tokenizer.save_model('3arabert')

Folder already exists


['3arabert\\vocab.json', '3arabert\\merges.txt']

In [5]:
# Import the generated tokenizer
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained('3arabert')

# Test
tokenizer.encode('السلام عليكم'), tokenizer.decode([0, 265, 1426, 18526, 2]), tokenizer.encode('سبحان الله'), tokenizer.decode([0, 29768, 299, 1184, 2])

file 3arabert\config.json not found
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
file 3arabert\config.json not found
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


([0, 265, 1426, 18526, 2],
 '<s>السلام عليكم</s>',
 [0, 29768, 299, 1184, 2],
 '<s>سبحان الله</s>')

<h4>Masked language modeling

In [6]:
import torch
torch.cuda.empty_cache()

# Masked language model function for torch
def mlm(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero())
        tensor[i, selection] = 4
    return tensor


In [8]:
# ~ 4 minutes
input_ids = []
attention_mask = []
labels = []

for file in tqdm(files):
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    labels.append(sample.input_ids)
    attention_mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))


100%|██████████| 105586/105586 [03:55<00:00, 448.99it/s]


<h5>Concatenate the tensors into one tensor for all (input ids, attention masks and lables)

In [9]:
final_input_ids = torch.cat(input_ids)
final_attention_mask = torch.cat(attention_mask)
final_labels = torch.cat(labels)

tokenizer.decode(final_labels[0])

'<s>بين أستوديوهات ورزازات وصحراء مرزوكة وآثار وليلي ثم الرباط والبيضاء انتهى المخرج المغربي سهيل بن بركة من تصوير مشاهد عمله السينمائي الجديد الذي خصصه لتسليط الضوء عن حياة الجاسوس الإسباني دومينغو باديا الذي عاش فترة من القرن التاسع عشر بالمغرب باسم علي باي هذا الفيلم الذي اختار له مخرجه عنوان حلم خليفة يصور حياة علي باي العباسي الذي ما زال أحد أحياء طنجة يحمل اسمه عاش حياة فريدة متنكرا بشخصية تاجر عربي من سلالة الرسول صلى الله عليه وسلم فيما كان يعمل جاسوسا لحساب إسبانيا وكشف مخرج الفيلم سهيل بن بركة في تصريح لهسبريس أن الفيلم السينمائي دخل مرحلة التوضيب التي تتم خارج المغرب مبرزا أن الفيلم الذي يروي حياة الجاسوس الإسباني دومينغو باديا منذ أن قرر من طنجة بدء رحلاته نحو عدد من المناطق في العالم الإسلامي بداية القرن العشرين سيكون جاهزا بعد شهرين ويجمع الفيلم السينمائي عددا من الممثلين من مختلف الجنسيات واختار لدور البطولة الممثلة السينمائية الإيطالية كارولينا كريشنتيني للقيام بدور الإنجليزية الليدي هستر ستانهوب التي اشتهرت في الكتب الغربية بـ زنوبيا والتي عاشت بدورها بالدول العربية وا

In [10]:
# Main encodings
encodings = {
    'input_ids': final_input_ids,
    'attention_mask': final_attention_mask,
    'labels': final_labels
}

# Dataset class
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

# Main dataset
dataset=Dataset(encodings=encodings)
dataset
# tokenizer.decode(dataset[3]['input_ids'])

# Dataloader for batch and sampling...
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True) # reducing batch size for the gpu memory allocation error
tokenizer.decode(dataloader.dataset[1]['labels'])

'<s>قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر عملها على الفن بل عملت مع أحد المتخصصين لإطلاق نوع جديد من الشاي سيصبح متوفرا ابتداء من الشهر المقبل في سلسلة مقاهي ستاربكس ونقلت وسائل إعلام أمريكية عن رئيس مجلس إدارة ستاربكس هاورد شولتز ووينفري إعلانهما عن ابتكار نوع جديد من الشاي يحمل اسم الذي سيباع ابتداء من أبريل نيسان المقبل في مقاهي ستاربكس وتيفانا بأمريكا وكندا وتعتزم ستاربكس التبرع بعائدات بيع هذا الشاي لأكاديمية أسستها وينفري وتعنى بتوفير فرص تعليم للشبان</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

<h3>Training the model</h3>
<h5>Setup the config

In [11]:
torch.cuda.empty_cache()

from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size = tokenizer.vocab_size,
    max_position_embeddings = 514,
    hidden_size = 768,
    num_attention_heads = 12,
    num_hidden_layers = 6, # Deep learning layers
    type_vocab_size=1 
)

<h5>Initialize the model<h5>
In case of an error when using cuda, just switch to CPU then again to Cuda

In [None]:
print(torch.cuda.memory_summary())
# !nvidia-smi

In [None]:
# Using the GPU
torch.cuda.empty_cache()
# device = torch.device('cpu')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Main model
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)
model.to(device)

In [14]:
# Typical Adam optimiser with a learning rate of 1e-5
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)



In [15]:
# Epochs --- ~ 1 / 2 hours
epochs = 2
print(dataloader.batch_size)
# Loop instantiation for dataloader
loop = tqdm(dataloader, leave=True)
for batch in loop:
    # Reset the gradients explicitely after each loop
    optimizer.zero_grad()
    final_input_ids = batch['input_ids'].to(device)
    final_attention_mask = batch['attention_mask'].to(device)
    final_labels = batch['labels'].to(device)
    output = model(input_ids=final_input_ids, 
                   attention_mask=final_attention_mask,
                   labels=final_labels)
    loss = output.loss
    # For backpropagation
    loss.backward()
    optimizer.step()
    
    # To view the progress
    # loop.set_description(f'Epoch: {epoch}')
    # loop.set_postfix(loss=loss.items())
    


4


100%|██████████| 26397/26397 [1:37:52<00:00,  4.49it/s]


In [30]:
# # Save the model after training
# model.save_pretrained('3arabert')

<h4>Use the model

In [31]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
tkzr = RobertaTokenizer.from_pretrained('3arabert',max_len=512)
mdl = RobertaForMaskedLM.from_pretrained('3arabert')

In [89]:
from transformers import pipeline
test = pipeline('text-classification', model='3arabert', tokenizer='3arabert')

Some weights of the model checkpoint at 3arabert were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at 3arabert and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably T

In [90]:
test(f'هل قمت بذلك أم أنت لست على هواك؟ كيف يمكنك فعل ذلك؟؟')

[{'label': 'LABEL_0', 'score': 0.53694087266922}]

In [137]:
# Tokenizer & Model
from transformers import GPT2TokenizerFast, pipeline

MODEL_NAME='aubmindlab/aragpt2-base'
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)

In [135]:
# tokenizer = Tokenizer(num_words=None,lower=False)
# tokenizer.fit_on_texts(text)

In [136]:
x_train = tokenizer.texts_to_matrix(text_train, mode='tfidf')
x_test = tokenizer.texts_to_matrix(target_train, mode='tfidf')

MemoryError: Unable to allocate 267. GiB for an array with shape (84468, 423535) and data type float64

In [None]:
# from transformers import AutoModel
# import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# # Tokenizer & Model
# from transformers import GPT2TokenizerFast, pipeline

# MODEL_NAME='aubmindlab/aragpt2-base'
# tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)