In [1]:
!pip install transformers pyarabic
!git clone https://github.com/aub-mind/arabert 

Cloning into 'arabert'...
remote: Enumerating objects: 569, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 569 (delta 26), reused 24 (delta 23), pack-reused 535[K
Receiving objects: 100% (569/569), 9.12 MiB | 1.95 MiB/s, done.
Resolving deltas: 100% (327/327), done.


In [1]:
import pandas as pd
import seaborn as sns

import random

import torch
from torch.utils.data import Dataset

from transformers import GPT2TokenizerFast, pipeline
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, AutoConfig

from arabert.preprocess import ArabertPreprocessor

2022-05-28 02:30:31.623204: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-28 02:30:31.623257: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# Load the prepared poems and discard every row that contains a missing value.
data = (
    pd.read_csv('../datasets/abasi_ready_poems.csv')
    .dropna()
)

In [4]:
# Hugging Face hub id shared by the preprocessor, the tokenizer and the model.
MODEL_NAME='aubmindlab/aragpt2-base'

# AraBERT text preprocessor configured for this checkpoint.
# NOTE(review): it is not applied to any text in the visible cells — confirm whether that is intended.
arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)

# Fast GPT-2 tokenizer matching the checkpoint's vocabulary.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)

In [5]:
# Token budget per poem: MyDataset truncates and pads every sample to this many tokens.
MAXLEN = 100

In [6]:
class MyDataset(Dataset):
    """Torch dataset that tokenizes the 'poems' column of a DataFrame for causal-LM training.

    Every item is truncated/padded to a fixed number of tokens and returned as a
    dict with 'input_ids', 'attention_mask' and 'label' (a copy of 'input_ids',
    padded positions included).

    Args:
        data: DataFrame with a 'poems' column holding the text of each sample.
        tokenizer: callable tokenizer exposing `eos_token` / `pad_token`.
        max_length (int, optional): number of tokens per sample. When omitted,
            the notebook-level MAXLEN constant is used.
    """

    def __init__(self, data, tokenizer, max_length=None):
        self.data = data
        self.L = len(data)
        self.tokenizer = tokenizer
        # Resolve the length once so each dataset keeps its own setting.
        self.max_length = MAXLEN if max_length is None else max_length
        # padding='max_length' needs a pad token; reuse EOS for it.
        # Note: this mutates the tokenizer object shared with the caller.
        tokenizer.pad_token = tokenizer.eos_token

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.L

    def __getitem__(self, id):
        """Tokenize the poem at positional index `id` and return its tensors."""
        text = self.data.iloc[id]['poems']

        encodings_dict = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length'
        )

        input_ids = torch.tensor(encodings_dict['input_ids'])
        attention_mask = torch.tensor(encodings_dict['attention_mask'])

        # Causal LM objective: the targets are the inputs themselves.
        return {'label': input_ids.clone(),
                'input_ids': input_ids,
                'attention_mask': attention_mask}
    

In [7]:
def split_data(data, S=0.8, seed=None):
    """Randomly split a DataFrame into training and validation parts.

    Args:
        data: DataFrame to split (rows are selected positionally).
        S (float): fraction of rows assigned to the training part.
        seed (int, optional): when given, the shuffle uses a private RNG seeded
            with this value, so the split is reproducible and the global
            `random` state is left untouched. When omitted, the global
            `random` module is used.

    Returns:
        (train_data, val_data): two DataFrames holding the first
        int(S * len(data)) shuffled rows and the remaining rows.
    """
    # Shuffle positional ids
    ids = list(range(len(data)))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(ids)

    # Split into training and validation sets
    train_size = int(S * len(data))

    train_data = data.iloc[ids[:train_size]]
    val_data = data.iloc[ids[train_size:]]

    return train_data, val_data

In [8]:
# Dataset over the full, unsplit frame.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
dataset = MyDataset(data, tokenizer)

## Model:

In [None]:
# Load the pretrained checkpoint with its language-modelling head.
# pad_token_id is set to the tokenizer's EOS id and hidden-state outputs are turned off.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, 
                                        pad_token_id=tokenizer.eos_token_id,
                                        output_hidden_states=False)

In [10]:
# Shuffle-split the frame (split_data's default ratio), then wrap each part as a dataset.
train_data, val_data = split_data(data)
train_dataset, val_dataset = (
    MyDataset(subset, tokenizer) for subset in (train_data, val_data)
)

# Last expression of the cell: shown as the cell's output.
f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 39,553 samples for training, and 9,889 samples for validation testing'

In [16]:
# - Freeze selective layers:
UNFREEZE_LAST_N = 6

# Number of transformer blocks, read from the model instead of being hard-coded.
NUM_BLOCKS = len(model.transformer.h)

# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):
    # Only un-freeze the last n transformer blocks
    if i + 1 > NUM_BLOCKS - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True

# The final layer norm stays trainable as well.
for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

# NOTE(review): GPT-2 usually ties the LM head weight to the input embedding, so this
# presumably makes the token embeddings trainable too — confirm that is intended.
for parameter in model.lm_head.parameters():
    parameter.requires_grad = True

NameError: name 'model' is not defined

In [None]:
# Use the device selected earlier instead of an unconditional .cuda(),
# which raises on a machine without a GPU.
model = model.to(device)

In [None]:
%%time

DEBUG           = True
USE_APEX        = False
APEX_OPT_LEVEL  = 'O1'


if USE_APEX:
    TRAIN_BATCHSIZE = 16
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020


training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # gradient_accumulation_steps=32,
    evaluation_strategy="epoch",
    # fp16=True,
    # fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    # load_best_model_at_end=True,     
)

# ---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)
#---------------------------------------------------#
trainer.train()
trainer.save_model()    

***** Running training *****
  Num examples = 39553
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9892


Epoch,Training Loss,Validation Loss


Saving model checkpoint to /content/checkpoint-500
Configuration saved in /content/checkpoint-500/config.json
Model weights saved in /content/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/checkpoint-1000
Configuration saved in /content/checkpoint-1000/config.json
Model weights saved in /content/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [/content/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to /content/checkpoint-1500
Configuration saved in /content/checkpoint-1500/config.json
Model weights saved in /content/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-1500/tokenizer_config.json


Epoch,Training Loss,Validation Loss
1,3.656,3.441757
2,3.3504,3.280804
3,3.1223,3.187619
4,2.9578,3.169036


Saving model checkpoint to /content/checkpoint-2500
Configuration saved in /content/checkpoint-2500/config.json
Model weights saved in /content/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-2500/tokenizer_config.json
Special tokens file saved in /content/checkpoint-2500/special_tokens_map.json
Deleting older checkpoint [/content/checkpoint-2000] due to args.save_total_limit
Saving model checkpoint to /content/checkpoint-3000
Configuration saved in /content/checkpoint-3000/config.json
Model weights saved in /content/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-3000/tokenizer_config.json
Special tokens file saved in /content/checkpoint-3000/special_tokens_map.json
Deleting older checkpoint [/content/checkpoint-2500] due to args.save_total_limit
Saving model checkpoint to /content/checkpoint-3500
Configuration saved in /content/checkpoint-3500/config.json
Model weights saved in /content/checkpoint-3500/pytorch_

CPU times: user 2h 21min 50s, sys: 1min 4s, total: 2h 22min 54s
Wall time: 2h 23min 17s


In [13]:
# Fetch only the architecture configuration for MODEL_NAME.
config = AutoConfig.from_pretrained(MODEL_NAME)

In [15]:
# Build the network from the configuration alone; this rebinds `model`.
model = GPT2LMHeadModel(config=config)

In [16]:
# A model built from a bare config starts in training mode; switch to eval mode
# so dropout is disabled when the model is used for generation.
model.eval()
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load('../../models/generation/aragpt2.pt', map_location=device))

<All keys matched successfully>

In [17]:
# NOTE(review): TextIOBase is not used in the visible cells — confirm before removing.
from io import TextIOBase
# Text-generation pipeline around the reloaded model and tokenizer; device=-1 runs it on the CPU.
generation_pipeline = pipeline("text-generation",model=model,tokenizer=tokenizer, device=-1)

In [31]:
def generate_sentence(meter, rhyme, max_lines=100, max_length=250, start_with='', num_beams=5):
    """
    Generate a poem given a meter, a rhyme letter and an optional opening text.

    Input:
        meter (str): the Bahr
        rhyme (str): the rhyme or 7arf rawi
        max_lines (int): the max number of verses in the generated poem
            (values <= 0 keep every generated verse)
        max_length (int): the max number of tokens in the generated poem
        start_with (str): a text to start with
        num_beams (int): beam width forwarded to the generation pipeline
    Output (str):
        the generated verses, one per line, with the two halves of a verse
        separated by ' _ '
    """

    start_with = start_with.strip()

    # Control prefix that conditions the model; it is stripped from the result below.
    prefix = f'<|endoftext|>{meter}<|endoftext|>{rhyme}<|endoftext|>'
    x = prefix + start_with

    # No `device` argument here: the device is fixed when the pipeline is built,
    # and unknown keyword arguments are forwarded to generation.
    out = generation_pipeline(x,
                              num_beams=num_beams,
                              max_length=max_length,
                              top_p=0.9,
                              repetition_penalty=3.0,
                              no_repeat_ngram_size=2)[0]

    # Drop exactly the control prefix (whatever the meter/rhyme length) and
    # keep `start_with` as the opening of the poem.
    out = out['generated_text'][len(prefix):]

    verses = out.split(' * ')
    if max_lines > 0:
        verses = verses[:max_lines]

    res = []
    for line in verses:
        if ' _ ' in line:
            # Split on the first separator only, so a truncated or malformed
            # verse can never break the unpacking.
            r, _, l = line.partition(' _ ')
            res.append(r.strip() + ' _ ' + l.strip())
        else:
            res.append(line)

    # Generation is usually cut mid-verse; close the last line with the rhyme
    # letter unless it already ends with it.
    if not res[-1].endswith(rhyme):
        res[-1] = res[-1] + rhyme

    return '\n'.join(res)

In [None]:
# Candidate rhyme letters (note: the variable name is spelled `ryhmes`).
ryhmes = ['ء', 'ب', 'ت', 'ح', 'د', 'ر', 'ع', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'س', 'ض', 'ي', 'ز', 'ش', 'ا', 'خ', 'و', 'ج', 'ص', 'ط', 'ث', 'ذ', 'ظ', 'غ', 'ى', 'هـ']
# Meter names; presumably the values accepted as `meter` by generate_sentence — confirm against the training data.
generation_meters = ['الخفيف', 'الكامل', 'الطويل', 'البسيط', 'الوافر', 'السريع', 'الرمل', 'المتقارب', 'الرجز']

In [29]:
# Generate a sample poem for the given meter and rhyme letter, then print it.
out = generate_sentence(meter='الكامل', rhyme='ر')
print(out)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


أحسانه في كل يوم عيده _ من قبل أن يزهو به السرور
لم يبق إلا جنة خلدت بها _ زهر الأماني نضرة ونضير
يا أيها المولى الذي بعلائه _ لا زال هذا الدهر وهو كبير
وافى الهنا بشرى البشير ببشره _ بقدوم مولود سعيد بشير
فاهنأ بطلعته السعيدة قد زهت _ ببهائها وازدهت بشذاه زهور
وتبسمت البشرى بميسم ثغرها _ وبوجهه الزاهي السعيد سرور
والبشر أسفر بالسرور وأشرقت _ أنواره ابتهاجا وزهت البدور
وبدا بأفق السعد يشرق نوره _ وتزينت بقدومه الصدو


In [33]:
# Generate a sample poem for the given meter and rhyme letter, then print it.
out = generate_sentence(meter='البسيط', rhyme='ن')
print(out)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


ومن مجيري من لوعاني _ ومن لصب على الجفا جفاني
يا مهجتي كيف لا تسليني _ وكيف أسلو الذي أهواه في دنان
لم أنس يوم النوى وصبابة _ قد كان لي فيه صبابات وأحزان
أبيت أرعى النجم أشواقا وأعينه _ بين الغوير وبين البان يقظان
حتى إذا ما استباحت عيناي طلعته _ رأيت بدر الدجى ليلا ببرهان
فيا لهف قلبي هل يعود لنا _ طيف يزور خيال زار بعد تولاني
لله أيام كنا نحسو بها قمرا _ والشمس ردت علينا طلعة الشهبوان
نإن قلت ليل التصابي ليسعدني _ وإن يكن بيننا عن كل إنس


In [27]:
# Generate a sample poem for the given meter and rhyme letter, then print it.
out = generate_sentence(meter='البسيط', rhyme='ر')
print(out)

أقسمت في الهوى ما عشت من وطر _ يا ويحه لم يزل يسطو على خطر
لا تعجبن إذا ما كنت ذا شغف _ ولا تغريني وإن طال المدى قصر
إني امرؤ قد عرفت الناس أجمعهم _ وليس يعلم إلا الله من قدر
إن الحياة حياة غير واحدة _ والموت فيها سوى اللذات والبشر
وما الحياة سوى أحلام مؤتلق _ وآونة النفس بالآمال والزهر
فلا يغرنك شيء غير متئد _ كلا ولا بد للإنسان من أثر
لكنما هذه الدنيا سراب وغى _ وعيشة العيش بين اليأس والكدر
أما ترى المرء عيشا والحياة فما _ يرجى إلى غاية أو لذة الخ
