In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from time import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
# Load the poem corpus and drop every row that has a missing value in any column.
data = pd.read_csv('../datasets/Arabic_poem_reduced.csv').dropna()

In [None]:
vocab = [' ', '!', '"', '(', ')', '*', ',', '-', '.', ':', '?', '_', '«', '»', '،', '؛', '؟', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي', 'ٍ', '–', '…']

# Token ids start at 1; id 0 is never assigned to a character (it is the padding id).
vocab2token = {char: idx for idx, char in enumerate(vocab, start=1)}
token2vocab = {idx: char for idx, char in enumerate(vocab, start=1)}

def sentence2tokens(s):
    """Map every character of `s` to its token id.

    Characters that are not in the vocabulary are mapped to `len(vocab)`.
    """
    # NOTE(review): len(vocab) is also the id of the last vocabulary entry
    # (ids are index + 1), so unknown characters share that entry's id.
    unknown = len(vocab)
    return [vocab2token.get(char, unknown) for char in s]

def tokens2sentence(t):
    """Return the list of characters that correspond to the token ids in `t`."""
    chars = []
    for token in t:
        chars.append(token2vocab[token])
    return chars

special = ['َ', 'ُ', 'ِ', 'ً', 'ٌ', 'ٍ', 'َّ', 'ُّ', 'ِّ','ًّ', 'ٌّ', 'ٍّ', 'ْ','ّ']
# Position of every tachkil mark inside `special`; this position is its class label.
sps_dict = {mark: label for label, mark in enumerate(special)}
    
def get_sentence_tachkil(sentence):
    """
        Split a sentence into its base characters and the tachkil attached to each one.

        Input:
            sentence (str)
        Output:
            S list(str): base characters, in order
            T list(int): one entry per element of S: the index in `special` of the
                         tachkil that follows the character, or -1 when it has none
            ratio (float): number of characters that carry a tachkil divided by
                           len(S); 0.0 for an empty sentence
    """

    sentence = list(sentence)
    shadda = 'ّ'

    T = []
    S = []

    cnt = 0

    i = 0
    L = len(sentence)

    # Each iteration consumes one base character plus the marks attached to it,
    # and appends exactly one entry to both S and T.
    while i < L - 1:
        current, following = sentence[i], sentence[i + 1]
        S.append(current)

        if current == ' ' or following not in sps_dict:
            # Spaces are never labelled; other characters have no mark after them.
            T.append(-1)
            i += 1

        elif following != shadda:
            T.append(sps_dict[following])
            cnt += 1
            i += 2

        else:
            # A shadda can be followed by a second mark; the pair forms one class.
            after = sentence[i + 2] if i + 2 < L else None
            if after is None or after not in sps_dict:
                T.append(sps_dict[shadda])
                cnt += 1
                i += 2
            else:
                # Unknown shadda + mark combinations fall back to a plain shadda.
                T.append(sps_dict.get(shadda + after, sps_dict[shadda]))
                cnt += 1
                i += 3

    if i < L:
        # Exactly one character is left. It is kept as an unlabelled base character
        # even when it is a stray tachkil, so that S and T always have the same
        # length (appending to T only would misalign inputs and targets).
        S.append(sentence[-1])
        T.append(-1)

    ratio = cnt / len(S) if S else 0.0
    return S, T, ratio


def rebuild_sentence(S, T):
    """
        Re-attach tachkil labels to their characters and return the resulting string.

        Input:
            S list(str): base characters
            T list(int): one label per character: an index into `special`, or
                         len(special) / -1 for "no tachkil"
        Output:
            str: the characters of S, each followed by its tachkil (if any)
    """
    # Both spellings of "no tachkil" are accepted: len(special) is the extra class
    # predicted by the model, -1 is what get_sentence_tachkil emits. Without this,
    # -1 would index special[-1] and wrongly attach a shadda.
    no_tachkil = (len(special), -1)

    pieces = []
    for c, t in zip(S, T):
        pieces.append(c if t in no_tachkil else c + special[t])

    return ''.join(pieces)

In [20]:
# Keep only the hemistichs whose tachkil ratio is greater than the threshold tshd=0.55

sentence = []
tachkils = []
tshd = 0.55

# Iterating over the two columns directly avoids a per-row `data.iloc[i]` lookup,
# which is very slow on a frame of this size.
right_col = data['الشطر الايمن']
left_col = data['الشطر الايسر']

for i, (right, left) in enumerate(zip(right_col, left_col)):

    if i % 100000 == 0:
        print(i, len(data), len(sentence))

    # '1' tags the right hemistich and '2' the left one in the failure log.
    for tag, hemistich in (('1', right), ('2', left)):
        try:
            clean, tachkil, ratio = get_sentence_tachkil(hemistich)
        except Exception:
            # Malformed rows are reported and skipped. `Exception` (not a bare
            # except) lets KeyboardInterrupt stop the loop.
            print(tag, i)
            continue

        if ratio > tshd:
            sentence.append(clean)
            tachkils.append(tachkil)
    

0 1684668 0
100000 1684668 15891
200000 1684668 30383


KeyboardInterrupt: 

In [21]:
# One row per kept hemistich: its characters and the matching per-character labels.
filtered_dataset = pd.DataFrame({'source': sentence, 'target': tachkils})

In [22]:
class MyDataset(Dataset):
    """Dataset over a frame that has a 'source' and a 'target' column."""

    def __init__(self, data):
        self.data = data
        self.L = len(data)

    def __len__(self):
        return self.L

    def __getitem__(self, id):
        """Return the raw characters, the label tensor and the token tensor of row `id`."""
        row = self.data.iloc[id]
        characters = row['source']

        tokens = torch.tensor(sentence2tokens(characters), dtype=torch.int64)
        labels = torch.tensor(row['target'], dtype=torch.int64)

        # Label -1 is rewritten to class 14 before the sample leaves the dataset.
        labels = labels.masked_fill(labels == -1, 14)

        return {
            'original': characters,
            'target': labels,
            'input': tokens,
        }


In [23]:
def collate_fn(batch, maxlen=50):
    """Stack a list of samples into padded `(inputs, targets)` tensors.

    Inputs are padded with 0 and targets with -1. When no input of the batch is
    longer than `maxlen`, the first sample is stretched to `maxlen` so the whole
    batch ends up `maxlen` wide; otherwise the batch is padded to its longest
    sequence.
    """
    inputs = [sample['input'] for sample in batch]
    targets = [sample['target'] for sample in batch]

    longest = max(len(seq) for seq in inputs)
    if longest <= maxlen:
        # Stretching one sequence is enough: pad_sequence aligns the others to it.
        first_input, first_target = inputs[0], targets[0]
        inputs[0] = F.pad(first_input, (0, maxlen - first_input.shape[0]), value=0)
        targets[0] = F.pad(first_target, (0, maxlen - first_target.shape[0]), value=-1)

    inputs = pad_sequence(inputs, padding_value=0, batch_first=True)
    targets = pad_sequence(targets, padding_value=-1, batch_first=True)

    return inputs, targets

In [24]:
def split_data(data, s=0.7, seed=None):
    """Randomly split `data` into two disjoint row subsets.

    Input:
        data: object supporting `len()` and `.iloc[list_of_positions]`
        s (float): fraction of the rows placed in the first subset
        seed (int | None): when given, the shuffle is reproducible and leaves the
            global `random` state untouched; when None, the global `random`
            generator is used
    Output:
        (first, second): the `data.iloc` selections holding `int(len(data) * s)`
        rows and the remaining rows respectively
    """
    import random

    ids = list(range(len(data)))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(ids)
    x = int(len(ids) * s)
    return data.iloc[ids[:x]], data.iloc[ids[x:]]

In [25]:
# Random split of the filtered samples using split_data's default ratio (s=0.7).
train_data, val_data = split_data(filtered_dataset)

# Wrap both subsets so a DataLoader can index them.
train_dataset = MyDataset(train_data)
val_dataset   = MyDataset(val_data)

# Bare expression: the notebook shows the sample counts as the cell output.
f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 24,126 samples for training, and 10,341 samples for validation testing'

In [26]:
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 128

# Training batches are reshuffled at every epoch.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# Validation keeps a fixed sample order so every evaluation sees the same batches.
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=VAL_BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

## Model:

In [27]:
class LstmModel(nn.Module):
    """Per-character tagger: embedding, three bidirectional LSTMs, three linear layers."""

    def __init__(
        self,
        emb_dim,
        vocab_size,
        output_size,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.output_size = output_size

        hidden = 256

        self.embdding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # Never used by forward(), but it registers parameters, so removing it
        # would change the keys of the state_dict.
        self.pos_embdding = nn.Embedding(15, emb_dim, padding_idx=0)

        # Each LSTM is bidirectional, so the next layer receives 2 * hidden features.
        self.lstm_layer_1 = self._bilstm(emb_dim, hidden)
        self.lstm_layer_2 = self._bilstm(self.lstm_layer_1.hidden_size * 2, hidden)
        self.lstm_layer_3 = self._bilstm(self.lstm_layer_2.hidden_size * 2, hidden)

        self.fc1 = nn.Linear(self.lstm_layer_3.hidden_size * 2, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, output_size)

        self.dropout1 = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.5)

    @staticmethod
    def _bilstm(input_size, hidden_size):
        """Build one single-layer, batch-first, bidirectional LSTM."""
        return nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )

    def forward(self, src):
        """Return one score vector of size `output_size` per position of `src`."""
        features = self.embdding(src)

        # Only the per-position outputs are used; the final states are discarded.
        for lstm in (self.lstm_layer_1, self.lstm_layer_2, self.lstm_layer_3):
            features, _ = lstm(features)

        hidden = self.dropout1(F.relu(self.fc1(features)))
        hidden = self.dropout2(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Training and evaluation helpers

In [28]:
def train_step(model, src, tgt, optimizer, scheduler, loss_fn, max_norm=0.9):
    """Run one optimisation step on a single batch and return its loss.

    Input:
        model: module mapping `src` to per-position class scores
        src, tgt: input batch and its targets; both are moved to the model's device
        optimizer: optimizer that updates the model parameters
        scheduler: optional LR scheduler, stepped after the optimizer (falsy to skip)
        loss_fn: callable(scores, targets) returning a scalar loss tensor
        max_norm (float): threshold used for gradient-norm clipping
    Output:
        float: loss of this batch
    """
    model.train()
    model.zero_grad()

    # Device and number of classes are read from the model / its output so the
    # step does not rely on notebook-level globals defined in other cells.
    model_device = next(model.parameters()).device

    out = model(src.to(model_device))

    # Flatten to (positions, classes) and (positions,) for the loss.
    out = out.reshape(-1, out.size(-1))
    tgt = tgt.reshape(-1).to(model_device)
    loss = loss_fn(out, tgt)

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()
    if scheduler:
        scheduler.step()

    return loss.item()

In [29]:
def eval_model(model, dataloader, loss_fn, log_steps = 5):
    """Return the mean per-batch loss of `model` over `dataloader`.

    Input:
        model: module mapping a source batch to per-position class scores
        dataloader: sized iterable of `(src, tgt)` batches
        loss_fn: callable(scores, targets) returning a scalar loss tensor
        log_steps (int): approximate number of progress lines printed
    Output:
        float: sum of the batch losses divided by the number of batches,
        or NaN when the dataloader yields no batch
    """
    model.eval()

    # Device and number of classes are read from the model / its output so the
    # function does not rely on notebook-level globals defined in other cells.
    model_device = next(model.parameters()).device

    eval_loss = 0
    i = 0
    n_batches = len(dataloader)

    # No gradients are needed for evaluation; this avoids building autograd graphs.
    with torch.no_grad():
        for src, tgt in dataloader:
            out = model(src.to(model_device))

            out = out.reshape(-1, out.size(-1))
            tgt = tgt.reshape(-1).to(model_device)
            eval_loss += loss_fn(out, tgt).item()

            i += 1
            if i % (n_batches / log_steps) < 1:
                print(f'        EVAL STEP: {i} / {n_batches}')

    if i == 0:
        return float('nan')

    return eval_loss / i




def calculate_accuracy(model, dataloader):
    """Return the fraction of non-padding positions that `model` labels correctly.

    Input:
        model: module mapping a source batch to per-position class scores
        dataloader: sized iterable of `(src, tgt)` batches; target -1 marks
            positions that are ignored
    Output:
        float: correct / counted positions, or 0.0 when no position is counted
    """
    model.eval()

    # Device and number of classes are read from the model / its output so the
    # function does not rely on notebook-level globals defined in other cells.
    model_device = next(model.parameters()).device

    n_batches = len(dataloader)
    # max(1, ...) avoids a modulo by zero when there are fewer than 5 batches.
    log_every = max(1, n_batches // 5)

    a, r = 0, 0
    with torch.no_grad():
        for j, (src, tgt) in enumerate(dataloader, start=1):
            out = model(src.to(model_device)).cpu()

            out = out.reshape(-1, out.size(-1))
            tgt = tgt.reshape(-1).cpu()

            out = torch.argmax(out, dim=-1)

            # Only positions whose target is not -1 take part in the score.
            valid = tgt != -1
            a += int(valid.sum())
            r += int((out[valid] == tgt[valid]).sum())

            if j % log_every == 0:
                print(f'ACC STEP {j}/{n_batches}')

    return r / a if a else 0.0

# Training:

In [30]:
VOCAB_SIZE  = len(vocab)+1 # character ids start at 1; id 0 is the padding id
OUTPUT_SIZE = len(special)+1 # one class per entry of `special`, plus one for "no tachkil"

# First positional argument (256) is the embedding dimension.
model = LstmModel(
    256,
    VOCAB_SIZE, 
    OUTPUT_SIZE).to(device)

# Suffix used in the checkpoint file name written by the training loop.
n_model = 3

optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
# Targets equal to -1 (the padding value used by collate_fn) are ignored by the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
EPOCH = 30
log_steps = 10
L_train_dataloader = len(train_dataloader)
losses = []       # loss of every training step
eval_losses = []
lrs = []          # learning rate recorded at every progress log
max_val_acc = 0
val_acc = 0

# Runs exactly EPOCH epochs (indices 0 .. EPOCH-1); the previous
# range(0, EPOCH+1) trained for one epoch more than EPOCH.
for epoch in range(EPOCH):
    print(f'EPOCH: {epoch} START TRAINING ... ----------------------------------------------------------------------')
    epoch_loss = 0
    steps_loss = 0
    # i counts the steps of the epoch, j the steps since the last progress log.
    i, j = 0, 0
    s_time = time()
    for src, tgt in train_dataloader:
        i, j = i+1, j+1
        loss = train_step(model, src, tgt, optimizer, None, loss_fn)
        losses.append(loss)
        epoch_loss += loss
        steps_loss += loss
        # True roughly `log_steps` times per epoch, also when the number of
        # batches is not a multiple of log_steps.
        if i % ( L_train_dataloader / log_steps) < 1:
            lr = optimizer.param_groups[0]['lr']
            lrs.append(lr)
            print(f'EPOCH: {epoch}    STEP: {i} / {L_train_dataloader}   STEP LOSS: {steps_loss / j:.5f}   TIME: {time() - s_time:.4f}    LR: {lr:.6f}')
            s_time = time()
            j = 0
            steps_loss = 0

    print('EVALUATION :')

    val_acc = calculate_accuracy(model, val_dataloader)
    # max(i, 1) keeps the report valid if the training loader yielded no batch.
    print(f'EPOCH: {epoch}    TRAINING LOSS: {epoch_loss / max(i, 1):.5f}   VAL ACC: {val_acc*100:.3f}')

    # Keep only the weights with the best validation accuracy seen so far.
    if val_acc > max_val_acc:
        print(f'SAVING BEST MODEL {n_model}')
        max_val_acc = val_acc
        torch.save(model.state_dict(), f'best_lstm_model_{n_model}.pt')

    print('----------------------------------------------------------------------------------------------------', end='\n\n')

# Evaluation:

In [31]:
# Load stored weights into the model; map_location places the tensors on `device`.
# NOTE(review): this path is not the `best_lstm_model_{n_model}.pt` file written by
# the training loop — confirm which checkpoint is meant to be evaluated.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load('../../models/tachkil/tachkil_model.pt', map_location=device))

<All keys matched successfully>

In [None]:
model.eval()

# Flat lists of true and predicted labels over all non-padding validation positions.
tgts = []
outs = []

# max(1, ...) avoids a modulo by zero when there are fewer than 5 batches.
log_every = max(1, len(val_dataloader) // 5)

with torch.no_grad():
    for j, (src, tgt) in enumerate(val_dataloader, start=1):

        src = src.to(device)

        out = model(src).cpu()

        out = out.reshape(-1, OUTPUT_SIZE)
        tgt = tgt.reshape(-1)

        out = torch.argmax(out, dim=-1)

        # Positions whose target is -1 are padding and are left out.
        valid = tgt != -1

        # The elements are stored as 0-d tensors here.
        tgts += list(tgt[valid])
        outs += list(out[valid])

        if j % log_every == 0:
            print(f'ACC STEP {j}/{len(val_dataloader)}')


In [None]:
# Convert the collected labels to plain ints. int() accepts 0-d tensors as well as
# ints, so re-running this cell does not fail the way `.item()` does on ints.
outs = [int(c) for c in outs]
tgts = [int(c) for c in tgts]

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay

# Overall accuracy over the collected (target, prediction) pairs.
print(accuracy_score(tgts, outs))

# Confusion matrix drawn on a large figure; normalize='true' normalises over the
# true labels, i.e. each row sums to 1.
fig, ax = plt.subplots(figsize=(20,20))
ConfusionMatrixDisplay.from_predictions(tgts, outs, ax=ax, normalize='true')

In [None]:
poem = """قُم فَاِسقِنيها قَبلَ صَوتِ الحَمام
كَرمِيَّةً تَجمَعُ شَملَ الكِرام
صَهباءَ مِمّا عَتَّقَت بابِلٌ
مِزاجُها الأَريُ وَماءُ الغَمام
مِمّا أُدِيرَ الكَأسُ مِنها عَلى
كِسرى وَنُمرُوذَ بنِ كُوشِ بنِ حام
لَوِ اِحتَساها اِبنُ الزُبَيرِ اِغتَدى
أَكرَمَ مِن كَعبٍ وَأَوسِ بنِ لام
تَذهَبُ بِاليَأسِ وَتُدني المُنى
وَتَنشُرُ اللَهوَ وَتَطوي الغَرام
أَو ذاقَها المَنزُوفُ ضَرطاً لَما
هابَ اِبنَ ذي الجَدَّينِ يَومَ الزِحام
وَحَرِّكِ الأَوتارَ وَاِذكُر لَنا
أَيّامنا الغُرَّ بَدارِ السَلام
وَتِلكُمُ الغُزلانُ بَينَ المَها
تَمشي إِلى الشَطِّ فِئاماً فِئام
مِن كُلِّ أَظمى فاتِرٍ طَرفُهُ
تُروى بِمَرآهُ القُلوبُ الحِيام""".strip().split('\n')

In [None]:
# Width short lines are padded to (the default `maxlen` of collate_fn).
MAX_LEN = 50

# Inference only: disable dropout and gradient tracking explicitly instead of
# relying on an earlier cell having called model.eval().
model.eval()
with torch.no_grad():
    for line in poem:
        # Strip the tachkil from the line; the model has to predict it again.
        S, T, _ = get_sentence_tachkil(line)

        x = torch.tensor(sentence2tokens(S), dtype=torch.int64)
        L = len(x)
        # Pad short lines only: a negative pad would crop lines longer than
        # MAX_LEN and silently truncate the prediction.
        x = F.pad(x, (0, max(0, MAX_LEN - L)), value=0)
        x = x.unsqueeze(0).to(device)

        out = model(x)
        # Keep the predictions of the real (unpadded) positions as plain ints.
        out = torch.argmax(out, dim=-1)[0, :L].cpu().tolist()

        print(f'Predicted: {rebuild_sentence(S, out)}')
        print(f'Original : {line}')
        print()