In [1]:
import pandas as pd
import re

In [24]:
import json

with open("clean_shape_to_base.json", encoding="utf-8") as f:
    shape_to_base = json.load(f)

In [2]:
df = pd.read_csv("dataset/Qabas-dataset.csv")
df.head()

Unnamed: 0,lemma_id,lemma,language,pos_cat,pos,root,augmentation,number,person,gender,voice,transitivity,uninflected
0,2023254710,سَاوِي,عامية,اسم,صفة,س و ي,,مفرد,,مذكر,,,
1,2023254711,رْكِيد,عامية,اسم,اسم,ر ك د,,مفرد,,مذكر,,,
2,2023254712,دُمَاجٌ,عامية,اسم,اسم,د م ج,,مفرد,,مذكر,,,
3,2023254713,دَامِرٌ,عامية,اسم,اسم,د م ر,,مفرد,,مذكر,,,
4,2023254714,جَعَارٌ,عامية,اسم,صفة,ج ع ر,,مفرد,,مذكر,,,


In [3]:
# Step 1: Drop duplicate rows
df.drop_duplicates(inplace=True)

# Step 2: Normalize text: remove diacritics and unify character variants
def normalize_arabic(text):
    if pd.isnull(text):
        return text
    text = re.sub(r'[\u064B-\u0652]', '', text)  # Remove diacritics
    text = text.replace('ى', 'ي').replace('ة', 'ه')  # Normalize characters
    return text

df['lemma'] = df['lemma'].astype(str).apply(normalize_arabic)
df['root'] = df['root'].astype(str).apply(normalize_arabic)

In [4]:
# Step 3: Filter relevant columns for multitask learning
columns_to_use = ['lemma', 'pos_cat', 'pos', 'root', 'number', 'gender']
df = df[columns_to_use]

# Step 4: Drop rows with missing target labels
df_cleaned = df.dropna(subset=columns_to_use[1:]) 

In [5]:
from sklearn.preprocessing import LabelEncoder

# Define the columns to encode (targets)
target_columns = ['pos_cat', 'pos', 'root', 'number', 'gender']

# Create a dictionary to store the encoders and class mappings
label_encoders = {}
label_classes = {}

# Apply label encoding to each target column
for column in target_columns:
    le = LabelEncoder()
    df_cleaned[column + '_encoded'] = le.fit_transform(df_cleaned[column])
    label_encoders[column] = le
    label_classes[column] = le.classes_.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[column + '_encoded'] = le.fit_transform(df_cleaned[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[column + '_encoded'] = le.fit_transform(df_cleaned[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[column + '_encoded'] = le.fit_transform(df_cleaned[c

In [6]:
from sklearn.model_selection import train_test_split

# We'll stratify based on 'pos_cat_encoded' to maintain label distribution
stratify_label = df_cleaned['pos_cat_encoded']

# Step 1: Train + Temp (Validation + Test)
train_df, temp_df = train_test_split(
    df_cleaned,
    test_size=0.2,
    stratify=stratify_label,
    random_state=42
)

# Step 2: Split Temp into Validation and Test (10% each)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['pos_cat_encoded'],
    random_state=42
)

# Sanity check: print sizes
print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

Train size: 35710
Validation size: 4464
Test size: 4464


In [7]:
import torch
import torch.nn as nn

class MultitaskArabicModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, max_seq_len,
                 num_pos_cat, num_pos, num_root, num_number, num_gender):
        super(MultitaskArabicModel, self).__init__()

        # Embedding Layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)

        # BiLSTM Encoder
        self.encoder = nn.LSTM(input_size=embedding_dim,
                               hidden_size=hidden_dim,
                               num_layers=1,
                               batch_first=True,
                               bidirectional=True)

        # Max Pooling (over time dimension)
        self.pooling = nn.AdaptiveMaxPool1d(1)

        # Task-specific output heads
        self.fc_pos_cat = nn.Linear(hidden_dim * 2, num_pos_cat)
        self.fc_pos = nn.Linear(hidden_dim * 2, num_pos)
        self.fc_root = nn.Linear(hidden_dim * 2, num_root)
        self.fc_number = nn.Linear(hidden_dim * 2, num_number)
        self.fc_gender = nn.Linear(hidden_dim * 2, num_gender)

    def forward(self, x):
        # x: (batch_size, max_seq_len)
        x_embed = self.embedding(x)  # (batch_size, max_seq_len, embedding_dim)
        lstm_out, _ = self.encoder(x_embed)  # (batch_size, max_seq_len, hidden_dim * 2)
        lstm_out = lstm_out.transpose(1, 2)  # (batch_size, hidden_dim * 2, max_seq_len)
        pooled = self.pooling(lstm_out).squeeze(-1)  # (batch_size, hidden_dim * 2)

        # Output heads
        return {
            'pos_cat': self.fc_pos_cat(pooled),
            'pos': self.fc_pos(pooled),
            'root': self.fc_root(pooled),
            'number': self.fc_number(pooled),
            'gender': self.fc_gender(pooled)
        }


In [8]:
model = MultitaskArabicModel(
    vocab_size=48,         # Number of characters in your vocab
    embedding_dim=64,      # Size of each character embedding
    hidden_dim=128,        # BiLSTM hidden size
    max_seq_len=9,         # Max lemma length
    num_pos_cat=10,        # Replace with actual class count
    num_pos=45,
    num_root=500,
    num_number=3,
    num_gender=3
)


In [9]:
def multitask_loss(outputs, targets, loss_weights=None):
    """
    outputs: dict of model outputs for each task
    targets: dict of ground truth labels for each task
    loss_weights: dict with weights for each task loss (optional)
    """
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0

    for task in outputs.keys():
        task_loss = loss_fn(outputs[task], targets[task])
        if loss_weights:
            task_loss *= loss_weights.get(task, 1.0)
        total_loss += task_loss

    return total_loss


In [10]:
# outputs from model
outputs = model(batch_input)

# ground truth labels
targets = {
    'pos_cat': pos_cat_batch,
    'pos': pos_batch,
    'root': root_batch,
    'number': number_batch,
    'gender': gender_batch
}

# optional: assign more importance to some tasks
loss_weights = {
    'pos_cat': 1.0,
    'pos': 1.0,
    'root': 0.8,
    'number': 0.6,
    'gender': 0.6
}

loss = multitask_loss(outputs, targets, loss_weights)


NameError: name 'batch_input' is not defined