In [2]:
!pip install transformers arabert preprocess
!pip install stanza
!pip install gensim
!pip install flair
!pip install lang-trans

Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl.metadata (16 kB)
Collecting preprocess
  Downloading preprocess-2.0.0-py3-none-any.whl.metadata (1.5 kB)
Collecting farasapy (from arabert)
  Downloading farasapy-0.1.1-py3-none-any.whl.metadata (11 kB)
Collecting emoji==1.4.2 (from arabert)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading preprocess-2.0.0-py3-none-any.whl (12 kB)
Downloading farasapy-0.1.1-py3-none-any.whl (14 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=1864

In [168]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from arabert.preprocess import ArabertPreprocessor
import stanza

import numpy as np
import pickle
import os
from tqdm import tqdm
from gensim.models import FastText
from flair.data import Sentence
from flair.embeddings import CharacterEmbeddings, StackedEmbeddings
from lang_trans.arabic import buckwalter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import json

In [169]:
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)

In [170]:
# loaded_weights = np.load('/content/drive/MyDrive/NLP Pro/features/arabic_char_embeddings_256.npy')

In [171]:
# len(loaded_weights)

In [172]:
padded_path = "/kaggle/input/arabicia/padded.pkl"
val_path = "/kaggle/input/arabicia/padded_val.pkl"

diacritic2id_path = "/kaggle/input/arabicia/diacritic2id.json"
idx2char_path = "/kaggle/input/arabicia/idx2char.json"
char2idx_path = "/kaggle/input/arabicia/char2idx.json"

In [173]:
def load_data_pickle(file_path):
    with open(file_path, 'rb') as f:
        X_raw, y_raw = pickle.load(f)
    X = []
    y = []

    for text_seq, label_seq in zip(X_raw, y_raw):
        x_ids = [c for c in text_seq]
        y_ids = [t for t in label_seq]

        X.append(x_ids)
        y.append(y_ids)

    return np.array(X), np.array(y)

In [174]:
with open(diacritic2id_path, 'r', encoding='utf-8') as f:
    diacritic2id = json.load(f)

id2label = {v: k for k, v in label2id.items()}

with open(char2idx_path, 'r', encoding='utf-8') as f:
    char2idx = json.load(f)

with open(idx2char_path, 'r', encoding='utf-8') as f:
    idx2char = json.load(f)

In [175]:
sentences, tashkeel_sequences = load_data_pickle(padded_path)

In [176]:

print("Sentences:", len(sentences))
print("Example sentence:", sentences[0][0])
print("Example tashkeel sequence:", tashkeel_sequences[0][0])
print("Example char2idx size:", len(char2idx))
print("Example diacritic2id size:", len(diacritic2id), diacritic2id[tashkeel_sequences[0][0]])

Sentences: 80254
Example sentence: و
Example tashkeel sequence: َ
Example char2idx size: 54
Example diacritic2id size: 15 0


# **Features**

In [177]:
arabert_model_name = "aubmindlab/bert-base-arabertv02"
bert_tokenizer = AutoTokenizer.from_pretrained(arabert_model_name)
bert_model = AutoModel.from_pretrained(arabert_model_name)
bert_model.eval()
arabert_prep = ArabertPreprocessor(model_name=arabert_model_name)

In [178]:
# # Model name (AraELECTRA large or small)
# MODEL_NAME = "aubmindlab/araelectra-base-discriminator"

# # Load tokenizer and model
# electra_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# electra_model = AutoModel.from_pretrained(MODEL_NAME)

# # Set to evaluation mode
# electra_model.eval()


In [179]:
# stanza.download('ar')
# nlp = stanza.Pipeline("ar", processors="tokenize,pos", use_gpu=False)

In [180]:
# vocab_size = len(char2idx) + 1
# embedding_dim = 128
# char_embedding = nn.Embedding(num_embeddings=vocab_size,
#                               embedding_dim=embedding_dim,
#                               padding_idx=0)

In [181]:
def remove_pads(sentence):
    chars = [c for c in sentence if c != "<PAD>"]
    sentence_text = "".join(chars)
    text = sentence_text.replace('\uFFFD', '?')
    return text

In [182]:
new_sentences = [remove_pads(sentence) for sentence in sentences]

In [183]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    print(f"🚀 GPU Detected: {torch.cuda.get_device_name(0)}")
    print(f"   Memory Usage: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")
bert_model.to(device)

🚀 GPU Detected: Tesla T4
   Memory Usage: 1.02 GB


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(64000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

# **AraBERT**

In [184]:
def get_arabert_embeddings(sentence: str):
    
    tokens = bert_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    tokens = {k: v.to(device) for k, v in tokens.items()}

    with torch.no_grad():
        output = bert_model(**tokens)

    emb = output.last_hidden_state.squeeze(0).cpu()
    token_list = bert_tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

    return emb.numpy(), token_list

In [185]:
emb, toks = get_arabert_embeddings(new_sentences[0])
print("Embedding shape:", type(emb))
print("Tokens:", toks)

Embedding shape: <class 'numpy.ndarray'>
Tokens: ['[CLS]', 'ولو', 'جمع', 'ثم', 'علم', 'ترك', 'ركن', 'من', 'الأولى', 'بطل', '##تا', 'ويعيد', '##هما', 'جامعا', '،', 'أو', 'من', 'الثانية', '،', 'فإن', 'لم', 'يطل', 'تدارك', '،', 'وإلا', 'فب', '##اط', '##لة', 'ولا', 'جمع', '،', 'ولو', 'جهل', 'أعاد', '##هما', 'لوقت', '##يهم', '##ا', '[SEP]']


In [186]:
# tokens = []
# bert_embeddings = []

# for sent in sentences:
#   emb, tok = get_arabert_embeddings(sentences,device)
#   bert_embeddings.append(emb)
#   tokens.append(tok)

# **POS Tagging**

In [187]:
# def extract_pos_tags(sentence: str):
#     doc = nlp(sentence)
#     pos_tags = []
#     for sent_obj in doc.sentences:
#         for word in sent_obj.words:
#             pos_tags.append(word.upos)
#     return pos_tags

In [188]:
# print(extract_pos_tags(sentences[0])[:10])

# **Char Level Embedding**

In [189]:
def char_encode(sentence: str):
    return [char2idx[c] for c in sentence]

def char_embed(sentence: str):
    ids = torch.tensor(char_encode(sentence))
    return char_embedding(ids).detach().numpy()

In [190]:
# print("Char embedding:", char_embed('ر'))

# **Fast Text Word Embedding**

In [191]:
# def get_arabic_tokens(corpus):
#     data = [sentence.split() for sentence in corpus]
#     return data


In [192]:
# def train_fasttext_arabic(corpus, embedding_size=100, window_size=5, min_count=3, epochs=50, model_path="./models/ft_arabic_model"):
#     """
#     Train a FastText model on raw Arabic tokens
#     """
#     data = get_arabic_tokens(corpus)

#     # Initialize FastText model
#     ft_model = FastText(
#         vector_size=embedding_size,
#         window=window_size,
#         min_count=min_count,
#         workers=4,
#         sg=1  # Skip-gram
#     )
#     ft_model.build_vocab(corpus_iterable=data)
#     ft_model.train(corpus_iterable=data, total_examples=len(data), epochs=epochs)

#     # Save model
#     os.makedirs(os.path.dirname(model_path), exist_ok=True)
#     ft_model.save(model_path)
#     print(f"✅ FastText Arabic model saved at {model_path}")

#     return ft_model

In [193]:
# def extract_fasttext_embeddings_arabic(corpus, ft_model):
#     """
#     Extract FastText embeddings for each word in the corpus
#     Returns a list of sentences, where each sentence is a list of word vectors
#     """
#     data = get_arabic_tokens(corpus)
#     all_embeddings = []

#     for sentence in data:
#         sentence_embeddings = []
#         for word in sentence:
#             vec = ft_model.wv[word]  # FastText handles OOV words via subword info
#             sentence_embeddings.append(vec)
#         all_embeddings.append(sentence_embeddings)

#     return all_embeddings

In [194]:
# ft_model = train_fasttext_arabic(sentences)

In [195]:
# extract_fasttext_embeddings_arabic(sentences[0],ft_model)

In [196]:
# !cp -r "./models/" '/content/drive/MyDrive/NLP/'

# **FLAIR Char Embedding**

In [197]:
char_embedding = CharacterEmbeddings()

def extract_char_embeddings(sentence_text, embedding_model=None):

    if embedding_model is None:
        embedding_model = char_embedding
    sentence = Sentence(sentence_text)
    embedding_model.embed(sentence)  

    return [token.embedding.detach().cpu() for token in sentence]

In [198]:
# extract_char_embeddings(sentence_text)

# **AraELECTRA**

In [199]:
# def get_araelectra_embeddings(sentence, model, tokenizer, device="cpu"):
#     """
#     Get token-level embeddings from AraELECTRA
#     Returns a list of sentence embeddings (list of token embeddings)
#     """
#     model.to(device)
#     inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     # Get outputs (last hidden state)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
#     # Remove batch dimension and convert to list of embeddings per token
#     token_embeddings = last_hidden_state.squeeze(0)  # [seq_len, hidden_size]
#     return token_embeddings

In [200]:
# # Example: using your Buckwalter-transliterated sentences or cleaned Arabic
# ara_embeddings = get_araelectra_embeddings(sentences[0], electra_model, electra_tokenizer, device)

# print("Number of tokens in first sentence:", ara_embeddings.shape[0])
# print("Embedding dimension:", ara_embeddings.shape[1])
# print(ara_embeddings)

# **Buckwalter Translation**

In [201]:
# def get_buckwalter_translation(sentence):
#   return buckwalter.transliterate(sentence)

In [202]:
# print("Original sentence:", sentences[0])
# print("Buckwalter transliterated:", get_buckwalter_translation(sentences[0]))

In [203]:
# buckwalter_sentences = []
# for s in sentences:
#   buckwalter_sentences.append(get_buckwalter_translation(s))

# **TF-IDF**

In [204]:
# def tf_idf_features(sentences, save_path="models/tf_idf_buckwalter.csv"):
#     os.makedirs(os.path.dirname(save_path), exist_ok=True)

#     vectorizer = TfidfVectorizer(lowercase=False)  # Don't lowercase Buckwalter
#     tfidf_matrix = vectorizer.fit_transform(sentences)

#     feature_names = vectorizer.get_feature_names_out()
#     df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
#     df_tfidf.to_csv(save_path, index=False)

#     print(f"✅ TF-IDF features saved at {save_path}")
#     return df_tfidf, vectorizer

In [205]:
# tfidf_df, tfidf_vectorizer = tf_idf_features(buckwalter_sentences)
# print("TF-IDF shape:", tfidf_df.shape)
# print("Example TF-IDF features:", tfidf_df.columns[:10])

# **BOW**

In [206]:
# def bow_features(sentences, save_path="models/bow_buckwalter.csv"):
#     os.makedirs(os.path.dirname(save_path), exist_ok=True)

#     vectorizer = CountVectorizer()
#     bow_matrix = vectorizer.fit_transform(sentences)

#     feature_names = vectorizer.get_feature_names_out()
#     df_bow = pd.DataFrame(bow_matrix.toarray(), columns=feature_names)
#     df_bow.to_csv(save_path, index=False)

#     print(f"✅ Bag-of-Words features saved at {save_path}")
#     return df_bow, vectorizer

In [207]:
# bow_df, bow_vectorizer = bow_features(buckwalter_sentences)
# print("BoW shape:", bow_df.shape)
# print("Example BoW features:", bow_df.columns[:10])

# **Labeling**

In [208]:
def get_tashkeel_sequence(index: int):
    return tashkeel_sequences[index]

In [209]:
print(get_tashkeel_sequence(0)[:10])

['َ' 'َ' 'ْ' '' 'َ' 'َ' 'َ' '' 'ُ' 'َّ']


# **Full Feature Pipeline**

In [210]:
def tokens_to_word_embeddings(tokens, embeddings):
    word_embeddings = []
    current_word_embs = []

    for token, emb in zip(tokens, embeddings):
        emb_tensor = torch.tensor(emb) if isinstance(emb, np.ndarray) else emb

        if token.startswith("##"):
            current_word_embs.append(emb_tensor)
        else:
            if current_word_embs:
                word_embeddings.append(torch.mean(torch.stack(current_word_embs), dim=0))
            current_word_embs = [emb_tensor]

    if current_word_embs:
        word_embeddings.append(torch.mean(torch.stack(current_word_embs), dim=0))

    return torch.stack(word_embeddings)


In [211]:
def zizo_features(sentence: str,
    sent_index: int,
    arabert_model=None,
    arabert_tokenizer=None,
    fasttext_model=None):

  sentence_vec = []
    
  tashkeel = get_tashkeel_sequence(sent_index)

  final_tashkeel = []
  for i,char in enumerate(sentence):
    if char == ' ':
      continue
    final_tashkeel.append(tashkeel[i])
    

  arabert_emb, tokens = get_arabert_embeddings(sentence)
  final_arabert_emb = tokens_to_word_embeddings(tokens,arabert_emb)

  for i,word in enumerate(sentence.split()):
      for char in word:
        char_emb = extract_char_embeddings(char)
        bert_vec = final_arabert_emb[i]
        char_emb_array = np.array(char_emb)
        char_vector = np.concatenate([final_arabert_emb[i], char_emb_array.flatten()])
        sentence_vec.append(char_vector)
  return sentence_vec,final_tashkeel


In [212]:
f,t = zizo_features(new_sentences[0], 0,bert_model,bert_tokenizer)

In [213]:
len(f)

108

In [214]:
len(f[0])

818

In [215]:
len(new_sentences[0])

137

In [216]:
len(t)

108

In [None]:
import h5py
import numpy as np
import os
from tqdm import tqdm

H5_PATH = "/kaggle/working/zizo_dataset_2.h5"
CHECKPOINT_INDEX_PATH = "/kaggle/working/zizo_index_2.txt"

SAVE_EVERY = 100

def map_tashkeel(char_list):
    return [diacritic2id[c] for c in char_list]


start_index = 0
if os.path.exists(CHECKPOINT_INDEX_PATH):
    with open(CHECKPOINT_INDEX_PATH, "r") as f:
        start_index = int(f.read())
    print(f"Resuming from index: {start_index}")

buffer_features = []
buffer_tashkeel = []

for i in tqdm(range(start_index, len(new_sentences))):
    sent = new_sentences[i]

    f, t = zizo_features(sent, i, bert_model, bert_tokenizer)

    buffer_features.extend(f)

    t_ids = map_tashkeel(t)
    buffer_tashkeel.extend(t_ids)

    assert len(f) == len(t_ids), "They should be equal"

    if (i + 1) % SAVE_EVERY == 0 or i == len(new_sentences) - 1:

        batch_features = np.array(buffer_features, dtype=np.float32)

        batch_tashkeel = np.array(buffer_tashkeel, dtype=np.int32)

        with h5py.File(H5_PATH, 'a') as hf:
            if 'features' not in hf:
                hf.create_dataset('features', data=batch_features, maxshape=(None, 1024), chunks=True)
                hf.create_dataset('tashkeel', data=batch_tashkeel, maxshape=(None,), chunks=True)
            else:
                hf['features'].resize((hf['features'].shape[0] + batch_features.shape[0]), axis=0)
                hf['tashkeel'].resize((hf['tashkeel'].shape[0] + batch_tashkeel.shape[0]), axis=0)

                hf['features'][-batch_features.shape[0]:] = batch_features
                hf['tashkeel'][-batch_tashkeel.shape[0]:] = batch_tashkeel

        with open(CHECKPOINT_INDEX_PATH, "w") as f:
            f.write(str(i + 1))

        buffer_features = []
        buffer_tashkeel = []

 42%|████▏     | 34018/80254 [1:05:40<1:54:10,  6.75it/s]

In [None]:
# import pickle
# import os
# from tqdm import tqdm

# DRIVE_CHECKPOINT_PATH = "/kaggle/working/zizo_checkpoint.pkl"

# final_features = []
# final_tashkeel = []

# START_INDEX = 0
# SAVE_EVERY = 100

# # Resume if Drive checkpoint exists
# if os.path.exists(DRIVE_CHECKPOINT_PATH):
#     print("🔁 Resuming from checkpoint...")
#     with open(DRIVE_CHECKPOINT_PATH, "rb") as f:
#         data = pickle.load(f)
#         final_features = data["features"]
#         final_tashkeel = data["tashkeel"]
#         START_INDEX = data["index"] + 1
#     print(f"➡️ Resumed from index: {START_INDEX}")

# for i in tqdm(range(START_INDEX, len(sentences))):
#     sent = sentences[i]

#     f, t = zizo_features(sent, i, bert_model, bert_tokenizer, device)
#     final_features.append(f)
#     final_tashkeel.append(t)

#     if (i + 1) % SAVE_EVERY == 0 or i == len(sentences) - 1:
#         checkpoint_data = {
#             "features": final_features,
#             "tashkeel": final_tashkeel,
#             "index": i
#         }
#         with open(DRIVE_CHECKPOINT_PATH, "wb") as f:
#             pickle.dump(checkpoint_data, f)

#         print(f"💾 Direct checkpoint saved to Drive at index {i}")

In [None]:
with open("zizo_features.pkl", "wb") as f:
    pickle.dump(final_features, f)

In [None]:
with open("zizo_tashkeel.pkl", "wb") as f:
    pickle.dump(final_tashkeel, f)

In [None]:
!cp zizo_features.pkl '/content/drive/MyDrive/NLP Pro/zizo_features.pkl'

In [None]:
!cp zizo_tashkeel.pkl '/content/drive/MyDrive/NLP Pro/zizo_tashkeel.pkl'

In [None]:
def extract_all_features(
    sentence: str,
    sent_index: int,
    arabert_model=None,
    arabert_tokenizer=None,
    araelectra_model=None,
    araelectra_tokenizer=None,
    fasttext_model=None,
    flair_char_embed=None,
    buckwalter_enabled=True,
    device = "cpu"
):
    """
    Extract all features for a single sentence.

    Returns a dictionary with:
    - tokens
    - ArabERT embeddings (list of token embeddings)
    - POS tags
    - Flair char embeddings (list of char-level embeddings)
    - Tashkeel sequence (diacritics)
    - Buckwalter transliteration (optional)
    - FastText embeddings (list of word embeddings)
    - AraELECTRA sentence embedding
    """

    features = {}

    # --- Tokens + ArabERT embeddings ---
    arabert_emb, tokens = get_arabert_embeddings(sentence)
    features["tokens"] = tokens
    features["arabert_embeddings"] = arabert_emb

    # --- POS tags ---
    features["pos"] = extract_pos_tags(sentence)

    # --- Flair char embeddings ---
    features["char_embeddings"] = extract_char_embeddings(sentence) if flair_char_embed else None

    # --- Tashkeel / diacritics ---
    features["diacritics"] = get_tashkeel_sequence(sent_index)

    # --- Buckwalter transliteration ---
    if buckwalter_enabled:
        from lang_trans.arabic import buckwalter
        features["buckwalter"] = buckwalter.transliterate(sentence)
        features['tf-idf']=tfidf_df.loc[sent_index]
        features['bow']=bow_df.loc[sent_index]

    # --- FastText embeddings ---
    if fasttext_model:
        # split sentence into words
        words = sentence.split()
        features["fasttext_embeddings"] = [fasttext_model.wv[word] if word in fasttext_model.wv else None for word in words]

    # --- AraELECTRA sentence embedding ---
    if araelectra_model and araelectra_tokenizer:
        sent_emb = get_araelectra_embeddings(sentence, araelectra_model, araelectra_tokenizer)
        features["araelectra_embedding"] = sent_emb

    return features

In [None]:

features = extract_all_features(sentences[0], 0,bert_model,bert_tokenizer,electra_model,electra_tokenizer,
    ft_model,True,True,device)
print(features)

{'tokens': ['[CLS]', 'ولو', 'جمع', 'ثم', 'علم', 'ترك', 'ركن', 'من', 'الأولى', 'بطل', '##تا', 'ويعيد', '##هما', 'جامعا', '،', 'أو', 'من', 'الثانية', '،', 'فإن', 'لم', 'يطل', 'تدارك', '،', 'وإلا', 'فب', '##اط', '##لة', 'ولا', 'جمع', '،', 'ولو', 'جهل', 'أعاد', '##هما', 'لوقت', '##يهم', '##ا', '[SEP]'], 'arabert_embeddings': array([[-0.6696945 ,  0.24876368, -0.48978496, ...,  2.1313782 ,
         1.5763175 , -0.3865996 ],
       [-1.014409  ,  1.6490163 ,  0.13049419, ...,  0.89443123,
         1.8422562 , -0.11962915],
       [-0.6900753 ,  2.8659966 , -0.11240197, ...,  1.0457464 ,
         1.979248  ,  0.48073858],
       ...,
       [-1.0904622 ,  0.67529905,  0.23306248, ...,  0.9060189 ,
         1.6246405 , -0.3823066 ],
       [-0.53947866,  0.45413914,  0.24050564, ...,  2.2699566 ,
         2.0605586 , -0.11449639],
       [ 0.78809386,  0.7225697 , -0.9659998 , ...,  1.5367293 ,
         1.793381  , -0.25377974]], dtype=float32), 'pos': ['CCONJ', 'CCONJ', 'VERB', 'CCONJ', 'VERB

In [None]:
print(features.keys())

dict_keys(['tokens', 'arabert_embeddings', 'pos', 'char_embeddings', 'diacritics', 'buckwalter', 'tf-idf', 'bow', 'fasttext_embeddings', 'araelectra_embedding'])


In [None]:
def extract_features_for_all(sentences):
    all_features = []
    for i, sent in tqdm(enumerate(sentences), total=len(sentences)):
        feats = extract_all_features(sent, i,bert_model,bert_tokenizer,electra_model,electra_tokenizer,
    ft_model,
    True,
    True,device)
        all_features.append(feats)
    return all_features


# ---- Run feature extraction for the whole dataset ----
full_feature_dataset = extract_features_for_all(sentences)

# ---- Save to file ----
with open("arabic_diacritization_features.pkl", "wb") as f:
    pickle.dump(full_feature_dataset, f)

print("🎯 All features extracted and saved successfully!")
print("📁 Output file: arabic_diacritization_features.pkl")
print("Total samples:", len(full_feature_dataset))

In [None]:
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
# !cp arabic_diacritization_features.pkl '/content/drive/MyDrive/NLP Pro/arabic_diacritization_features.pkl'

# **AraVec** (still not working)

In [None]:
# !unzip full_grams_cbow_300_twitter.zip -d aravec_twitter_cbow_300

In [None]:
# from gensim.models import KeyedVectors

# model_path = "full_grams_cbow_300_twitter.mdl"

# aravec = KeyedVectors.load(model_path)

In [None]:
# def tokenize(text):
#     text = arabert_prep.preprocess(text)
#     return text.split()

In [167]:
import torch
import numpy as np
from flair.data import Sentence

UNK_CHAR = '\uFFFD'

print(f"🕵️ Testing embeddings for: {UNK_CHAR}")

# --- CHECK 1: AraBERT ---
try:
    # See how the tokenizer handles it
    bert_tokens = bert_tokenizer.tokenize(UNK_CHAR)
    bert_ids = bert_tokenizer.encode(UNK_CHAR, add_special_tokens=False)
    
    print(f"\n✅ AraBERT Tokenization:")
    print(f"   Tokens: {bert_tokens}") 
    print(f"   IDs:    {bert_ids}")
    
    # If ID is 100 or 1 (depending on model), it is [UNK]. That is GOOD.
    if bert_tokenizer.unk_token_id in bert_ids:
        print("   -> Mapped to [UNK] token. This is SAFE.")
    else:
        print("   -> Mapped to a specific token. This is also SAFE.")
except Exception as e:
    print(f"   ❌ AraBERT Failed: {e}")

# --- CHECK 2: Flair / CharEmbeddings ---
try:
    print(f"\n✅ Character Embeddings:")
    s = Sentence(UNK_CHAR)
    char_embedding.embed(s)
    
    vec = s[0].embedding
    print(f"   Vector Shape: {vec.shape}")
    
    # Check if it returns a vector of all zeros (which implies no embedding found)
    if torch.sum(vec) == 0:
        print("   ⚠️ WARNING: This character resulted in a ZERO vector in Flair.")
        print("      Consider changing \uFFFD to a simpler placeholder like '?'")
    else:
        print("   -> Valid vector produced. This is SAFE.")
        
except Exception as e:
    print(f"   ❌ Flair Failed: {e}")

🕵️ Testing embeddings for: �

✅ AraBERT Tokenization:
   Tokens: []
   IDs:    []
   -> Mapped to a specific token. This is also SAFE.

✅ Character Embeddings:
   Vector Shape: torch.Size([50])
   -> Valid vector produced. This is SAFE.
