<a href="https://colab.research.google.com/github/ussef11/Certification_MERN/blob/main/NLP_SVM(Discriminative).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from pathlib import Path
from datetime import datetime as dt
import pandas as pd
from google.colab import files
import os
from glob import glob


In [6]:
# Mount Google Drive at /content/drive; the corpus path used later lives under this mount point.
from  google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
def read_bio_file(path, encoding="utf-8"):
    """Parse a BIO-style annotation file into a list of sentences.

    Every non-blank line is expected to carry the token in its first column
    and the tag in its last column. Columns may be separated by spaces or
    tabs. A blank line closes the current sentence.

    Args:
        path: Path of the annotation file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A list of sentences, each sentence being a list of ``(token, tag)``
        tuples. Lines with fewer than two columns are skipped.
    """
    sentences = []
    sent = []

    with open(path, "r", encoding=encoding) as f:
        for line in f:
            line = line.strip()

            # blank line = sentence boundary
            if not line:
                if sent:
                    sentences.append(sent)
                    sent = []
                continue

            # expect: "token TAG". Tabs are treated like spaces so that
            # tab-separated files are parsed instead of being silently
            # discarded as malformed one-column lines.
            parts = line.replace("\t", " ").split(" ")
            if len(parts) < 2:
                # skip malformed lines (rare)
                continue

            # First column is the token; the last column is the tag, which
            # also tolerates repeated separators or extra middle columns.
            sent.append((parts[0], parts[-1]))

    # last sentence if file doesn't end with blank line
    if sent:
        sentences.append(sent)

    return sentences

def read_bio_folder(folder_path):
    """Load every ``.txt`` file found under ``folder_path`` (recursively).

    Files are processed in sorted path order and their sentences are
    concatenated into one list.

    Args:
        folder_path: Root directory to search.

    Returns:
        A tuple ``(sentences, file_paths)``: the combined sentence list and
        the sorted list of files that were read.
    """
    pattern = os.path.join(folder_path, "**", "*.txt")
    txt_paths = sorted(glob(pattern, recursive=True))

    collected = []
    for txt_path in txt_paths:
        collected += read_bio_file(txt_path)

    return collected, txt_paths


In [8]:
# Corpus root on the mounted Drive; every .txt file below it is read.
DATA_PATH = '/content/drive/MyDrive/AQMAR_Arabic_NER_corpus-1.0'

# sents: list of sentences, each a list of (token, tag) pairs; file_list: the files that were read.
sents, file_list = read_bio_folder(DATA_PATH)

# Sanity check on what was loaded. sents[0] raises IndexError if nothing was found under DATA_PATH.
print("Number of files:", len(file_list))
print("Number of sentences:", len(sents))
print("First sentence length:", len(sents[0]))

print("First 10 tokens/tags in first sentence:")
for tok, tag in sents[0][:10]:
    print(tok, tag)


Number of files: 40
Number of sentences: 2689
First sentence length: 14
First 10 tokens/tags in first sentence:
الذرة O
هي O
أصغر O
جزء O
من O
العنصر O
الكيميائي O
الذي O
يحتفظ O
بالخصائص O


In [9]:
# Separate each sentence into parallel token and tag lists.
# NOTE: this runs before normalize_tags(); both lists are rebuilt after normalization in a later cell.
tokens = [[tok for tok, tag in sent] for sent in sents]
tags   = [[tag for tok, tag in sent] for sent in sents]

# Peek at the first sentence.
print(tokens[0][:10])
print(tags[0][:10])

['الذرة', 'هي', 'أصغر', 'جزء', 'من', 'العنصر', 'الكيميائي', 'الذي', 'يحتفظ', 'بالخصائص']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [10]:
def normalize_tags(sents):
    """Return a new sentence list in which the tag "0" (digit zero) becomes "O".

    Tokens and every other tag are passed through unchanged; the input
    lists are not modified.

    Args:
        sents: List of sentences, each a list of ``(token, tag)`` pairs.

    Returns:
        A new list of sentences with the same structure.
    """
    return [
        [(tok, "O" if tag == "0" else tag) for tok, tag in sent]
        for sent in sents
    ]

# Rebind sents to the normalized copy; the step is idempotent, so re-running the cell is harmless.
sents = normalize_tags(sents)


In [11]:
# Rebuild the parallel token/tag lists from the normalized sentences.
tokens = [[tok for tok, tag in sent] for sent in sents]
tags   = [[tag for tok, tag in sent] for sent in sents]


# Peek at the first sentence again after normalization.
print(tokens[0][:10])
print(tags[0][:10])

['الذرة', 'هي', 'أصغر', 'جزء', 'من', 'العنصر', 'الكيميائي', 'الذي', 'يحتفظ', 'بالخصائص']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [12]:
def token_features(sent, i):
    """Build the feature dictionary for the token at position ``i``.

    Only the word form (element 0 of each pair) is read; tags are ignored.

    Args:
        sent: Sequence of ``(token, tag)`` pairs forming one sentence.
        i: Index of the token to describe.

    Returns:
        A dict with the word itself, its length, a digit flag and 2/3-char
        prefixes and suffixes, plus the neighbouring words with their
        2-char prefix/suffix. ``"BOS"`` is set when there is no previous
        token and ``"EOS"`` when there is no next token.
    """
    word = sent[i][0]
    feats = dict(
        w=word,
        w_len=len(word),
        is_digit=word.isdigit(),
        pref2=word[:2],
        pref3=word[:3],
        suf2=word[-2:],
        suf3=word[-3:],
    )

    # Left context, or a sentence-start marker when there is none.
    if not i > 0:
        feats["BOS"] = True
    else:
        left = sent[i - 1][0]
        feats["prev_w"] = left
        feats["prev_pref2"] = left[:2]
        feats["prev_suf2"] = left[-2:]

    # Right context, or a sentence-end marker when there is none.
    if not i < len(sent) - 1:
        feats["EOS"] = True
    else:
        right = sent[i + 1][0]
        feats["next_w"] = right
        feats["next_pref2"] = right[:2]
        feats["next_suf2"] = right[-2:]

    return feats


# Smoke test: features of the very first token of the corpus (BOS expected, since i == 0).
test = token_features(sents[0], 0)
print(test)

{'w': 'الذرة', 'w_len': 5, 'is_digit': False, 'pref2': 'ال', 'pref3': 'الذ', 'suf2': 'رة', 'suf3': 'ذرة', 'BOS': True, 'next_w': 'هي', 'next_pref2': 'هي', 'next_suf2': 'هي'}


In [13]:
# Flatten the corpus to one row per token: feature dict, gold tag and the
# index of the sentence the token came from (used later as the split group).
X_feats, y_labels, sent_ids = [], [], []

for sid, sent in enumerate(sents):
    n_tokens = len(sent)
    X_feats.extend(token_features(sent, pos) for pos in range(n_tokens))
    y_labels.extend(pair[1] for pair in sent)
    sent_ids.extend([sid] * n_tokens)

# The first two counts must match; the third is the number of distinct sentences.
len(X_feats), len(y_labels), len(set(sent_ids))


(143432, 143432, 2689)

In [14]:
from sklearn.model_selection import GroupShuffleSplit

# Split on sentence ids (groups=sent_ids) instead of on individual token rows.
# NOTE: n_splits=5 configures five random splits, but only the first one is consumed via next().
gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_feats, y_labels, groups=sent_ids))

# Materialize the train/test rows from the index arrays.
X_train = [X_feats[i] for i in train_idx]
y_train = [y_labels[i] for i in train_idx]

X_test  = [X_feats[i] for i in test_idx]
y_test  = [y_labels[i] for i in test_idx]

# Row counts of each side (token rows, not sentences).
len(X_train), len(X_test)


(103597, 39835)

In [1]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# NOTE: this cell needs X_train / y_train / X_test from the split cell above.
# The NameError recorded in its output came from executing it in a fresh
# kernel (execution count 1) before the earlier cells had run.

# Pipeline: feature dicts -> sparse matrix (DictVectorizer) -> linear SVM.
svm_model = Pipeline([
    ("vec", DictVectorizer(sparse=True)),
    # Fixed seed (same value as the train/test split) so that repeated runs
    # give the same fitted model instead of depending on solver randomness.
    ("clf", LinearSVC(random_state=42))
])

svm_model.fit(X_train, y_train)
# Per-token tag predictions for the held-out rows, used by the evaluation cell.
y_pred = svm_model.predict(X_test)


NameError: name 'X_train' is not defined

In [None]:
from sklearn.metrics import classification_report

# Per-tag token-level metrics on the held-out rows, printed with 4 decimal places.
# Requires y_test and y_pred from the split and training cells.
print(classification_report(y_test, y_pred, digits=4))


In [None]:
import re

def simple_ar_tokenize(text):
    """Split raw text into word and punctuation tokens.

    Latin and Arabic punctuation marks are detached from adjacent words so
    that each mark becomes its own token, then the text is split on
    whitespace.

    Args:
        text: Raw input string.

    Returns:
        List of token strings; an empty list when ``text`` is empty or
        contains only whitespace.
    """
    # Pad every punctuation mark with spaces so it separates from neighbours.
    text = re.sub(r"([.,!?;:()\[\]{}\"'،؛؟])", r" \1 ", text)
    # Collapse whitespace runs to one space and trim both ends.
    text = re.sub(r"\s+", " ", text).strip()
    # "".split(" ") would give [""], i.e. a bogus empty token; return [] instead.
    return text.split(" ") if text else []

def predict_sentence_svm(model, text):
    """Tokenize ``text`` and predict one tag per token with ``model``.

    Args:
        model: Fitted estimator whose ``predict`` accepts a list of feature
            dicts as produced by ``token_features``.
        text: Raw sentence to tag.

    Returns:
        A tuple ``(tokens, tags)`` of equal-length lists. Both are empty
        when ``text`` contains no tokens; the model is not called then.
    """
    # Discard any empty-string tokens the tokenizer may produce for blank
    # input, so the model is never asked to tag an empty word.
    tokens = [t for t in simple_ar_tokenize(text) if t]
    if not tokens:
        return tokens, []

    # token_features reads only the word form, so the "O" tag is a placeholder.
    sent = [(t, "O") for t in tokens]
    X = [token_features(sent, i) for i in range(len(sent))]
    y_pred = model.predict(X)
    return tokens, list(y_pred)


In [None]:
# Hand-written Arabic sample used to try the trained tagger on unseen text.
test_text = " والمسلمين القسطنطينية بدأت الحملات  الصَليبية .في القرن الحادي عشر أوروبا"
# NOTE: this rebinds the global `tokens` (previously the corpus-level list of token lists) to this sentence's tokens.
tokens, pred_tags = predict_sentence_svm(svm_model, test_text)

# Print one "token tag" pair per line.
for t, tag in zip(tokens, pred_tags):
    print(t, tag)
