In [None]:
# Hugging Face LLM Model :
from transformers import pipeline, BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Single source of truth for the Hugging Face Hub checkpoint id, so the
# tokenizer and the encoder cannot be loaded from two different ids by accident.
ARABERT_CHECKPOINT = "aubmindlab/bert-base-arabert"
tokenizer = BertTokenizer.from_pretrained(ARABERT_CHECKPOINT)
model = BertModel.from_pretrained(ARABERT_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Persist the encoder and the tokenizer side by side in one local directory.
save_dir = "llm_sentiment_analysis_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Initiate Pipeline :
def load_models(path="llm_sentiment_analysis_model"):
    """Load the BERT tokenizer and encoder stored under ``path``.

    Args:
        path: Location passed unchanged to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple -- the model comes first.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(path)
    bert_model = BertModel.from_pretrained(path)
    return bert_model, bert_tokenizer

In [30]:
# Reload the encoder and tokenizer from the default local directory. This
# rebinds the global `model` / `tokenizer` names that text_embedding() reads.
model, tokenizer = load_models()

In [31]:
# Lookup table: Arabic word or phrase -> sign identifier.
# NOTE(review): sentiment_analysis() builds its own local dictionary with the
# same entries, so edits made here do not reach it unless they are mirrored.
stored_db_words = {
    "مرحبا": "SIGNID#001",
    "فورًا": "SIGNID#002",
    "قريب": "SIGNID#003",
    "نحن": "SIGNID#004",
    "السلام عليكم": "SIGNID#005",
    "السلاموا عليكم": "SIGNID#006",
    # NOTE(review): "SIGNID#006" is also used by the entry above -- confirm
    # whether two different entries are really meant to share one sign id.
    "فريق": "SIGNID#006"
}

In [None]:
# Natural Language Processing
import nltk
from nltk.tokenize import word_tokenize
# Fetch NLTK's 'punkt' resource; the call reports when the package is already
# installed and up to date.
# NOTE(review): tokinze_text() calls word_tokenize(..., preserve_line=True);
# confirm whether that code path needs this resource at all, and whether the
# installed NLTK release expects the 'punkt_tab' resource name instead.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tarek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [34]:
# Tokenizing Function :
def tokinze_text(text):
    """Split ``text`` into word tokens and keep only the purely alphabetic ones.

    Tokenisation is delegated to NLTK's ``word_tokenize`` with
    ``preserve_line=True``. A token is kept when, ignoring Unicode combining
    marks, it is non-empty and consists of letters only.

    Combining marks have to be ignored because ``str.isalpha()`` is False for
    any string that contains one. Arabic diacritics (e.g. the tanwin in
    "فورًا") are combining marks, so a plain ``token.isalpha()`` filter would
    silently drop every diacritized word.

    Args:
        text: The raw input string.

    Returns:
        A list of the retained tokens, unmodified (diacritics are preserved)
        and in their original order.
    """
    # Local import keeps this cell self-contained; unicodedata is stdlib.
    import unicodedata

    def _is_word(token):
        # Unicode general categories Mn / Mc / Me are the combining marks.
        base_chars = "".join(
            ch for ch in token if not unicodedata.category(ch).startswith("M")
        )
        # "".isalpha() is False, so tokens made only of marks are rejected too.
        return base_chars.isalpha()

    return [
        token
        for token in word_tokenize(text, preserve_line=True)
        if _is_word(token)
    ]

In [35]:
# Embedding Text Function :
def text_embedding(text):
    """Embed ``text`` as the mean of the encoder's last-layer token vectors.

    Uses the notebook-level ``tokenizer`` and ``model`` globals.

    The input is truncated to ``model.config.max_position_embeddings`` tokens.
    Without truncation, text longer than the encoder's position limit makes
    the forward pass fail; shorter text is unaffected.

    Args:
        text: The string to embed.

    Returns:
        The last hidden state averaged over dimension 1 and squeezed.
        Assumes ``last_hidden_state`` is (batch, tokens, hidden), so a single
        input string yields a 1-D tensor -- TODO confirm against the model docs.
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    # The average includes every returned position, special tokens included.
    return outputs.last_hidden_state.mean(dim=1).squeeze()

In [None]:
# Sentiment Analysis Model :
def sentiment_analysis(text, sign_lookup=None, min_similarity=None):
    """Map every word of ``text`` to its closest stored word and that word's sign id.

    Despite the name, no sentiment is classified. The text is tokenised with
    ``tokinze_text``, every token and every stored word is embedded with
    ``text_embedding``, and each token is paired with the stored word whose
    embedding has the highest cosine similarity.

    Args:
        text: The input sentence.
        sign_lookup: Optional mapping ``stored word -> sign id``. When omitted,
            the built-in dictionary below is used, as before.
        min_similarity: Optional cut-off. When given, a token whose best cosine
            similarity is below this value is skipped instead of being forced
            onto an unrelated stored word. ``None`` (default) keeps every
            token, as before.

    Returns:
        ``(max_words, corresponding_sign_ids)``: two parallel lists holding,
        for each retained token in order, the best-matching stored word and
        its sign id. Both are empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``sign_lookup`` is given but empty.
    """
    if sign_lookup is None:
        sign_lookup = {
            "مرحبا": "SIGNID#001",
            "فورًا": "SIGNID#002",
            "قريب": "SIGNID#003",
            "نحن": "SIGNID#004",
            "السلام عليكم": "SIGNID#005",
            "السلاموا عليكم": "SIGNID#006",
            # NOTE(review): "SIGNID#006" is used twice -- confirm this is intended.
            "فريق": "SIGNID#006"
        }
    if not sign_lookup:
        raise ValueError("sign_lookup must contain at least one stored word")

    tokens = tokinze_text(text)
    if not tokens:
        # Nothing to match: skip the model forward passes for the vocabulary.
        return [], []

    # Insertion order is kept so that ties resolve to the earliest stored word.
    stored_words = list(sign_lookup)

    def _embed_all(items):
        # One row per item, converted explicitly to NumPy for scikit-learn.
        stacked = torch.stack([text_embedding(item) for item in items])
        return stacked.detach().cpu().numpy()

    # One vectorised call: rows are tokens, columns are stored words.
    similarities = cosine_similarity(_embed_all(tokens), _embed_all(stored_words))
    best_columns = similarities.argmax(axis=1)

    max_words = []
    corresponding_sign_ids = []
    for row, column in enumerate(best_columns):
        if min_similarity is not None and similarities[row, column] < min_similarity:
            continue
        best_word = stored_words[column]
        max_words.append(best_word)
        corresponding_sign_ids.append(sign_lookup[best_word])

    return max_words, corresponding_sign_ids

In [40]:
# Demo: match each alphabetic token of a sample sentence to its nearest stored
# word; the trailing "!" is removed by the isalpha() filter in tokinze_text().
# NOTE(review): the first word "مرحيا" differs from the stored "مرحبا" by one
# letter -- presumably a deliberate misspelling to exercise the fuzzy matching;
# confirm.
words, signsid = sentiment_analysis(text = "مرحيا نحن مجموعة !")

In [None]:
# Best-matching stored word for each input token (first value returned by
# sentiment_analysis):
words

['مرحبا', 'نحن', 'فريق']

In [None]:
# Sign ids of the matched stored words, in the same order as `words`:
signsid

['SIGNID#001', 'SIGNID#004', 'SIGNID#006']