<a href="https://colab.research.google.com/github/theoneandtheonlytaghyan/DEPI-Assignment/blob/main/raxi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas sentence-transformers scikit-learn torch nltk spacy matplotlib seaborn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
def clean_composition(text: str):
    """
    Split a raw "Composition" value into an ingredient name and a concentration.

    Only the first component of a multi-ingredient value is considered
    (components are separated by " + "). The name is that component with its
    parenthesised part removed; the concentration is the text inside the first
    pair of parentheses.

    Example:
        "Ambroxol (30mg/5ml)" -> ("Ambroxol", "30mg/5ml")

    Args:
        text: Raw cell value. Missing values (None / NaN) are accepted, and
            non-string values are converted with ``str`` before parsing.

    Returns:
        tuple: ``(name, concentration)``. ``name`` is ``""`` for missing input;
        ``concentration`` is ``None`` when there is no non-blank parenthesised
        part.
    """
    if pd.isna(text):
        return "", None
    # Coerce defensively: a column read from CSV can hold non-string cells
    # (e.g. numbers), which have no .split() method.
    text = str(text)
    # Keep only the first component of a "a + b" combination.
    main_part = text.split(" + ")[0]
    # Content of the first "(...)" group, trimmed so "( 30mg )" == "(30mg)".
    match = re.search(r'\(([^)]+)\)', main_part)
    concentration = (match.group(1).strip() or None) if match else None
    # Drop the parenthesised part to obtain the bare ingredient name.
    composition_clean = re.sub(r'\s*\([^)]+\)', '', main_part)
    return composition_clean.strip(), concentration

# Quick smoke test of clean_composition on a single sample value
example_text = "Ambroxol (30mg/5ml)"
print("Test clean_composition:", clean_composition(example_text))

In [None]:
# Load the dataset
df = pd.read_csv('/content/raxi.csv')

# Derive `composition_clean` and `concentration` from the raw `Composition`
# column. clean_composition returns a (name, concentration) tuple per row;
# building ONE DataFrame from the list of tuples avoids constructing a separate
# pd.Series object for every row, which is what made the apply() variant slow.
df[['composition_clean', 'concentration']] = pd.DataFrame(
    df['Composition'].map(clean_composition).tolist(),
    index=df.index,  # keep row alignment with df
    columns=['composition_clean', 'concentration'],
)

# Preview the result
df.head()

In [None]:
# Descriptive statistics for the DataFrame's columns
df.describe()

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count of missing values per column
df.isnull().sum()

In [None]:
# Number of rows that duplicate an earlier row
df.duplicated().sum()

In [None]:
# Remove duplicated rows; `df` is rebound to the de-duplicated frame
df = df.drop_duplicates()

In [None]:
# Re-check the duplicated-row count after de-duplication
df.duplicated().sum()

In [None]:
# First 10 rows
df.head(10)

In [None]:
# The raw `Composition` column is dropped here; `composition_clean` and
# `concentration` were derived from it when the data was loaded.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
df = df.drop(columns=['Composition'], errors='ignore')

In [None]:
# Preview the frame after dropping the raw Composition column
df.head()

In [None]:
# Last rows of the frame
df.tail()

In [None]:
# Consolidated data-quality report, one section per line group:
# missing values per column, descriptive statistics, column dtypes,
# and the number of duplicated rows.
print(
    df.isnull().sum(),
    df.describe(),
    df.dtypes,
    df.duplicated().sum(),
    sep='\n',
)

In [None]:
# Frequency table for each categorical/text column of interest, printed in
# this order: concentration, composition_clean, Uses, Side_effects.
print(
    *(
        df[column].value_counts()
        for column in ('concentration', 'composition_clean', 'Uses', 'Side_effects')
    ),
    sep='\n',
)

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the DistilBERT tokenizer and a sequence-classification model with two
# output labels.
# NOTE(review): `model` is rebound to a SentenceTransformer in a later cell of
# this notebook, and no use of this classification model is visible before
# that rebinding — confirm it is still needed.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Combine uses and side effects into one free-text field per medicine.
# fillna('') prevents a missing value in EITHER column from turning the whole
# concatenation into NaN (which a later astype(str) would render as the literal
# text "nan"); astype(str) guards against non-string cells.
df['text'] = (
    df['Uses'].fillna('').astype(str)
    + ' '
    + df['Side_effects'].fillna('').astype(str)
).str.strip()

In [None]:
import nltk
nltk.download('punkt_tab')
# Force every entry of 'text' to str, then split each entry into word tokens
df['text'] = df['text'].astype(str)
df['tokens'] = df['text'].map(word_tokenize)

In [None]:
import nltk
nltk.download('stopwords')  # Download the stop-word corpus

stop_words = set(stopwords.words('english'))
# The stop-word set is compared case-insensitively: the tokens have not been
# lower-cased at this point, so a plain membership test lets capitalised stop
# words such as "The" or "And" slip through the filter.
df['tokens'] = df['tokens'].apply(
    lambda x: [word for word in x if word.lower() not in stop_words]
)

In [None]:

nltk.download('wordnet')  # Download the WordNet corpus used by the lemmatizer

lemmatizer = WordNetLemmatizer()
# Replace every token by its lemma, keeping one token list per row
df['tokens'] = df['tokens'].map(
    lambda token_list: [lemmatizer.lemmatize(token) for token in token_list]
)

In [None]:
# Rebuild the cleaned text by joining each row's tokens with single spaces
df['text'] = df['tokens'].map(' '.join)

In [None]:
# Tokenize the cleaned texts with truncation and padding enabled
encodings = tokenizer(df['text'].tolist(), truncation=True, padding=True)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model (this rebinds the name `model`)
model = SentenceTransformer('all-MiniLM-L6-v2')

# One text per medicine: its uses followed by its side effects, each coerced
# to str so missing values do not break the concatenation
texts = [
    str(uses) + ' ' + str(side_effects)
    for uses, side_effects in zip(df['Uses'], df['Side_effects'])
]
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)

# Sanity check on the embedding matrix
print(embeddings.shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the text embeddings
similarities = cosine_similarity(embeddings)
print(similarities.shape)

In [None]:
# Find pairs of medicines whose uses/side-effects texts are highly similar.
SIMILARITY_THRESHOLD = 0.8  # cosine-similarity cut-off for "similar"

# Vectorised replacement for a nested Python loop that performed a df.iloc row
# lookup for every candidate pair.
# np.triu(..., k=1) keeps only the strict upper triangle, so each unordered
# pair is reported once and self-pairs are excluded; np.nonzero walks the mask
# row by row, which yields the same (i, j) order as `for i ... for j > i`.
medicine_names = df['Medicine Name'].tolist()
pair_rows, pair_cols = np.nonzero(np.triu(similarities > SIMILARITY_THRESHOLD, k=1))
similar_drugs = [
    (medicine_names[i], medicine_names[j])
    for i, j in zip(pair_rows.tolist(), pair_cols.tolist())
]

print(similar_drugs)

In [None]:
# Inspect the results
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of every entry of the similarity matrix.
# NOTE(review): flatten() includes the diagonal and both (i, j) and (j, i)
# entries of the matrix — confirm that is the intended population.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(similarities.flatten(), bins=50, ax=ax)
ax.set_title('Distribution of Similarities')
ax.set_xlabel('Similarity')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
def calculate_compatibility(prescription, dispensed_medicine, df, model):
    """
    Score how well a dispensed medicine matches a prescription.

    The score is a weighted sum: 40% active-ingredient match, 40% concentration
    match, 10% semantic similarity of the "uses" texts and 10% semantic
    similarity of the "side effects" texts.

    Args:
        prescription (dict): Must provide the keys 'composition_clean',
            'concentration', 'Uses' and 'Side_effects'.
        dispensed_medicine (str): Medicine name looked up in
            ``df['Medicine Name']``. An exact match is tried first; if there is
            none, a case-insensitive, whitespace-trimmed match is tried.
        df (DataFrame): Medicine table with the columns 'Medicine Name',
            'Uses', 'Side_effects', 'composition_clean' and 'concentration'.
        model (SentenceTransformer): Model whose ``encode`` embeds the texts.

    Returns:
        float: Compatibility percentage in [0, 100]; 0.0 when the medicine is
        not found in ``df``.
    """

    def _is_missing(value):
        # None, NaN or pd.NA; strings are never treated as missing here.
        return value is None or (not isinstance(value, str) and bool(pd.isna(value)))

    def _match_key(value, drop_spaces=False):
        # Canonical form for equality checks. Missing stays None so that
        # "missing vs. missing" keeps behaving like plain `==` on None.
        if _is_missing(value):
            return None
        parts = str(value).casefold().split()
        return ("" if drop_spaces else " ").join(parts)

    def _as_text(value):
        # Text handed to the encoder; missing values become an empty string.
        return "" if _is_missing(value) else str(value).strip()

    # Look up the dispensed medicine: exact name first, tolerant match second.
    matches = df[df['Medicine Name'] == dispensed_medicine]
    if matches.empty:
        wanted = _match_key(dispensed_medicine)
        if wanted is not None:
            normalised_names = df['Medicine Name'].map(_match_key)
            matches = df[normalised_names == wanted]
    if matches.empty:
        return 0.0  # medicine not found -> 0%
    medicine_data = matches.iloc[0]

    # Embed the four texts in a single encode call instead of four.
    texts = [
        _as_text(prescription['Uses']),
        _as_text(medicine_data['Uses']),
        _as_text(prescription['Side_effects']),
        _as_text(medicine_data['Side_effects']),
    ]
    embeddings = model.encode(texts)

    def _similarity(i, j):
        # A blank side carries no information, so it contributes nothing.
        if not texts[i] or not texts[j]:
            return 0.0
        value = util.cos_sim(embeddings[i], embeddings[j]).item()
        # Cosine similarity can be negative (or exceed 1 by rounding error);
        # clamp so the final score stays a valid percentage.
        return min(1.0, max(0.0, value))

    uses_similarity = _similarity(0, 1)
    side_effects_similarity = _similarity(2, 3)

    # Ingredient / concentration comparison, tolerant of case and spacing
    # ("400 MG" == "400mg", " bevacizumab" == "Bevacizumab").
    composition_match = (
        _match_key(prescription['composition_clean'])
        == _match_key(medicine_data['composition_clean'])
    )
    concentration_match = (
        _match_key(prescription['concentration'], drop_spaces=True)
        == _match_key(medicine_data['concentration'], drop_spaces=True)
    )
    composition_score = 1 if composition_match else 0
    concentration_score = 1 if concentration_match else 0

    # Weighted final score, expressed as a percentage.
    compatibility_score = (
        0.4 * composition_score
        + 0.4 * concentration_score
        + 0.1 * uses_similarity
        + 0.1 * side_effects_similarity
    ) * 100

    return min(100.0, compatibility_score)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Hand-written sample prescriptions used as a tiny evaluation set
test_prescriptions = [
    {'composition_clean': 'Paracetamol', 'concentration': '500mg', 'Uses': 'Headache, Fever', 'Side_effects': 'Nausea, dizziness'},
    {'composition_clean': 'Ibuprofen', 'concentration': '200mg', 'Uses': 'Pain, Inflammation', 'Side_effects': 'Stomach upset, Headache'},

]

# Dispensed medicines that every prescription is scored against
test_medicines = ['Panadol Extra', 'Advil', 'Tylenol', 'Motrin']

# One score per (prescription, medicine) pair, prescriptions in the outer loop
test_scores = [
    calculate_compatibility(prescription, medicine, df, model)
    for prescription in test_prescriptions
    for medicine in test_medicines
]

# Reference scores, listed in the same pair order as test_scores.
# NOTE(review): these values are hard-coded in the notebook; their provenance
# is not visible here.
actual_scores = [90, 80, 10, 5, 95, 85, 15, 10]
mse = mean_squared_error(actual_scores, test_scores)
print(f'Mean Squared Error: {mse:.2f}')

In [None]:
!pip install datasets
from datasets import Dataset, DatasetDict # Import Dataset and DatasetDict
from sentence_transformers import InputExample, losses, SentenceTransformer, util
from torch.utils.data import DataLoader

# إعداد بيانات التدريب
train_examples = []
for _, row in df.iterrows():
    # Ensure 'Uses' and 'Side_effects' are strings
    uses = str(row['Uses'])
    side_effects = str(row['Side_effects'])
    train_examples.append(InputExample(texts=[uses, side_effects], label=1.0))

# Convert train_examples to a Hugging Face Dataset
train_dataset = Dataset.from_list([{"texts": example.texts, "label": example.label} for example in train_examples])
# Wrap the dataset in a DatasetDict
train_dataset = DatasetDict({"train": train_dataset})

# Create a dataloader from the Hugging Face Dataset
train_dataloader = DataLoader(train_dataset['train'], shuffle=True, batch_size=16)
# حساب درجات التوافق بعد التدريب
fine_tuned_scores = []
for prescription in test_prescriptions:
    for medicine in test_medicines:
        score = calculate_compatibility(prescription, medicine, df, model)
        fine_tuned_scores.append(score)

# تقييم أداء النموذج بعد التدريب
fine_tuned_mse = mean_squared_error(actual_scores, fine_tuned_scores)
print(f'Fine-tuned Mean Squared Error: {fine_tuned_mse:.2f}')

# رسم توزيع درجات التوافق
plt.figure(figsize=(10, 6))
plt.hist(test_scores, bins=20, alpha=0.5, label='Original')
plt.hist(fine_tuned_scores, bins=20, alpha=0.5, label='Fine-tuned')
plt.title('Distribution of Compatibility Scores')
plt.xlabel('Compatibility Score')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Example: score one hand-written prescription against one dispensed medicine
new_prescription = dict(
    composition_clean='Bevacizumab',
    concentration='400mg',
    Uses=' Cancer of colon ',
    Side_effects='Headache ',
)
new_medicine = 'Avastin 400mg Injection'

new_compatibility_score = calculate_compatibility(
    new_prescription,
    new_medicine,
    df,
    model,
)
print(f'New Compatibility Score: {new_compatibility_score:.2f}%')

In [None]:
# دالة لحساب درجة التوافق
def calculate_compatibility(prescription, dispensed_medicine, df, model):
    """
    Score how well a dispensed medicine matches a prescription.

    The score is a weighted sum: 40% active-ingredient match, 40% concentration
    match, 10% semantic similarity of the "uses" texts and 10% semantic
    similarity of the "side effects" texts.

    NOTE(review): this notebook defines ``calculate_compatibility`` in an
    earlier cell as well; this later definition is the one in effect once this
    cell has run. It differs by printing a message when the medicine is not
    found.

    Args:
        prescription (dict): Must provide the keys 'composition_clean',
            'concentration', 'Uses' and 'Side_effects'.
        dispensed_medicine (str): Medicine name looked up in
            ``df['Medicine Name']``. An exact match is tried first; if there is
            none, a case-insensitive, whitespace-trimmed match is tried.
        df (DataFrame): Medicine table with the columns 'Medicine Name',
            'Uses', 'Side_effects', 'composition_clean' and 'concentration'.
        model (SentenceTransformer): Model whose ``encode`` embeds the texts.

    Returns:
        float: Compatibility percentage in [0, 100]; 0.0 (after printing an
        error message) when the medicine is not found in ``df``.
    """

    def _is_missing(value):
        # None, NaN or pd.NA; strings are never treated as missing here.
        return value is None or (not isinstance(value, str) and bool(pd.isna(value)))

    def _match_key(value, drop_spaces=False):
        # Canonical form for equality checks. Missing stays None so that
        # "missing vs. missing" keeps behaving like plain `==` on None.
        if _is_missing(value):
            return None
        parts = str(value).casefold().split()
        return ("" if drop_spaces else " ").join(parts)

    def _as_text(value):
        # Text handed to the encoder; missing values become an empty string.
        return "" if _is_missing(value) else str(value).strip()

    # Look up the dispensed medicine: exact name first, tolerant match second.
    matches = df[df['Medicine Name'] == dispensed_medicine]
    if matches.empty:
        wanted = _match_key(dispensed_medicine)
        if wanted is not None:
            normalised_names = df['Medicine Name'].map(_match_key)
            matches = df[normalised_names == wanted]
    if matches.empty:
        print(f"Error: Medicine '{dispensed_medicine}' not found in the database.")
        return 0.0
    medicine_data = matches.iloc[0]

    # Embed the four texts in a single encode call instead of four.
    texts = [
        _as_text(prescription['Uses']),
        _as_text(medicine_data['Uses']),
        _as_text(prescription['Side_effects']),
        _as_text(medicine_data['Side_effects']),
    ]
    embeddings = model.encode(texts)

    def _similarity(i, j):
        # A blank side carries no information, so it contributes nothing.
        if not texts[i] or not texts[j]:
            return 0.0
        value = util.cos_sim(embeddings[i], embeddings[j]).item()
        # Cosine similarity can be negative (or exceed 1 by rounding error);
        # clamp so the final score stays a valid percentage.
        return min(1.0, max(0.0, value))

    uses_similarity = _similarity(0, 1)
    side_effects_similarity = _similarity(2, 3)

    # Ingredient / concentration comparison, tolerant of case and spacing
    # ("400 MG" == "400mg", " bevacizumab" == "Bevacizumab").
    composition_match = (
        _match_key(prescription['composition_clean'])
        == _match_key(medicine_data['composition_clean'])
    )
    concentration_match = (
        _match_key(prescription['concentration'], drop_spaces=True)
        == _match_key(medicine_data['concentration'], drop_spaces=True)
    )
    composition_score = 1 if composition_match else 0
    concentration_score = 1 if concentration_match else 0

    # Weighted final score, expressed as a percentage.
    compatibility_score = (
        0.4 * composition_score
        + 0.4 * concentration_score
        + 0.1 * uses_similarity
        + 0.1 * side_effects_similarity
    ) * 100

    return min(100.0, compatibility_score)

# واجهة المستخدم
def user_test(df, model):
    """
    Interactively collect a prescription and a dispensed medicine name, then
    print their compatibility score.

    Args:
        df (DataFrame): Medicine table passed through to
            ``calculate_compatibility``.
        model (SentenceTransformer): Model passed through to
            ``calculate_compatibility``.

    Returns:
        None. The score is printed, not returned.
    """

    def _ask(prompt):
        # Strip surrounding whitespace: a stray space in a typed answer would
        # otherwise defeat the exact-match comparisons made downstream.
        return input(prompt).strip()

    print("مرحباً بك في اختبار ملائمة الدواء!")
    composition_clean = _ask("أدخل اسم المادة الفعالة: ")
    concentration = _ask("أدخل التركيز: ")
    uses = _ask("أدخل الاستخدامات: ")
    side_effects = _ask("أدخل الآثار الجانبية: ")
    dispensed_medicine = _ask("أدخل اسم الدواء المصروف: ")

    # Same key layout that calculate_compatibility reads from a prescription.
    prescription = {
        'composition_clean': composition_clean,
        'concentration': concentration,
        'Uses': uses,
        'Side_effects': side_effects
    }

    compatibility_score = calculate_compatibility(prescription, dispensed_medicine, df, model)
    print(f'درجة التوافق: {compatibility_score:.2f}%')

# Run the interactive test.
# NOTE: user_test reads its answers with input(), so this cell waits for
# keyboard input when executed.
user_test(df, model)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save the model to Google Drive
model.save('/content/drive/MyDrive/my_model')

print("تم حفظ النموذج بنجاح في Google Drive في مجلد 'my_model'.")