In [1]:
# Title: Naive Bayes Drug Recommendation Model with Multi-Label Classification (Filtered)
# Description: This notebook builds a supervised learning model for drug recommendation using a Naive Bayes classifier wrapped in OneVsRestClassifier for multi-label classification. Rare labels are filtered to improve model performance.

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
import joblib


In [2]:
# Read the raw drug-review training set from the working directory
data = pd.read_csv(filepath_or_buffer='drugsComTrain_raw.csv')

# Preview the first five rows; the bare expression is rendered as the cell output
data.head(n=5)


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [3]:
# Normalise the condition text so case/whitespace variants of the same
# condition collapse into one category
data['condition'] = data['condition'].str.lower().str.strip()

# One-hot encode the condition; it is the only input feature of the model.
# (get_dummies ignores NaN by default, so rows with a missing condition
# become all-zero feature rows.)
X = pd.get_dummies(data['condition'])

# Each review names exactly one drug; wrap it in a list so that
# MultiLabelBinarizer can turn it into a multi-label indicator matrix
mlb = MultiLabelBinarizer()
data['drug_labels'] = data['drugName'].apply(lambda x: [x])
y = mlb.fit_transform(data['drug_labels'])

# Number of reviews per drug (column sums of the indicator matrix)
label_counts = np.sum(y, axis=0)

# Keep only drugs that appear at least this many times
min_label_frequency = 5
common_labels = np.where(label_counts >= min_label_frequency)[0]

# Drop the rare-drug columns. Rows are NOT dropped, so a review of a rare
# drug is left with an all-zero target row.
y_filtered = y[:, common_labels]

# Build the binarizer for the surviving labels through the public API.
# Passing `classes=` fixes the label set and its column order, and fit()
# initialises the estimator properly. Assigning `classes_` by hand on an
# unfitted instance bypasses fit(), so whatever internal state fit() sets up
# would be missing from the object that is later pickled.
mlb_filtered = MultiLabelBinarizer(classes=mlb.classes_[common_labels])
mlb_filtered.fit(data['drug_labels'])

# Display shape of filtered features and target
X.shape, y_filtered.shape


((161297, 884), (161297, 1846))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_filtered,
    test_size=0.2,
    random_state=42,
)


In [None]:
# One-vs-rest strategy: one independent MultinomialNB is trained per drug
# column, which is what lets a Naive Bayes classifier emit multi-label output
model = OneVsRestClassifier(estimator=MultinomialNB())

# Train on the training split (kept as a bare expression so the fitted
# estimator is still shown as the cell output)
model.fit(X_train, y_train)


In [None]:
# Run the fitted model on the held-out features
y_pred = model.predict(X_test)

# Score the predictions. With multi-label indicator targets, accuracy_score
# is the exact-match (subset) accuracy; F1 is micro-averaged over all labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

# Report both metrics, one per line
print(
    f"Model Accuracy: {accuracy * 100:.2f}%",
    f"Model F1 Score: {f1:.2f}",
    sep="\n",
)


In [None]:
# Persist the trained classifier to disk
joblib.dump(
    value=model,
    filename='drug_recommendation_model_naive_bayes_filtered.pkl',
)

# Persist the binarizer that maps the model's output columns back to drug names
joblib.dump(
    value=mlb_filtered,
    filename='multi_label_binarizer_filtered.pkl',
)

print("Naive Bayes model and filtered label binarizer saved successfully.")


In [None]:
# Restore the persisted classifier and the filtered label binarizer.
# NOTE: joblib files are pickles - only load files you produced yourself or
# otherwise trust.
# NOTE: `mlb` is rebound here to the *filtered* binarizer, replacing the
# full one created during preprocessing.
model = joblib.load(filename='drug_recommendation_model_naive_bayes_filtered.pkl')
mlb = joblib.load(filename='multi_label_binarizer_filtered.pkl')

def recommend_drugs(condition):
    """Return the drugs the trained model predicts for a medical condition.

    Relies on the notebook globals ``model`` (fitted classifier) and ``mlb``
    (label binarizer matching the model's output columns). The feature
    layout is taken from ``model.feature_names_in_`` when the model exposes
    it; otherwise the global training frame ``X`` is used.

    Args:
        condition: Condition name as text. It is lower-cased and stripped
            before being matched against the feature columns.

    Returns:
        A tuple of drug names when the model predicts at least one drug,
        otherwise the one-element list
        ``["No specific drug recommendation available"]``.
    """
    condition = condition.lower().strip()

    # Prefer the feature names stored on the fitted model so the function
    # also works when only the pickled artifacts have been loaded.
    feature_columns = getattr(model, 'feature_names_in_', None)
    if feature_columns is None:
        feature_columns = X.columns
    feature_columns = pd.Index(feature_columns)

    # One-hot row in the training column order; a condition that was never
    # seen during training leaves the row all zeros.
    input_vector = pd.DataFrame(0, index=[0], columns=feature_columns)
    if condition in feature_columns:
        input_vector.loc[0, condition] = 1

    # Predict and transform labels back to drug names
    predicted = model.predict(input_vector)
    drugs = mlb.inverse_transform(predicted)

    # inverse_transform returns one tuple per input row, so `drugs` itself is
    # never empty; the tuple for our single row is what has to be checked.
    if drugs and drugs[0]:
        return drugs[0]
    return ["No specific drug recommendation available"]

# Smoke-test the recommendation function with a sample condition; the bare
# call is the last expression of the cell, so its return value is displayed
recommend_drugs("diabetes")
