# Medicine Recommendation System

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

import re
# a list of most common words in English
stop_words = [
    'a', 'about', 'after', 'all', 'also', 'an', 'and', 'any', 'as', 'at',
    'be', 'because', 'but', 'by', 'can', 'come', 'could', 'day', 'do', 'does', 'did', 'done',
    'dont', 'even', 'find', 'first', 'for', 'from', 'get', 'give', 'go', 'have', 'has', 'had',
    'he', 'her', 'here', 'him', 'his', 'how', 'i', 'ive', 'im', 'if', 'in', 'into',
    'it', 'its', 'just', 'know', 'like', 'look', 'make', 'man', 'many',
    'me', 'more', 'my', 'new', 'no', 'not', 'now', 'of', 'on', 'one',
    'only', 'or', 'other', 'our', 'out', 'people', 'say', 'see', 'she',
    'so', 'some', 'take', 'tell', 'than', 'that', 'the', 'their', 'them',
    'then', 'there', 'these', 'they', 'thing', 'think', 'this', 'those',
    'time', 'to', 'two', 'up', 'use', 'very', 'want', 'was', 'way', 'we', 'well',
    'what', 'when', 'which', 'who', 'will', 'with', 'would', 'year', 'you',
    'your'
]

from sklearn.feature_extraction.text import CountVectorizer

## Load & Preprocess the Datasets

In [4]:
train_df = pd.read_csv("drugsComTrain_raw.csv")
print("Number of entries in the train data = %d"%(len(train_df)))
train_df.head()


Number of entries in the train data = 161297


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [5]:
test_df = pd.read_csv("drugsComTest_raw.csv")
print("Number of entries in the test data = %d"%(len(test_df)))
test_df.head()


Number of entries in the test data = 53766


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4


In [6]:
# unique set of conditions
conditions_train = set(train_df['condition'])
print("Number of conditions in the train data = %d"%(len(conditions_train)))

conditions_test = set(test_df['condition'])
print("Number of conditions in the test data = %d"%(len(conditions_test)))

# test conditions which are not in the train set
print("%d test conditions are not found in the train data" %len(conditions_test - conditions_train))


Number of conditions in the train data = 885
Number of conditions in the test data = 709
32 test conditions are not found in the train data


In [7]:
# unique set of drugs
drugs_train = set(train_df['drugName'])
print("Number of drugs in the train data = %d"%(len(drugs_train)))

drugs_test = set(test_df['drugName'])
print("Number of drugs in the test data = %d"%(len(drugs_test)))

# test drugs which are not in the train set
print("%d test drugs are not found in the train data" %len(drugs_test - drugs_train))


Number of drugs in the train data = 3436
Number of drugs in the test data = 2637
235 test drugs are not found in the train data


### Data Cleaning

The `drugName` column has sometimes multiple drugs mentioned in the same column (e.g. Acetaminophen / chlorpheniramine / dextromethorphan / pseudoephedrine). Split them on '/'. But it can introduce unwanted splitting (e.g. Cyclafem 7 / 7 / 7 is a single drug Cyclafem 7-7-7). Convert all drug names to lowercase to avoid any duplicates.

#### Clean Training Data

In [8]:
# map drugs to symptoms:
# Use two separate lists for that.
train_symptoms = []
train_drugs = []

for idx, row in train_df.iterrows():
    # convert review text to lower case
    review = row['review'].lower().replace("&#039;", "'") \
                                        .replace("&amp;", ' ') \
                                        .replace("&quot;", ' ')
    
    # remove punctuations & digits
    review = re.sub(r'[^A-Za-z ]+', '', review)
        
    # remove stopwords (common english words such as "the")
    filtered_tokens = [word for word in review.split() if word not in stop_words]
    review = ' '.join(filtered_tokens)
    
    # add the condition to the symptoms for better review
    condition = row['condition']
    if not pd.isna(condition) and "comment" not in condition:
        # convert to lowercase
        condition = condition.lower()
        # keep only alphabets
        condition = re.sub(r'[^A-Za-z ]+', '', condition)
    
        # add to the review
        review = condition + " " + review

    # process drug names
    drugName = row['drugName']
    # Replace "/" with "-" when surrounded by a number or a single alphabet
    drugName = re.sub(r'(\b\d|[A-Za-z])\s*\/\s*(\d|\b[A-Za-z])\b', r'\1-\2', drugName)
    
    # if multiple drugs in the same line separated by "/"
    for drug in drugName.split('/'):
        train_drugs.append(drug.strip().lower())
        train_symptoms.append(review)
    

In [9]:
len(train_drugs)


183239

In [10]:
train_symptoms[10000]


'hiv infection jan th diagnosed hiv cd vl didnt stribild til may nd taking months reached cd vl und medicaid without supplement plan pay everything pay copay'

In [11]:
# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the train symptoms to learn the vocabulary
vectorizer.fit(train_symptoms)

# Get the vocabulary
vocabulary = vectorizer.vocabulary_

print(len(vocabulary))


75160


#### Clean Test Data

In [12]:
# map drugs to symptoms:
# Use two separate lists for that.
test_symptoms = []
test_drugs = []

for idx, row in test_df.iterrows():
    # convert review text to lower case
    review = row['review'].lower().replace("&#039;", "'") \
                                        .replace("&amp;", ' ') \
                                        .replace("&quot;", ' ')
    
    # remove punctuations & digits
    review = re.sub(r'[^A-Za-z ]+', '', review)
        
    # remove stopwords (common english words such as "the")
    filtered_tokens = [word for word in review.split() if word not in stop_words]
    review = ' '.join(filtered_tokens)
    
    # add the condition to the symptoms for better review
    condition = row['condition']
    if not pd.isna(condition) and "comment" not in condition:
        # convert to lowercase
        condition = condition.lower()
        # keep only alphabets
        condition = re.sub(r'[^A-Za-z ]+', '', condition)
    
        # add to the review
        review = condition + " " + review

    # process drug names
    drugName = row['drugName']
    # Replace "/" with "-" when surrounded by a number or a single alphabet
    drugName = re.sub(r'(\b\d|[A-Za-z])\s*\/\s*(\d|\b[A-Za-z])\b', r'\1-\2', drugName)
    
    # if multiple drugs in the same line separated by "/"
    # if multiple drugs in the same line separated by "/"
    for drug in drugName.split('/'):
        test_drugs.append(drug.strip().lower())
        test_symptoms.append(review)
    

In [13]:
len(test_drugs)


61004

In [14]:
# re-calculate unique set of drugs
drugs_train = set(train_drugs)
print("Number of drugs in the train data = %d"%(len(drugs_train)))

drugs_test = set(test_drugs)
print("Number of drugs in the test data = %d"%(len(drugs_test)))

# test drugs which are not in the train set
print("%d test drugs are not found in the train data" %len(drugs_test - drugs_train))


Number of drugs in the train data = 3264
Number of drugs in the test data = 2520
216 test drugs are not found in the train data


## Data Preparation
Use Bag of Words approach using CountVectorizer to convert the symptoms text to a vector. 

Use one-hot encoding of the target (drugs)

In [15]:
from sklearn.preprocessing import OneHotEncoder

# Create an instance of OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the unique drugs to one-hot vectors
encoder.fit([[drug] for drug in drugs_train])


In [16]:
feature_names = [x.replace('x0_', '') for x in encoder.get_feature_names_out()]
len(feature_names)


3264

In [17]:
# Save the feature_names list
joblib.dump(feature_names, 'feature_names.pkl')


['feature_names.pkl']

In [18]:
# encode training data labels
train_drugs_one_hot = encoder.transform([[drug] for drug in train_drugs])
train_drugs_one_hot.shape


(183239, 3264)

In [19]:
# vectorize the training data & test data
train_data = vectorizer.transform(train_symptoms)
test_data = vectorizer.transform(test_symptoms)


In [20]:
train_data.shape


(183239, 75160)

## Build PyTorch Model
We use a feed forward neural network having two hidden layers with 100 neurons each.

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from torchsummary import summary


In [22]:
# Define the dimensions
input_dim = train_data.shape[1] # 75160
hidden_dim = 100
output_dim = train_drugs_one_hot.shape[1] # 3264

# Define the neural network architecture
class FeedForwardNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedForwardNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout1 = nn.Dropout(0.25)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout2 = nn.Dropout(0.25)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.dropout1(torch.relu(self.fc1(x)))
        x = self.dropout2(torch.relu(self.fc2(x)))
        x = self.fc3(x)
        # x = self.softmax(x)
        return x

# Create the model instance
model = FeedForwardNN(input_dim, hidden_dim, output_dim)
summary(model, (input_dim,))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 100]       7,516,100
           Dropout-2                  [-1, 100]               0
            Linear-3                  [-1, 100]          10,100
           Dropout-4                  [-1, 100]               0
            Linear-5                 [-1, 3264]         329,664
Total params: 7,855,864
Trainable params: 7,855,864
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.29
Forward/backward pass size (MB): 0.03
Params size (MB): 29.97
Estimated Total Size (MB): 30.28
----------------------------------------------------------------


## Train the Model

In [23]:
# Custom dataset to convert data batch by batch
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        return train_drugs_one_hot.shape[0]

    def __getitem__(self, index):
        # Convert the sparse matrix batch to dense tensor
        data_batch = self.data[index].toarray()  
        target_batch = self.targets[index].toarray()
        return torch.FloatTensor(data_batch), target_batch


In [24]:
# Set the batch size
batch_size = 128

# Create a custom dataset for training
train_dataset = CustomDataset(train_data, train_drugs_one_hot)

# Create a DataLoader for batch training
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Create the model instance
model = FeedForwardNN(input_dim, hidden_dim, output_dim)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters())
# criterion = nn.CrossEntropyLoss()
criterion = nn.BCEWithLogitsLoss()


In [25]:
# Run training for 10 epochs
num_epochs = 10
for epoch in range(num_epochs):
    # Training loop
    model.train()
    total_loss = 0.0
    for i, (inputs, targets) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print training loss occasionally
        if (i + 1) % 500 == 0:  # Print every 500 batches
            avg_loss = total_loss / 500
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_dataloader)}], Average Loss: {avg_loss}")
            total_loss = 0.0
            

Epoch [1/10], Batch [500/1432], Average Loss: 0.03406975776716441
Epoch [1/10], Batch [1000/1432], Average Loss: 0.0026448839961140717
Epoch [2/10], Batch [500/1432], Average Loss: 0.002257630647933867
Epoch [2/10], Batch [1000/1432], Average Loss: 0.002119816903261292
Epoch [3/10], Batch [500/1432], Average Loss: 0.0018470931884344015
Epoch [3/10], Batch [1000/1432], Average Loss: 0.0017238615122746673
Epoch [4/10], Batch [500/1432], Average Loss: 0.0015513087568162572
Epoch [4/10], Batch [1000/1432], Average Loss: 0.0014839185376208392
Epoch [5/10], Batch [500/1432], Average Loss: 0.0013550939630695472
Epoch [5/10], Batch [1000/1432], Average Loss: 0.0013178650181678397
Epoch [6/10], Batch [500/1432], Average Loss: 0.001214836063962415
Epoch [6/10], Batch [1000/1432], Average Loss: 0.0011842351616439964
Epoch [7/10], Batch [500/1432], Average Loss: 0.0011019682705663233
Epoch [7/10], Batch [1000/1432], Average Loss: 0.0010874704196915436
Epoch [8/10], Batch [500/1432], Average Loss: 

In [26]:
# save the trained model
torch.save(model.state_dict(), 'model.pt')


## Make Predictions

In [27]:
def make_prediction(model, test_sample, n=3):
    '''
    Return top-n predictions of the test_sample
    '''
    # model in evaluation mode
    model.eval()

    # convert test sample to torch tensor
    test_sample = test_sample.toarray()
    test_tensor = torch.FloatTensor(test_sample)
    
    # make prediction
    output = model(test_tensor)
    softmax = F.softmax(output, dim=1)[0]
    
    # Get the top n values and their indices
    top_values, top_indices = torch.topk(softmax, k=n)
    
    top_n_idx = top_indices[:n]
    
    result = []
    for idx in top_n_idx:
        result.append(feature_names[idx])

    return result


In [28]:
make_prediction(model, test_data[0])


['mirtazapine', 'citalopram', 'remeron']

## Check Accuracy
We perform a top-n accuracy check on the train & test datasets.

The model will make n (typically n=3) predictions. If the actual drug given matches any of this n drugs, we call that as a success. 

In [None]:
# check train accuracy
correct = 0
for sample, label in zip(train_data, train_drugs):
    # get the top-n outputs
    preds = make_prediction(model, sample, n=3)

    if label in preds:
        correct += 1

In [None]:
train_acc = correct/train_data.shape[0]
print("Train Accuracy = %.3f%%"%(train_acc*100))


In [None]:
# check test accuracy
correct = 0
for sample, label in zip(test_data, test_drugs):
    # get the top-n outputs
    preds = make_prediction(model, sample, n=3)

    if label in preds:
        correct += 1
        
test_acc = correct/test_data.shape[0]
print("Test Accuracy = %.3f%%"%(test_acc*100))


In [None]:
import joblib

# Save the CountVectorizer object
joblib.dump(vectorizer, 'predictions/vectorizer.pkl')



In [None]:
#!pip install streamlit
!pip install pickle


In [None]:
!streamlit run filename.py
