<a href="https://colab.research.google.com/github/vaibhavdangar09/Patient-s-Condition-Classification-Using-Drug-Reviews/blob/main/Patient_Condition_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Patient's Condition Classification Using Drug Reviews**

### **Project Overview:**


---


* Reviews are an important way to get an overview of an offering, whether it is a product or a service.

* Reviews also play a very important role in the healthcare domain, especially for drugs.

* By analyzing the reviews, we can get the understanding of the drug effectiveness and its side effects.

* In this project, however, we classify the patient's condition from the text of their review, so that a suitable drug can be recommended for that condition.



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,mean_squared_error
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# libraries for data preprocessing
from wordcloud import STOPWORDS
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import nltk
import string
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **Load Data Set**

In [None]:
# mount google drive (the google.colab module is only importable inside a Colab runtime)
from google.colab import drive
# makes the Drive contents available under /content/drive, where the CSV files are read from
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# load train and test data
# Single place to change if the CSV files live somewhere else
DATA_DIR = '/content/drive/MyDrive'

patient_review_train = pd.read_csv(f'{DATA_DIR}/drugsComTrain_raw.csv')
patient_review_test = pd.read_csv(f'{DATA_DIR}/drugsComTest_raw.csv')

### **Dataset First View**

In [None]:
# fixed seed so the same 10 rows are shown on every run
patient_review_train.sample(10, random_state=42)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
25423,192550,Drospirenone / ethinyl estradiol,Birth Control,"""I got on this birth control when it first cam...",9,14-Aug-13,25
151680,13114,Xolair,Urticaria,"""I&#039;ve had hives off and on since 2002. I ...",3,28-Jun-17,10
96699,102533,Aripiprazole,Depression,"""I was told by my doctor that I might gain wei...",8,25-Apr-16,36
54052,8132,Zolpidem,Insomnia,"""I&#039;ve been taking 10 mg. every night for...",10,23-Jul-17,9
155936,33956,Junel Fe 1 / 20,Polycystic Ovary Syndrome,"""I really liked this birth control. I only exp...",10,4-Jul-14,23
151662,34584,Tysabri,Multiple Sclerosis,"""I have just had my second infusion and from t...",10,5-Aug-13,85
73085,17785,Ethinyl estradiol / etonogestrel,Birth Control,"""10+ years on it. Can&#039;t feel it. Never fa...",10,23-May-15,18
3997,186089,Desvenlafaxine,Major Depressive Disorde,"""I suffer from GAD, what is depression, anxiet...",9,15-Jul-15,6
144723,138389,Phentermine,Weight Loss,"""I posted a thread on January 31, 2014. I weig...",10,7-Feb-14,41
53295,195554,Aviane,Birth Control,"""I tried taking Junel Fe for a month and it wa...",10,25-Jan-12,3


In [None]:
# fixed seed so the same 10 rows are shown on every run
patient_review_test.sample(10, random_state=42)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
49283,106169,Oxymetazoline,Nasal Congestion,"""I tried this product as a result of a bad day...",2,19-Aug-09,11
26269,65812,Propranolol,mance Anxiety,"""I am absolutely amazed at how well this medic...",10,12-Feb-16,18
16425,226526,Etonogestrel,Birth Control,"""Had this placed a two and a half years ago. P...",7,26-Jun-15,2
49630,11761,Trulicity,"Diabetes, Type 2","""I can agree that the fatigue is noticeable. M...",8,15-Jun-16,17
33551,5373,Cryselle,Birth Control,"""I&#039;m 18 and have been on Cryselle for aro...",8,24-Feb-17,6
23929,165142,Phentermine / topiramate,Weight Loss,"""I have only been using Qsymia for one week bu...",10,15-Aug-14,66
41269,123958,Skyla,Birth Control,"""Insertion process was extremely painful (brok...",3,5-Aug-16,5
51479,161869,Ciprofloxacin,Kidney Infections,"""I was put on Cipro for an acute kidney infect...",2,27-Aug-17,2
21314,155466,Metronidazole,Bacterial Infection,"""I am currently on this to treat BV, I&#039;m ...",8,11-May-15,10
41754,224058,Aspirin / carisoprodol / codeine,Muscle Spasm,"""I was being plagued by severe coughing attack...",9,25-Feb-09,23


## **Dataset Rows And Columns Count**

In [None]:
# Train dataset size as a (rows, columns) tuple
print(patient_review_train.shape)

(161297, 7)


* In the training data, 161297 observations and 7 features are available.

In [None]:
# Test dataset size as a (rows, columns) tuple
print(patient_review_test.shape)

(53766, 7)


* In the test data, 53766 observations and 7 features are available.

## **Dataset Information**

In [None]:
# train dataset information
# info() writes its report itself and returns None, so it is not wrapped in print()
patient_review_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uniqueID     161297 non-null  int64 
 1   drugName     161297 non-null  object
 2   condition    160398 non-null  object
 3   review       161297 non-null  object
 4   rating       161297 non-null  int64 
 5   date         161297 non-null  object
 6   usefulCount  161297 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 8.6+ MB
None


* 3 features have the int64 data type (`uniqueID`, `rating`, `usefulCount`).

* 4 features have the object data type (`drugName`, `condition`, `review`, `date`).

In [None]:
# test dataset information
# info() writes its report itself and returns None, so it is not wrapped in print()
patient_review_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53766 entries, 0 to 53765
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uniqueID     53766 non-null  int64 
 1   drugName     53766 non-null  object
 2   condition    53471 non-null  object
 3   review       53766 non-null  object
 4   rating       53766 non-null  int64 
 5   date         53766 non-null  object
 6   usefulCount  53766 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 2.9+ MB
None


## **Checking Duplicate Values**

In [None]:
# check duplicate values for training data
# duplicated() flags each row that repeats an earlier row across all columns;
# value_counts() then tallies how many rows are flagged True / False

patient_review_train.duplicated().value_counts()

False    161297
Name: count, dtype: int64

* No duplicate values are present in our data set.

In [None]:
# check duplicate values for test data
# duplicated() flags each row that repeats an earlier row across all columns;
# value_counts() then tallies how many rows are flagged True / False

patient_review_test.duplicated().value_counts()

False    53766
Name: count, dtype: int64

* No duplicate values are present in our test data set.

## **Checking Missing/Null Values**

In [None]:
# checking missing values on train dataset: number of null entries per column

print(patient_review_train.isnull().sum())

uniqueID         0
drugName         0
condition      899
review           0
rating           0
date             0
usefulCount      0
dtype: int64


* 899 missing values are present in our training data set.

In [None]:
# checking missing values on test dataset: number of null entries per column
print(patient_review_test.isnull().sum())

uniqueID         0
drugName         0
condition      295
review           0
rating           0
date             0
usefulCount      0
dtype: int64


* 295 missing values are present in our test data set.

In [None]:
# percentage of missing values per feature of a dataset
def missing_value_check(df):
  """Return the percentage of missing values for every column of `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      The frame to inspect. It is not modified.

  Returns
  -------
  pandas.DataFrame
      Indexed by column name, with the columns 'Feature_name' and
      'percentage' (0-100), sorted by 'percentage' in descending order.
  """
  # Use the frame that was passed in rather than a notebook-level global, so
  # the same helper works for the train set, the test set, or any other frame.
  per_miss_values = df.isnull().sum() * 100 / len(df)
  miss_values_df = pd.DataFrame({'Feature_name': df.columns,
                                 'percentage': per_miss_values})
  return miss_values_df.sort_values('percentage', ascending=False)

In [None]:
# percentage of missing values for each feature of the training data
missing_value_check(patient_review_train)

Unnamed: 0,Feature_name,percentage
condition,condition,0.557357
uniqueID,uniqueID,0.0
drugName,drugName,0.0
review,review,0.0
rating,rating,0.0
date,date,0.0
usefulCount,usefulCount,0.0


* Only about 0.56% of the `condition` values are missing. With so few missing values, imputation is a reasonable approach, so I will apply mode imputation.

## **Handling Missing Values**

In [None]:
# handling missing value in training data
# Fill only the `condition` column with its most frequent value. A frame-wide
# fillna() would write a condition name into any other column that happened to
# contain a missing value.
train_condition_mode = patient_review_train['condition'].mode().iloc[0]
patient_review_train = patient_review_train.fillna({'condition': train_condition_mode})

In [None]:
# handling missing value in test data
# Fill only the `condition` column with its most frequent value. A frame-wide
# fillna() would write a condition name into any other column that happened to
# contain a missing value.
test_condition_mode = patient_review_test['condition'].mode().iloc[0]
patient_review_test = patient_review_test.fillna({'condition': test_condition_mode})

In [None]:
# re-check missing values on train dataset after imputation (null entries per column)
print(patient_review_train.isnull().sum())

uniqueID       0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64


In [None]:
# re-check missing values on test dataset after imputation (null entries per column)
print(patient_review_test.isnull().sum())

uniqueID       0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64


* No missing values are found after imputation.

## **Data Cleaning**

In [None]:
# Conditions kept as the target classes
keep_conditions = ["Depression", "High Blood Pressure", "Diabetes, Type 2"]

# Keep only the rows for those conditions and remove the identifier and date
# columns. A non-in-place drop() returns a new DataFrame, so the result is not
# a slice of `patient_review_train`; the original in-place drop on the filtered
# slice is the pattern behind pandas' SettingWithCopyWarning.
review_train_df = (
    patient_review_train
    .loc[patient_review_train['condition'].isin(keep_conditions)]
    .drop(columns=['uniqueID', 'date'])
)

review_train_df.head()

Unnamed: 0,drugName,condition,review,rating,usefulCount
11,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,54
31,Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8,3
44,Venlafaxine,Depression,"""my gp started me on Venlafaxine yesterday to ...",4,3
50,Dulaglutide,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l...",10,24
67,Effexor XR,Depression,"""This medicine saved my life. I was at my wits...",10,166


In [None]:
# Conditions kept as the target classes
keep_conditions = ["Depression", "High Blood Pressure", "Diabetes, Type 2"]

# Keep only the rows for those conditions and remove the date column
# (`uniqueID` is kept in the test frame). A non-in-place drop() returns a new
# DataFrame, so the result is not a slice of `patient_review_test`; the original
# in-place drop on the filtered slice is the pattern behind pandas'
# SettingWithCopyWarning.
review_test_df = (
    patient_review_test
    .loc[patient_review_test['condition'].isin(keep_conditions)]
    .drop(columns=['date'])
)

review_test_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,22
35,101149,Actos,"Diabetes, Type 2","""Have been on Actos for almost a year, gained ...",4,42
38,141462,Escitalopram,Depression,"""I am a 22 year old female college student. I ...",9,32
67,201582,Zoloft,Depression,"""Zoloft did not help me at all. I was on it f...",1,51
73,131683,Effexor XR,Depression,"""Sadly only lasted 5 days on Effexor XR. The s...",1,18


## **Text Preprocessing**

In [None]:
# look at the review text as it is before any text preprocessing
review_train_df['review']

11        "I have taken anti-depressants for years, with...
31        "1 week on Zoloft for anxiety and mood swings....
44        "my gp started me on Venlafaxine yesterday to ...
50        "Hey Guys,  It&#039;s been 4 months since my l...
67        "This medicine saved my life. I was at my wits...
                                ...                        
161251    "It is fourth blood pressure pill for me. It f...
161258    "While on Bystolic my feet and arms were numb....
161278    "I just got diagnosed with type 2. My doctor p...
161286    "This is the third med I&#039;ve tried for anx...
161290    "I have only been on Tekturna for 9 days. The ...
Name: review, Length: 13944, dtype: object

### **Stop words**

In [None]:
# English stop words, stored as a set so each `word in stop` check is a hash
# lookup rather than a scan through a list
stop = set(stopwords.words('english'))

### **Lemmatization**

In [None]:
# create one WordNetLemmatizer instance to be reused for every review
lemmatizer = WordNetLemmatizer()

In [None]:
def text_pre_preprocessing(review):
  """Clean a single review and return it as one space-separated string.

  The review is passed through these stages, in order:
    1. parse with BeautifulSoup and keep only the text content,
    2. replace every character that is not an ASCII letter with a space,
    3. lower-case and split on whitespace,
    4. discard words contained in the notebook-level `stop` collection,
    5. lemmatize the remaining words with the notebook-level `lemmatizer`.
  """
  plain_text = BeautifulSoup(review, 'html.parser').get_text()
  alpha_only = re.sub('[^a-zA-Z]', ' ', plain_text)

  cleaned = []
  for token in alpha_only.lower().split():
    # stop words carry little signal for the classifier, so skip them
    if token in stop:
      continue
    cleaned.append(lemmatizer.lemmatize(token))

  return ' '.join(cleaned)

In [None]:
# apply pre-processing on train data
# assign() builds a new frame instead of writing a column into what may still be
# a filtered slice of the raw data; the function is passed directly, no lambda needed
review_train_df = review_train_df.assign(review=review_train_df['review'].apply(text_pre_preprocessing))

In [None]:
# apply pre-processing on test data
# assign() builds a new frame instead of writing a column into what may still be
# a filtered slice of the raw data; the function is passed directly, no lambda needed
review_test_df = review_test_df.assign(review=review_test_df['review'].apply(text_pre_preprocessing))

In [None]:
# convert target variable into numeric labels for training data
# Explicit lookup: the class <-> label pairing is visible in one place.
# Values not in the table (e.g. labels that are already numeric because the
# cell was run before) pass through unchanged, so re-running the cell no longer
# collapses every row into class 2.
condition_labels = {"Depression": 0, "High Blood Pressure": 1, "Diabetes, Type 2": 2}

review_train_df = review_train_df.assign(
    condition=review_train_df['condition'].map(lambda c: condition_labels.get(c, c))
)

In [None]:
# convert target variable into numeric labels for test data
# Explicit lookup: the class <-> label pairing is visible in one place.
# Values not in the table (e.g. labels that are already numeric because the
# cell was run before) pass through unchanged, so re-running the cell no longer
# collapses every row into class 2.
condition_labels = {"Depression": 0, "High Blood Pressure": 1, "Diabetes, Type 2": 2}

review_test_df = review_test_df.assign(
    condition=review_test_df['condition'].map(lambda c: condition_labels.get(c, c))
)

In [None]:
from gensim.models import Word2Vec
from keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_SEQUENCE_LENGTH = 100  # Choose an appropriate sequence length based on your data
EMBEDDING_DIM = 100        # Size of each Word2Vec vector / embedding row

# Step 1: Convert the preprocessed reviews into padded sequences of word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_train_df['review'])
sequences = tokenizer.texts_to_sequences(review_train_df['review'])
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Step 2: Train Word2Vec embeddings on the WORDS of each review.
# Training on `sequences` (lists of integer ids) would give Word2Vec a
# vocabulary of integers, so none of the string lookups in step 3 would ever
# match and the embedding matrix would stay all zeros.
# text_to_word_sequence is the splitter the Tokenizer itself uses, so both
# vocabularies contain the same words.
review_tokens = [text_to_word_sequence(review) for review in review_train_df['review']]
word2vec_model = Word2Vec(sentences=review_tokens, vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=4)

# Step 3: Map every tokenizer word id to its Word2Vec vector
word_index = tokenizer.word_index
num_words = len(word_index) + 1  # word ids start at 1; row 0 is left for the padding value
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

# Step 4: Build and compile the Bidirectional LSTM RNN model
num_classes = review_train_df['condition'].nunique()

input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)
x = embedding_layer(input_)
x = Bidirectional(LSTM(15, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
# One softmax unit per condition: this is a multi-class problem, which a single
# sigmoid unit cannot represent.
output = Dense(num_classes, activation="softmax")(x)

model = Model(input_, output)
model.compile(
  # targets are integer class ids (not one-hot vectors), hence the sparse variant
  loss='sparse_categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy']
)
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 100)          1212600   
                                                                 
 bidirectional (Bidirection  (None, 100, 30)           13920     
 al)                                                             
                                                                 
 global_max_pooling1d (Glob  (None, 30)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 1)                 31        
                                                                 
Total params: 1226551 (4.68 MB)
Trainable params: 13951 (54.5

In [None]:
# Show only the first few encoded reviews; printing every sequence floods the output
print(sequences[:3])

[[101, 228, 264, 6, 280, 629, 693, 69, 4, 346, 76, 48, 16, 124, 629, 60, 225, 604, 100, 173, 280, 1633, 88, 22, 4511, 5676, 108, 108, 173, 7, 8, 10, 9, 14, 85, 234, 17, 12, 114, 144, 51, 4, 3], [5, 70, 15, 73, 410, 16, 1, 82, 872, 54, 2, 32, 615, 5, 37, 23, 2217, 682, 8, 1608, 826, 9, 29, 22, 179, 116, 400, 1053], [685, 11, 754, 569, 42, 7, 131, 92, 8, 34, 226, 128, 237, 987, 56, 167, 184, 233, 4, 3, 30, 23, 22, 39, 183, 32, 75, 43, 777, 731, 226, 419, 1041, 56, 64, 1054, 23, 44, 9048, 9, 22, 82, 39, 183, 32, 26, 1691, 217, 248, 1041, 226, 9049, 94, 685, 76, 182, 34, 14, 203, 71, 686, 40, 1, 71, 1, 165, 40, 149, 22], [2330, 1609, 10, 46, 75, 473, 275, 123, 10, 94, 64, 24, 266, 531, 10, 185, 527, 592, 10, 336, 188, 630, 3208, 1583, 133, 615, 197, 4, 3, 112, 35, 57, 28, 89, 17, 11, 266, 8, 1919, 32, 4, 3, 28, 126, 89, 52, 61, 1372, 1433, 27, 185, 266, 5, 94, 258, 1232, 401, 640, 1076, 4512, 1355, 307, 2331], [21, 312, 18, 2560, 351, 228, 264, 694, 123, 19, 133, 87, 97, 1085, 13, 432, 565

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

# Break every lower-cased training review into word tokens
tokenized_reviews = []
for review in review_train_df['review']:
    tokenized_reviews.append(word_tokenize(review.lower()))

# Fit Word2Vec on those token lists (pre-trained vectors could be used instead)
word2vec_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)


def _average_review_vector(tokens):
    """Mean Word2Vec vector of the tokens known to the model; a zero vector if there are none."""
    known_vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if not known_vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)


# One fixed-length vector per review, stacked into a single numpy array
review_vectors = np.array([_average_review_vector(tokens) for tokens in tokenized_reviews])


In [None]:
from keras.layers import Input, Dense, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

# Hold out 25% of the review vectors for evaluation
X_train, X_test, y_train, y_test = train_test_split(review_vectors, review_train_df['condition'], test_size=0.25, random_state=42)

# The model below is declared with 3-D input (samples, steps, 1). Add the
# trailing axis explicitly instead of relying on Keras to reconcile the
# 2-D arrays with the declared input shape.
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

# Number of target classes (labels stay as integer ids; no one-hot encoding is applied)
num_classes = len(review_train_df['condition'].unique())

# Per-sample input shape: (vector length, 1)
input_shape = X_train.shape[1:]

# Define the architecture of the Bidirectional LSTM model
input_layer = Input(shape=input_shape)
x = Bidirectional(LSTM(128, return_sequences=True))(input_layer)
x = GlobalMaxPooling1D()(x)
output_layer = Dense(num_classes, activation='softmax')(x)  # Use softmax activation for multi-class classification

model = Model(inputs=input_layer, outputs=output_layer)

# Sparse categorical cross-entropy matches integer class ids as targets
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Stop once validation loss has not improved for 3 epochs and keep the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model (20% of the training split is used for validation)
model.fit(X_train, y_train, batch_size=100, epochs=30, validation_split=0.2,callbacks=[early_stopping])

# Evaluate the model on the held-out data
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100, 1)]          0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 100, 256)          133120    
 onal)                                                           
                                                                 
 global_max_pooling1d_1 (Gl  (None, 256)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_1 (Dense)             (None, 3)                 771       
                                                                 
Total params: 133891 (523.01 KB)
Trainable params: 133891 (523.01 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/30
Epoch 2/

In [None]:
# `plot_confusion_matrix` is no longer available in scikit-learn (see the
# ImportError this cell used to raise) and it only accepted scikit-learn
# estimators anyway; ConfusionMatrixDisplay plots a precomputed matrix instead.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Class names in the order of the numeric encoding applied to `condition`:
# 0 = Depression, 1 = High Blood Pressure, 2 = Diabetes, Type 2
class_names = ['Depression', 'High Blood Pressure', 'Diabetes, Type 2']

# The network outputs one probability per class; the prediction is the most likely class
y_pred = np.argmax(model.predict(X_test), axis=1)

cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names).plot(ax=ax)
ax.set_title('Confusion Matrix')
plt.show()



ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/usr/local/lib/python3.10/dist-packages/sklearn/metrics/__init__.py)