In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder



## Loading the Dataset

We load the drug review dataset from an Excel file using `pandas.read_excel()`.

In [24]:
main=pd.read_excel("drugsCom_raw.xlsx")

We filter the dataset to include only reviews related to the following conditions:
- Depression
- High Blood Pressure
- Diabetes, Type 2

In [25]:
df=main[main["condition"].isin(["Depression","High Blood Pressure","Diabetes, Type 2"])]

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
11,75612,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,2017-03-09,54
31,96233,Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8,2011-05-07,3
44,121333,Venlafaxine,Depression,"""my gp started me on Venlafaxine yesterday to ...",4,2016-04-27,3
50,156544,Dulaglutide,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l...",10,2017-10-24,24
67,131909,Effexor XR,Depression,"""This medicine saved my life. I was at my wits...",10,2013-06-20,166


In [27]:
df.groupby("drugName")["rating"].mean()

drugName
Abilify         6.591549
Acarbose        1.000000
Acebutolol      9.500000
ActoPlus Met    6.500000
Actos           4.863636
                  ...   
Zestoretic      8.333333
Zestril         6.000000
Ziac            5.700000
Zoloft          7.281139
Zyprexa         5.785714
Name: rating, Length: 334, dtype: float64

average rating for each condition

# Apply both stopword removal and special character cleaning 

In [28]:


# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Setup tools
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def clean_review_with_stop_and_stem(text):
    # Convert to string and lowercase
    text = str(text).lower()

    # Remove special characters and stopwords
    words = [word for word in text.split() if word not in stop]
    text_no_specials = re.sub(r'[^a-z0-9\s]', '', ' '.join(words))

    # Lemmatize and Stem each word
    processed_words = []
    for word in text_no_specials.split():
        # Lemmatize the word
        lemma = lemmatizer.lemmatize(word)
        # Stem the lemmatized word
        stemmed = stemmer.stem(lemma)
        processed_words.append(stemmed)

    return ' '.join(processed_words)

# Apply to DataFrame column
df['cleaned'] = df['review'].apply(clean_review_with_stop_and_stem)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91955\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91955\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\91955\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


join all reviews into a single text and convert to lowercase

In [29]:



all_reviews_text = ' '.join(df['cleaned']).lower()
words = re.findall(r'\b\w+\b', all_reviews_text)
words = [word for word in words if word not in stop]

word_counts = Counter(words)
top_common_words = word_counts.most_common(20)

top_words_df = pd.DataFrame(top_common_words, columns=['Word', 'Frequency'])


Top 20 Most Frequent Words (Overall)

Filtering Rows Containing the Number "39" in Cleaned Text

In [30]:
df[df['cleaned'].str.contains(r'\b39\b', regex=True, na=False)].head()


Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,cleaned
7006,181211,Prozac,Depression,"""I used prozac in the past for 3-9 months twic...",6,2017-03-24,31,i use prozac past 39 month twice side effect a...
11968,194244,Nebivolol,High Blood Pressure,"""I&#039;m 39 years old and I&#039;ve been expe...",8,2012-02-02,32,i039m 39 year old i039v experi high bp month 1...
12101,12117,Azilsartan medoxomil,High Blood Pressure,"""It is a bit pricey but has been very effectiv...",10,2013-06-07,24,it bit pricey effect keep blood pressur check ...
56518,86563,Invokana,"Diabetes, Type 2","""I am a 39 year old female who has had type tw...",10,2015-05-06,82,i 39 year old femal type two diabeti 15 year l...
71333,34355,Bystolic,High Blood Pressure,"""I&#039;m 39 years old and I&#039;ve been expe...",8,2012-02-02,32,i039m 39 year old i039v experi high bp month 1...


In [31]:
df[df['cleaned'].str.contains(r'\b39\b', regex=True, na=False)].shape


(16, 8)

After cleaning the text, some entries may have "I039m" or "39 years". We remove "I039m" because it's a mistake, but keep "39 years" because it’s useful.

In [32]:
df['cleaned'] = df['cleaned'].apply(lambda x: ' '.join([word for word in str(x).split() if not (('39' in word) and word != '39' and not word.isdigit())]))


After removing I039m

In [33]:
df[df['cleaned'].str.contains(r'\b39\b', regex=True, na=False)].head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,cleaned
7006,181211,Prozac,Depression,"""I used prozac in the past for 3-9 months twic...",6,2017-03-24,31,i use prozac past 39 month twice side effect a...
11968,194244,Nebivolol,High Blood Pressure,"""I&#039;m 39 years old and I&#039;ve been expe...",8,2012-02-02,32,39 year old experi high bp month 190140 effexo...
12101,12117,Azilsartan medoxomil,High Blood Pressure,"""It is a bit pricey but has been very effectiv...",10,2013-06-07,24,it bit pricey effect keep blood pressur check ...
56518,86563,Invokana,"Diabetes, Type 2","""I am a 39 year old female who has had type tw...",10,2015-05-06,82,i 39 year old femal type two diabeti 15 year l...
71333,34355,Bystolic,High Blood Pressure,"""I&#039;m 39 years old and I&#039;ve been expe...",8,2012-02-02,32,39 year old experi high bp month 190140 effexo...


In [34]:
reviews_text_combined = ' '.join(df['cleaned']).lower()
extracted_words = re.findall(r'\b\w+\b', reviews_text_combined)
filtered_words = [word for word in extracted_words if word not in stop]

word_frequency = Counter(filtered_words)
top_words = word_frequency.most_common(20)

top_words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])


In [35]:
# pip install vaderSentiment

Classifying Reviews as Positive or Negative:

In [36]:


analyzer = SentimentIntensityAnalyzer()


def classify_sentiment(review):
  
    sentiment_score = analyzer.polarity_scores(review)['compound']
    
    # Classify the review based on the score
    if sentiment_score >= 0.05:
        return 'Positive'
    elif sentiment_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'


df['sentiment'] = df['cleaned'].apply(classify_sentiment)


df[['cleaned', 'sentiment']].head()


Unnamed: 0,cleaned,sentiment
11,i taken antidepress year improv mostli moder s...,Negative
31,1 week zoloft anxieti mood swing take 50mg mor...,Positive
44,my gp start venlafaxin yesterday help depress ...,Positive
50,hey guy 4 month sinc last post want give month...,Positive
67,thi medicin save life wit end antidepress read...,Positive


In [37]:


X = df['cleaned']
y = df['condition']
le = LabelEncoder()
y_encoded = le.fit_transform(y)


X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)







In [38]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000))
])


param_grid = {
    # TF-IDF Hyperparameters
    'tfidf__max_features': [1000000],
    'tfidf__ngram_range': [ (1,1)],
    # Logistic Regression Hyperparameters
    'clf__C': [ 10],
}



grid_log= GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_log.fit(X_train, y_train)

y_pred_log= grid_log.predict(X_test)
y_train_pred_log=grid_log.predict(X_train)

print("Best Parameters:", grid_log.best_params_)
print("test Accuracy:", accuracy_score(y_test, y_pred_log))
print("training accuracy",accuracy_score(y_train,y_train_pred_log))

print("\nClassification Report:\n", classification_report(y_test, y_pred_log))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters: {'clf__C': 10, 'tfidf__max_features': 1000000, 'tfidf__ngram_range': (1, 1)}
test Accuracy: 0.969881678020796
training accuracy 0.9965038099506948

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1800
           1       0.98      0.94      0.96       501
           2       0.96      0.91      0.94       488

    accuracy                           0.97      2789
   macro avg       0.97      0.95      0.96      2789
weighted avg       0.97      0.97      0.97      2789



In [39]:
def predict_condition(review):
    cleaned_review = clean_review_with_stop_and_stem(review)
    predicted_label = grid_log.predict([cleaned_review])[0]  # raw text goes directly here
    predicted_condition = le.inverse_transform([predicted_label])[0]
    return predicted_condition


new_review = "I've been feeling very down lately and have lost interest in activities I used to enjoy."
predicted_condition = predict_condition(new_review)
print(f"Predicted condition for the review: {predicted_condition}")

Predicted condition for the review: Depression


In [40]:
df[df["Unnamed: 0"]==103458]

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,cleaned,sentiment
161290,103458,Tekturna,High Blood Pressure,"""I have only been on Tekturna for 9 days. The ...",7,2010-02-07,18,i tekturna 9 day effect immedi also calcium ch...,Negative


In [41]:
review=df[df["Unnamed: 0"]==103458].review
print(f"Predicted condition for the review: {predict_condition(review)}")

Predicted condition for the review: High Blood Pressure


In [42]:
import pickle

with open("grid_log.pkl", "wb") as f:
    pickle.dump(grid_log, f)

with open("le.pkl", "wb") as f:
    pickle.dump(le, f)

print("✅ Saved model.pkl and label_encoder.pkl")


✅ Saved model.pkl and label_encoder.pkl


In [43]:
import re
import dill
import pickle
import nltk

from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load model and label encoder
with open("grid_log.pkl", "rb") as f:
    grid_log = pickle.load(f)

with open("le.pkl", "rb") as f:
    le = pickle.load(f)

# Clean review function
def clean_review_with_stop_and_stem(text):
    stop_words = set(nltk_stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    text = str(text).lower()
    words = [word for word in text.split() if word not in stop_words]
    text_no_specials = re.sub(r'[^a-z0-9\s]', '', ' '.join(words))

    processed_words = []
    for word in text_no_specials.split():
        lemma = lemmatizer.lemmatize(word)
        stemmed = stemmer.stem(lemma)
        processed_words.append(stemmed)

    return ' '.join(processed_words)

# Prediction function
def predict_condition(review):
    cleaned_review = clean_review_with_stop_and_stem(review)
    predicted_label = grid_log.predict([cleaned_review])[0]
    predicted_condition = le.inverse_transform([predicted_label])[0]
    return predicted_condition

# Save functions to a file
functions = {
    "clean_review_with_stop_and_stem": clean_review_with_stop_and_stem,
    "predict_condition": predict_condition
}

with open("functions.pkl", "wb") as f:
    dill.dump(functions, f)

print("✅ Functions saved as functions.pkl")


✅ Functions saved as functions.pkl


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91955\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91955\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\91955\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
