Using the drug-review dataset found at https://www.kaggle.com/datasets/mohamedabdelwahabali/drugreview?resource=download

In [14]:
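# One-time environment setup - run these in a terminal before starting the notebook:
#pip install numpy pandas nltk cupy scikit-learn spacy
#python -m spacy download en_core_web_sm
# The NLTK stop-word list and WordNet data used further down must also be available, e.g.:
#python -m nltk.downloader stopwords wordnet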

In [15]:
import numpy as np
import pandas as pd


# Read only the two columns the model needs from each split and convert to NumPy.
# NOTE(review): downstream code treats column 0 as the review text and column 1
# as the rating, i.e. it assumes `review` precedes `rating` in the CSV files.
train_raw = pd.read_csv("archive/drug_review_train.csv", usecols=["review", "rating"]).to_numpy()
test_raw = pd.read_csv("archive/drug_review_test.csv", usecols=["review", "rating"]).to_numpy()
val_raw = pd.read_csv("archive/drug_review_validation.csv", usecols=["review", "rating"]).to_numpy()

# Use roughly 10% of the dataset, maintaining the rough train/test/validation ratios.
train_raw = train_raw[:11000]
test_raw = test_raw[:4600]
# Slice the validation split itself. This used to slice `test_raw`, which made
# the "validation" set a copy of the first 2700 test rows and threw away the
# data read from drug_review_validation.csv.
val_raw = val_raw[:2700]

print(train_raw.shape)
print(test_raw.shape)
print(val_raw.shape)

(11000, 2)
(4600, 2)
(2700, 2)


In [16]:
def seperate_ratings_and_text(data):
    """Split a two-column array into its first and second columns.

    Parameters
    ----------
    data : 2-D array supporting ``data[:, k]`` indexing
        Column 0 is used as the review text, column 1 as the rating.

    Returns
    -------
    tuple
        ``(text, ratings)`` - column 0 followed by column 1.
    """
    return data[:, 0], data[:, 1]


# Pull the review-text column and the rating column out of every split.
train_raw_text, train_raw_ratings = seperate_ratings_and_text(train_raw)
test_raw_text, test_raw_ratings = seperate_ratings_and_text(test_raw)
val_raw_text, val_raw_ratings = seperate_ratings_and_text(val_raw)

# Sanity check: first training review, then its rating on the next line.
print(train_raw_text[0], train_raw_ratings[0], sep="\n")

"i have used restasis for about a year now and have seen almost no progress.  for most of my life i've had red and bothersome eyes. after trying various eye drops, my doctor recommended restasis.  he said it typically takes 3 to 6 months for it to really kick in but it never did kick in.  when i put the drops in it burns my eyes for the first 30 - 40 minutes.  i've talked with my doctor about this and he said it is normal but should go away after some time, but it hasn't. every year around spring time my eyes get terrible irritated  and this year has been the same (maybe even worse than other years) even though i've been using restasis for a year now. the only difference i notice was for the first couple weeks, but now i'm ready to move on."
2.0


In [34]:
import re

def clean_review(review):
    """Normalise a single review string.

    Steps, in order:
      1. drop every character that is not an ASCII letter, a digit or whitespace;
      2. collapse each run of whitespace into one space;
      3. lowercase the result.

    Leading/trailing whitespace is collapsed to a single space, not stripped.
    """
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", review)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.lower()

def clean_text(text_list):
    """Apply ``clean_review`` to every item of ``text_list``.

    Returns a NumPy array built from the cleaned strings, in input order.
    """
    return np.array(list(map(clean_review, text_list)))



# Clean the review text of each split (train, then test, then validation).
train_cleaned, test_cleaned, val_cleaned = (
    clean_text(split_text)
    for split_text in (train_raw_text, test_raw_text, val_raw_text)
)

# Show the first cleaned training review.
print(train_cleaned[0])

i have used restasis for about a year now and have seen almost no progress for most of my life ive had red and bothersome eyes after trying various eye drops my doctor recommended restasis he said it typically takes 3 to 6 months for it to really kick in but it never did kick in when i put the drops in it burns my eyes for the first 30 40 minutes ive talked with my doctor about this and he said it is normal but should go away after some time but it hasnt every year around spring time my eyes get terrible irritated and this year has been the same maybe even worse than other years even though ive been using restasis for a year now the only difference i notice was for the first couple weeks but now im ready to move on


In [53]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

# spaCy pipeline; used below to get a part-of-speech tag for every token.
nlp = spacy.load("en_core_web_sm")
# NLTK WordNet lemmatizer that performs the actual lemmatization.
lemmatizer = WordNetLemmatizer()
# NLTK's English stop-word list, held as a set for membership tests.
stopWords = set(stopwords.words("english"))

# spaCy PoS tag -> WordNet PoS letter passed to the lemmatizer.
# Tags missing from this table are treated as nouns by lemmatize_word.
pos_mapping = dict(
    NOUN="n",
    PROPN="n",
    VERB="v",
    AUX="v",
    ADJ="a",
    ADV="r",
)

# Memo of already-lemmatized words, keyed by (word, PoS tag).
lemmatized_cache = dict()

def lemmatize_word(word, pos):
    """Return the WordNet lemma of ``word`` for the spaCy PoS tag ``pos``.

    Results are memoised in the module-level ``lemmatized_cache`` under the
    key ``(word, pos)``. A tag that is absent from ``pos_mapping`` falls back
    to the noun category ("n").
    """
    cache_key = (word, pos)
    try:
        return lemmatized_cache[cache_key]
    except KeyError:
        pass

    # Cache miss: translate the tag, lemmatize, and remember the answer.
    wordnet_pos = pos_mapping.get(pos, "n")
    lemma = lemmatizer.lemmatize(word, wordnet_pos)
    lemmatized_cache[cache_key] = lemma
    return lemma

def better_lemmatizer(single_sentence):
    """Turn one review into a list of lemmas with stop words removed.

    The text is passed through ``clean_review`` and then through the spaCy
    pipeline ``nlp``. Every token whose lowercased text is not in
    ``stopWords`` is lemmatized by ``lemmatize_word`` using the token's
    ``pos_`` tag.

    Returns the lemmas as a list, in token order.
    """
    doc = nlp(clean_review(single_sentence))

    kept_lemmas = []
    for tok in doc:
        lowered = tok.text.lower()
        if lowered in stopWords:
            continue  # stop word: skip
        kept_lemmas.append(lemmatize_word(lowered, tok.pos_))

    return kept_lemmas

def lemmatize(data):
    """Run ``better_lemmatizer`` on every element of ``data``.

    Elements are read positionally as ``data[0] .. data[len(data) - 1]``.
    Returns a list with one lemma list per element, in the same order.
    """
    return [better_lemmatizer(data[idx]) for idx in range(len(data))]


# Example usage
# Lemmatize each cleaned split in turn: train, then test, then validation.
train_lem, test_lem, val_lem = (
    lemmatize(cleaned_split)
    for cleaned_split in (train_cleaned, test_cleaned, val_cleaned)
)




In [66]:
# Re-join every lemma list into one space-separated string per review,
# which is the input format the vectorizer is given later.
train_lem_str = list(map(" ".join, train_lem))
test_lem_str = list(map(" ".join, test_lem))
val_lem_str = list(map(" ".join, val_lem))

# First two documents and the document count, for train and then test.
print(
    train_lem_str[:2],
    len(train_lem_str),
    test_lem_str[:2],
    len(test_lem_str),
    sep="\n",
)

['use restasis year see almost progress life red bothersome eye try various eye drop doctor recommend restasis say typically take 3 6 month really kick never kick put drop burn eye first 30 40 minute talk doctor say normal go away time nt every year around spring time eye get terrible irritated year maybe even bad year even though use restasis year difference notice first couple week ready move', 'experience somewhat mixed use implanon nearly 14 month decide get remove bleed every day day would occasionally stain underwear sheet nt start way first month nt bleed epic two week period everything irregular basically new norm sadly decide get rid implanon endless bleeding mention bleeding usually pretty light bit spot quite period either endless bleeding pretty side effect free except minor acne nt get pregnant yeah butblood lot blood']
11000
['try antidepressant year citalopram fluoxetine amitriptyline none help depression insomnia anxiety doctor suggest change onto 45 mg mirtazapine medi

In [75]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# TF-IDF features over unigrams, bigrams and trigrams, capped at 10000 features.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
)

# Fit on the training documents only; test and validation reuse the fitted vectorizer.
train_count = vectorizer.fit_transform(train_lem_str)
test_count, val_count = (
    vectorizer.transform(docs) for docs in (test_lem_str, val_lem_str)
)

In [76]:
# Inspect the fitted vocabulary, the matrix shapes, and the first training row.
print(
    vectorizer.get_feature_names_out(),
    train_count.shape,
    test_count.shape,
    train_count[0],
    sep="\n",
)

['01' '025' '025 mg' ... 'zpack' 'zyban' 'zyprexa']
(11000, 10000)
(4600, 10000)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 69 stored elements and shape (1, 10000)>
  Coords	Values
  (0, 9207)	0.1042577771129171
  (0, 9856)	0.20895476155025433
  (0, 7417)	0.06812218393910141
  (0, 579)	0.06740122310013714
  (0, 6926)	0.13427023398369822
  (0, 4700)	0.059727594115233734
  (0, 7136)	0.10410217339224248
  (0, 1262)	0.1493161721931269
  (0, 2951)	0.3913643797244858
  (0, 9021)	0.05320613583386595
  (0, 9283)	0.1317811385664397
  (0, 2437)	0.19855999080920436
  (0, 2292)	0.10777820986934099
  (0, 7114)	0.06726768185103153
  (0, 7345)	0.12043457719642042
  (0, 9127)	0.13903016759772524
  (0, 8350)	0.031731561475911566
  (0, 5498)	0.04208822844772661
  (0, 7061)	0.060446339351779454
  (0, 4510)	0.20940658763481726
  (0, 5799)	0.060609605425929594
  (0, 6976)	0.0673057462079806
  (0, 1342)	0.09554670890043439
  (0, 3208)	0.09446890792220707
  (0, 225)	0.08042347294222574
  :

In [79]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Multinomial naive Bayes on the TF-IDF features. Ratings are cast to int so
# that each distinct rating value is one class label.
model = MultinomialNB().fit(train_count, train_raw_ratings.astype(int))

# Predict a rating for every test review.
test_predictions = model.predict(test_count)

# Overall accuracy followed by the per-class precision/recall/F1 table.
print("Test Set Evaluation:")
print(
    "Accuracy:",
    accuracy_score(test_raw_ratings.astype(int), test_predictions),
)
print("\nClassification Report:")
print(
    classification_report(test_raw_ratings.astype(int), test_predictions)
)

Test Set Evaluation:
Accuracy: 0.3393478260869565

Classification Report:
              precision    recall  f1-score   support

           1       0.57      0.22      0.32       617
           2       0.00      0.00      0.00       207
           3       0.00      0.00      0.00       205
           4       0.00      0.00      0.00       133
           5       0.00      0.00      0.00       228
           6       0.00      0.00      0.00       159
           7       0.00      0.00      0.00       262
           8       0.00      0.00      0.00       540
           9       0.26      0.03      0.05       834
          10       0.33      0.99      0.49      1415

    accuracy                           0.34      4600
   macro avg       0.12      0.12      0.09      4600
weighted avg       0.23      0.34      0.20      4600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
