This project compares the performance of theory-driven and data-driven methods for classification of reviews into three classes - positive, negative and neutral. In particular between Naive Bayes using lemmatisation, bigrams and trigrams AND fine tuned BERT model (encoder).

Using the drug-review dataset found at https://www.kaggle.com/datasets/mohamedabdelwahabali/drugreview?resource=download

In [28]:
#pip install numpy pandas nltk cupy scikit-learn spacy transformers datasets torch
#python -m spacy download en_core_web_sm

# Data Pre Processing

In [29]:
import numpy as np
import pandas as pd


train_raw = pd.read_csv("archive/drug_review_train.csv", usecols=["review", "rating"]).to_numpy()
test_raw = pd.read_csv("archive/drug_review_test.csv", usecols=["review", "rating"]).to_numpy()
val_raw = pd.read_csv("archive/drug_review_validation.csv", usecols=["review", "rating"]).to_numpy()

np.random.seed(50)

np.random.shuffle(train_raw)
np.random.shuffle(test_raw)
np.random.shuffle(val_raw)

train_raw = train_raw[:8000]       #Using a 80:10:10 split (a subset of the data)
test_raw = test_raw[:1000]  
val_raw = val_raw[:1000]  

print(train_raw.shape)
print(test_raw.shape)
print(val_raw.shape)

(8000, 2)
(1000, 2)
(1000, 2)


In [30]:
def seperate_ratings_and_text(data):
    text = data[:,0]
    ratings = data[:,1]
    return text,ratings


train_raw_text,train_raw_ratings = seperate_ratings_and_text(train_raw)
test_raw_text,test_raw_ratings = seperate_ratings_and_text(test_raw)
val_raw_text,val_raw_ratings = seperate_ratings_and_text(val_raw)


def convert_rating_to_sentiment(rating_list):           # 1 -> pos, 0 -> neut,-1 -> neg sentiment

    for i in range(len(rating_list)):
        rating = int(rating_list[i] )
        if rating >= 7:
            rating_list[i] = 1
        elif rating <= 4:
            rating_list[i] = -1
        else:
            rating_list[i] = 0

    return rating_list

convert_rating_to_sentiment(train_raw_ratings)
convert_rating_to_sentiment(test_raw_ratings)
convert_rating_to_sentiment(val_raw_ratings)

print(train_raw_text[0])
print(train_raw_ratings[0:100])

"reporting in. after 5 weeks of stopping the anastrazole, my joint pain was pretty much gone and i was ecstatic! due to serious family illness, i had to postpone my follow up visit with my oncologist. fearing that i had been off the med long enough, i started taking it again. that was a week ago. the joint pain came back today with a vengeance  - aching knees, aching back, shortness of breath, barely able to walk. i cannot take this med anymore and it worries me as the alternatives (tamoxifen) are not as effective. see my doc in five days. will post again. my heart goes out to all the women posting on this forum."
[0 1 1 -1 -1 1 1 0 1 -1 1 -1 1 1 1 1 -1 0 1 -1 -1 1 0 -1 0 -1 1 -1 -1 0 1
 1 0 1 1 -1 -1 1 1 1 0 1 1 1 0 -1 1 1 1 1 1 1 1 1 1 1 0 -1 1 1 1 -1 1 1 1
 1 -1 1 -1 1 1 1 0 -1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 -1 0 -1 -1 -1 1 -1 1 1 1
 1 -1]


In [31]:
import re

def clean_review(review):                                       #keeping only alphanumeric words
    review = re.sub(r"[^a-zA-Z0-9\s]", "", review)
    review = re.sub(r"\s+", " ", review) 
    review = review.lower()
    return review

def clean_text(text_list):
    cleaned_sentences = [clean_review(sentence) for sentence in text_list]
    return np.array(cleaned_sentences)



train_cleaned = clean_text(train_raw_text)
test_cleaned = clean_text(test_raw_text)
val_cleaned = clean_text(val_raw_text)

print(train_cleaned[0])

reporting in after 5 weeks of stopping the anastrazole my joint pain was pretty much gone and i was ecstatic due to serious family illness i had to postpone my follow up visit with my oncologist fearing that i had been off the med long enough i started taking it again that was a week ago the joint pain came back today with a vengeance aching knees aching back shortness of breath barely able to walk i cannot take this med anymore and it worries me as the alternatives tamoxifen are not as effective see my doc in five days will post again my heart goes out to all the women posting on this forum


The lemmatizer needs the PoS of the word to properly lemmatize

In [32]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()
stopWords = set(stopwords.words("english"))

#PoS mapping
pos_mapping = {
    "NOUN": "n",
    "PROPN": "n",
    "VERB": "v",
    "AUX": "v",
    "ADJ": "a",
    "ADV": "r",
}

#cache dictionary for lemmatized words with PoS
lemmatized_cache = {}

def lemmatize_word(word, pos):

    key = (word, pos)  #use word and PoS as the key
    if key in lemmatized_cache:
        return lemmatized_cache[key]

    #compute lemmatized form and cache it
    lemmatized_word = lemmatizer.lemmatize(word, pos_mapping.get(pos, "n"))
    lemmatized_cache[key] = lemmatized_word
    return lemmatized_word

def better_lemmatizer(single_sentence):

    #clean the sentence (assuming clean_review is defined)
    single_sentence = clean_review(single_sentence)

    #get PoS tags for the sentence
    text_PoS = nlp(single_sentence)

    #remove stopwords, lemmatize
    lemmatized_list = [
        lemmatize_word(token.text.lower(), token.pos_)
        for token in text_PoS
        if token.text.lower() not in stopWords
    ]

    return lemmatized_list

def lemmatize(data):

    res = []
    for i in range(len(data)):
        res.append(better_lemmatizer(data[i]))

    return res


# Example usage
train_lem = lemmatize(train_cleaned)  
test_lem = lemmatize(test_cleaned)   
val_lem = lemmatize(val_cleaned)    




In [33]:
train_lem_str = [" ".join(row) for row in train_lem]
test_lem_str = [" ".join(row) for row in test_lem]
val_lem_str = [" ".join(row) for row in val_lem]

print(train_lem_str[0:2])
print(len(train_lem_str))
print(test_lem_str[0:2])
print(len(test_lem_str))

['report 5 week stop anastrazole joint pain pretty much go ecstatic due serious family illness postpone follow visit oncologist fearing med long enough start take week ago joint pain come back today vengeance ache knee ache back shortness breath barely able walk take med anymore worry alternative tamoxifen effective see doc five day post heart go woman post forum', 'imagine awe use sleep even camp heat storm tent 20 maybe exhausted mountain trek 30 major issue except occasional job stress ex loud snore change 40 within 5 month mom die get divorce stab street robbery sleep sometimes almost impossible read site probably look answer love one prescribe many thing drs repeatedly tell traz best nonaddictive allow stage sleep cycle work great albeit funky dream nextday grogginess never leave many month quit absolutely withdrawal']
8000
['three week ago begin around 20 hot flash per day go night sweat 4 time per night 5 day ago doctor put pristiq father recently pass away several personal thin

Using skikit-learn, we make unigrams, bigrams and trigrams of the lemmatized text and pass these as features for Naive Bayes. Here noun phrases and other methods such as wikification could also have been used to boost features

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,3),max_features=60000)

train_count = vectorizer.fit_transform(train_lem_str)
test_count = vectorizer.transform(test_lem_str)

val_count = vectorizer.transform(val_lem_str)

In [35]:
print(vectorizer.get_feature_names_out())
print(train_count.shape)
print(test_count.shape)
print(train_count[0])

['0025' '005' '005 mg' ... 'zyprexa' 'zyprexa 75' 'zyrtec']
(8000, 60000)
(1000, 60000)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 82 stored elements and shape (1, 60000)>
  Coords	Values
  (0, 44867)	0.10140881982506662
  (0, 57957)	0.07334083521124661
  (0, 51319)	0.047481783295737705
  (0, 3872)	0.14584100101572434
  (0, 25320)	0.17532767623764836
  (0, 38600)	0.08738052674078056
  (0, 42224)	0.06733613946867814
  (0, 33877)	0.049159079163035115
  (0, 19293)	0.06758674857020708
  (0, 11790)	0.1319390090807676
  (0, 11593)	0.06253622579605236
  (0, 46773)	0.08985474692447291
  (0, 14625)	0.08266992103340727
  (0, 23748)	0.10649766934711857
  (0, 41578)	0.13745965835717122
  (0, 16962)	0.078754679761918
  (0, 57581)	0.09347094858679746
  (0, 37692)	0.13446822768017172
  (0, 30436)	0.12453116282612124
  (0, 28462)	0.058056630669080664
  (0, 12493)	0.0710087844988108
  (0, 50090)	0.03624585348647605
  (0, 53063)	0.05173597986660481
  (0, 2596)	0.052039190917262654
  (

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

model = MultinomialNB()
model.fit(train_count, train_raw_ratings.astype(int))

val_predictions = model.predict(val_count)

print("Validation Set Evaluation:")
print("Accuracy:", accuracy_score(val_raw_ratings.astype(int), val_predictions))
print("\nClassification Report:")
print(classification_report(val_raw_ratings.astype(int), val_predictions))

test_predictions = model.predict(test_count)

print("Test Set Evaluation:")
print("Accuracy:", accuracy_score(test_raw_ratings.astype(int), test_predictions))
print("\nClassification Report:")
print(classification_report(test_raw_ratings.astype(int), test_predictions))

Validation Set Evaluation:
Accuracy: 0.68

Classification Report:
              precision    recall  f1-score   support

          -1       1.00      0.00      0.01       232
           0       0.00      0.00      0.00        89
           1       0.68      1.00      0.81       679

    accuracy                           0.68      1000
   macro avg       0.56      0.33      0.27      1000
weighted avg       0.69      0.68      0.55      1000

Test Set Evaluation:
Accuracy: 0.681

Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       233
           0       0.00      0.00      0.00        86
           1       0.68      1.00      0.81       681

    accuracy                           0.68      1000
   macro avg       0.23      0.33      0.27      1000
weighted avg       0.46      0.68      0.55      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 Now we use BERT to classify the reviews. This is different to the previous approach as it is not feature based and hence there is no need to find any PoS/Unigrams/Bigrams etc. The model will be trained on the data and the weights will contain all the necessary information regarding the words. 


 Using CUDA for GPU acceleration

In [24]:
import os
print(os.environ.get("CUDA_VISIBLE_DEVICES"))

None


In [18]:
import torch
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Version: {torch.version.cuda}")

PyTorch Version: 2.7.0.dev20250117+cu126
CUDA Available: True
CUDA Version: 12.6


In [19]:
import pandas as pd

train_raw_text = np.array(train_raw_text).reshape(-1, 1)
train_raw_ratings = np.array(train_raw_ratings+1).reshape(-1, 1)        #incrementing the rating value by 1 for the model, has no effect on classification
                                                                        # 2 -> pos, 1 -> neut, 0 -> neg sentiment
test_raw_text = np.array(test_raw_text).reshape(-1, 1)
test_raw_ratings = np.array(test_raw_ratings+1).reshape(-1, 1)

val_raw_text = np.array(val_raw_text).reshape(-1, 1)
val_raw_ratings = np.array(val_raw_ratings+1).reshape(-1, 1)

train_concat_raw = np.hstack((train_raw_text, train_raw_ratings))
test_concat_raw = np.hstack((test_raw_text, test_raw_ratings))
val_concat_raw = np.hstack((val_raw_text, val_raw_ratings))

test_df = pd.DataFrame(test_concat_raw,columns=["text","labels"])
train_df = pd.DataFrame(train_concat_raw,columns=["text","labels"])
val_df = pd.DataFrame(val_concat_raw,columns=["text","labels"])

train_dict = train_df.to_dict(orient="list")
test_dict = test_df.to_dict(orient="list")
validation_dict = val_df.to_dict(orient="list")


In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,  TrainingArguments, Trainer
import os
import gc
import torch
from datasets import Dataset

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

gc.collect()

torch.cuda.empty_cache()

train_dataset = Dataset.from_dict(train_dict)
test_dataset = Dataset.from_dict(test_dict)
validation_dataset = Dataset.from_dict(validation_dict)


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 3)
model = model.to("cuda")

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding="max_length")

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)


train_dataset = train_dataset.remove_columns([ "text", "token_type_ids"])
test_dataset = test_dataset.remove_columns([ "text", "token_type_ids"])
validation_dataset = validation_dataset.remove_columns([ "text", "token_type_ids"])

print(train_dataset.column_names)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    num_train_epochs=5,
    learning_rate=1e-5,
    use_cpu=False,
    seed = 50)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,


)
for batch in trainer.get_train_dataloader():
    print(f"Batch input_ids shape: {batch['input_ids'].shape}")
    print(f"Batch attention_mask shape: {batch['attention_mask'].shape}")
    print(f"Batch labels shape: {batch['labels'].shape}")
    break
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 8000/8000 [00:01<00:00, 7018.74 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6520.04 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6853.90 examples/s]


['labels', 'input_ids', 'attention_mask']
Batch input_ids shape: torch.Size([8, 512])
Batch attention_mask shape: torch.Size([8, 512])
Batch labels shape: torch.Size([8])


Epoch,Training Loss,Validation Loss
1,0.5527,0.45269
2,0.4411,0.503328
3,0.3499,0.548935
4,0.2718,0.625534
5,0.2038,0.659299


TrainOutput(global_step=5000, training_loss=0.3721426010131836, metrics={'train_runtime': 1860.1805, 'train_samples_per_second': 21.503, 'train_steps_per_second': 2.688, 'total_flos': 1.052453670912e+16, 'train_loss': 0.3721426010131836, 'epoch': 5.0})

In [21]:
tokenizer.save_pretrained("./trained_model")
trainer.save_model("./trained_model")

In [25]:
from sklearn.metrics import classification_report

predictions = trainer.predict(validation_dataset)
logits = predictions.predictions
labels = predictions.label_ids
predicted_classes = np.argmax(logits, axis=-1)


print(classification_report(labels,predicted_classes))

              precision    recall  f1-score   support

           0       0.86      0.75      0.80       232
           1       0.33      0.35      0.34        89
           2       0.91      0.94      0.92       679

    accuracy                           0.84      1000
   macro avg       0.70      0.68      0.69      1000
weighted avg       0.84      0.84      0.84      1000



In [26]:


predictions = trainer.predict(test_dataset)
logits = predictions.predictions
labels = predictions.label_ids
predicted_classes = np.argmax(logits, axis=-1)


print(classification_report(labels,predicted_classes))

              precision    recall  f1-score   support

           0       0.76      0.68      0.71       233
           1       0.17      0.21      0.19        86
           2       0.89      0.90      0.90       681

    accuracy                           0.79      1000
   macro avg       0.61      0.60      0.60      1000
weighted avg       0.80      0.79      0.79      1000

