Using the drug-review dataset found at https://www.kaggle.com/datasets/mohamedabdelwahabali/drugreview?resource=download

In [1]:
#pip install numpy pandas nltk cupy scikit-learn spacy transformers datasets
#python -m spacy download en_core_web_sm

In [2]:
import numpy as np
import pandas as pd


# Load only the two columns the notebook needs and subsample each split.
#
# `usecols` keeps whatever column order the CSV file has, so the frame is
# re-indexed with [COLUMNS] to guarantee that column 0 is the review text and
# column 1 is the rating, which is what the positional indexing further down
# relies on.
COLUMNS = ["review", "rating"]
SEED = 50
N_TRAIN, N_TEST, N_VAL = 4000, 500, 500

train_raw = pd.read_csv("archive/drug_review_train.csv", usecols=COLUMNS)[COLUMNS].to_numpy()
test_raw = pd.read_csv("archive/drug_review_test.csv", usecols=COLUMNS)[COLUMNS].to_numpy()
val_raw = pd.read_csv("archive/drug_review_validation.csv", usecols=COLUMNS)[COLUMNS].to_numpy()

# Seed the global NumPy generator once, then shuffle the rows of each split
# in place (train, test, validation — the order matters for reproducibility).
np.random.seed(SEED)

np.random.shuffle(train_raw)
np.random.shuffle(test_raw)
np.random.shuffle(val_raw)

# Keep only the first rows of each shuffled split to keep run time manageable.
train_raw = train_raw[:N_TRAIN]
test_raw = test_raw[:N_TEST]
val_raw = val_raw[:N_VAL]

print(train_raw.shape)
print(test_raw.shape)
print(val_raw.shape)

(4000, 2)
(500, 2)
(500, 2)


In [3]:
def seperate_ratings_and_text(data):
    """Split a two-column array into its text column and its ratings column.

    Args:
        data: 2-D array whose column 0 holds the text and column 1 the ratings.

    Returns:
        A ``(text, ratings)`` tuple made of ``data[:, 0]`` and ``data[:, 1]``.
        For a NumPy array these slices share memory with ``data``.
    """
    return data[:, 0], data[:, 1]


# Split every subsampled set into its review texts and its ratings.
# The results are column slices of train_raw / test_raw / val_raw, so writing
# into them also writes into those arrays.
train_raw_text,train_raw_ratings = seperate_ratings_and_text(train_raw)
test_raw_text,test_raw_ratings = seperate_ratings_and_text(test_raw)
val_raw_text,val_raw_ratings = seperate_ratings_and_text(val_raw)


def convert_rating_to_sentiment(rating_list):
    """Replace each rating with a sentiment label, overwriting the input.

    Every entry is truncated with ``int()`` and then replaced by
    ``1`` (rating >= 7), ``-1`` (rating <= 4) or ``0`` (anything in between).

    Args:
        rating_list: mutable, index-assignable sequence of numeric ratings.

    Returns:
        The same ``rating_list`` object, now holding sentiment labels.
    """
    for idx, raw in enumerate(rating_list):
        score = int(raw)
        rating_list[idx] = 1 if score >= 7 else (-1 if score <= 4 else 0)
    return rating_list

# convert_rating_to_sentiment overwrites its argument, and the *_raw_ratings
# arrays are column slices of train_raw / test_raw / val_raw.  Converting a
# copy keeps the original 1-10 ratings intact, so re-running this cell gives
# the same labels instead of re-mapping already-converted values (1, 0 and -1
# are all <= 4 and would every one become -1).
train_raw_ratings = convert_rating_to_sentiment(train_raw_ratings.copy())
test_raw_ratings = convert_rating_to_sentiment(test_raw_ratings.copy())
val_raw_ratings = convert_rating_to_sentiment(val_raw_ratings.copy())

# Sanity check: first review and the first 100 sentiment labels.
print(train_raw_text[0])
print(train_raw_ratings[0:100])

"reporting in. after 5 weeks of stopping the anastrazole, my joint pain was pretty much gone and i was ecstatic! due to serious family illness, i had to postpone my follow up visit with my oncologist. fearing that i had been off the med long enough, i started taking it again. that was a week ago. the joint pain came back today with a vengeance  - aching knees, aching back, shortness of breath, barely able to walk. i cannot take this med anymore and it worries me as the alternatives (tamoxifen) are not as effective. see my doc in five days. will post again. my heart goes out to all the women posting on this forum."
[0 1 1 -1 -1 1 1 0 1 -1 1 -1 1 1 1 1 -1 0 1 -1 -1 1 0 -1 0 -1 1 -1 -1 0 1
 1 0 1 1 -1 -1 1 1 1 0 1 1 1 0 -1 1 1 1 1 1 1 1 1 1 1 0 -1 1 1 1 -1 1 1 1
 1 -1 1 -1 1 1 1 0 -1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 -1 0 -1 -1 -1 1 -1 1 1 1
 1 -1]


In [3]:
import re

def clean_review(review):
    """Normalise a single review string.

    Removes every character that is not an ASCII letter, a digit or
    whitespace, collapses each run of whitespace into one space, and
    lower-cases the result.  Leading/trailing spaces are not stripped.

    Args:
        review: the raw review text.

    Returns:
        The cleaned, lower-cased string.
    """
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", review)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.lower()

def clean_text(text_list):
    """Clean every review in ``text_list`` with ``clean_review``.

    Args:
        text_list: iterable of raw review strings.

    Returns:
        A NumPy array holding the cleaned strings in the same order.
    """
    return np.array(list(map(clean_review, text_list)))



# Clean the review texts of all three splits (train, test, validation).
train_cleaned, test_cleaned, val_cleaned = map(
    clean_text, (train_raw_text, test_raw_text, val_raw_text)
)

print(train_cleaned[0])

reporting in after 5 weeks of stopping the anastrazole my joint pain was pretty much gone and i was ecstatic due to serious family illness i had to postpone my follow up visit with my oncologist fearing that i had been off the med long enough i started taking it again that was a week ago the joint pain came back today with a vengeance aching knees aching back shortness of breath barely able to walk i cannot take this med anymore and it worries me as the alternatives tamoxifen are not as effective see my doc in five days will post again my heart goes out to all the women posting on this forum


In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

# Shared NLP resources used by the lemmatisation helpers below.
# spaCy pipeline: supplies the tokens and their part-of-speech tags.
nlp = spacy.load("en_core_web_sm")
# WordNet lemmatizer: does the actual lemmatisation, given a PoS code.
lemmatizer = WordNetLemmatizer()
# English stopwords as a set for fast membership tests.
# NOTE(review): presumably the NLTK "stopwords" and "wordnet" corpora must
# already be downloaded for this cell to run — confirm for fresh environments.
stopWords = set(stopwords.words("english"))

# Maps the PoS tag read from `token.pos_` to the single-letter PoS code passed
# to `lemmatizer.lemmatize`.  Tags missing from this table fall back to "n" in
# lemmatize_word.
pos_mapping = {
    "NOUN": "n",
    "PROPN": "n",
    "VERB": "v",
    "AUX": "v",
    "ADJ": "a",
    "ADV": "r",
}

# Cache of already-lemmatized words, keyed by (word, PoS tag), so each
# distinct pair is lemmatized only once.
lemmatized_cache = {}

def lemmatize_word(word, pos):
    """Lemmatize ``word`` for the given PoS tag, memoising the result.

    Args:
        word: the token text to lemmatize.
        pos: PoS tag; translated through ``pos_mapping``, with unknown tags
            treated as nouns ("n").

    Returns:
        The lemma produced by ``lemmatizer.lemmatize``.
    """
    cache_key = (word, pos)
    try:
        return lemmatized_cache[cache_key]
    except KeyError:
        pass

    # First time this (word, PoS) pair is seen: compute and remember it.
    wordnet_pos = pos_mapping.get(pos, "n")
    lemma = lemmatized_cache[cache_key] = lemmatizer.lemmatize(word, wordnet_pos)
    return lemma

def _lemmas_from_doc(doc):
    """Return the lemmas of every non-stopword token of a spaCy Doc."""
    lemmas = []
    for token in doc:
        lowered = token.text.lower()
        if lowered not in stopWords:
            lemmas.append(lemmatize_word(lowered, token.pos_))
    return lemmas


def better_lemmatizer(single_sentence):
    """Clean, PoS-tag and lemmatize one review.

    Args:
        single_sentence: the review text (cleaned again with ``clean_review``).

    Returns:
        List of lemmas for the tokens that are not stopwords.
    """
    return _lemmas_from_doc(nlp(clean_review(single_sentence)))


def lemmatize(data):
    """Lemmatize every review in ``data``.

    The reviews are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of one ``nlp()`` call per review, which is the slow part
    of this notebook.  ``nlp.pipe`` yields the docs in input order.

    Args:
        data: sequence of review strings.

    Returns:
        A list with one list of lemmas per review, in the same order.
    """
    cleaned = (clean_review(text) for text in data)
    return [_lemmas_from_doc(doc) for doc in nlp.pipe(cleaned)]


# Lemmatize all three splits.  The inputs are already cleaned; better_lemmatizer
# runs clean_review on them once more.
# This is the slow step: the recorded run of this cell ended in a
# KeyboardInterrupt.
train_lem = lemmatize(train_cleaned)
test_lem = lemmatize(test_cleaned)
val_lem = lemmatize(val_cleaned)




KeyboardInterrupt: 

In [6]:
# Re-join each review's lemma list into one space-separated string, the input
# format expected by the vectorizer.
train_lem_str = list(map(" ".join, train_lem))
test_lem_str = list(map(" ".join, test_lem))
val_lem_str = list(map(" ".join, val_lem))

# Show the first two documents and the size of the train and test sets.
for lem_strs in (train_lem_str, test_lem_str):
    print(lem_strs[0:2])
    print(len(lem_strs))

['report 5 week stop anastrazole joint pain pretty much go ecstatic due serious family illness postpone follow visit oncologist fearing med long enough start take week ago joint pain come back today vengeance ache knee ache back shortness breath barely able walk take med anymore worry alternative tamoxifen effective see doc five day post heart go woman post forum', 'imagine awe use sleep even camp heat storm tent 20 maybe exhausted mountain trek 30 major issue except occasional job stress ex loud snore change 40 within 5 month mom die get divorce stab street robbery sleep sometimes almost impossible read site probably look answer love one prescribe many thing drs repeatedly tell traz best nonaddictive allow stage sleep cycle work great albeit funky dream nextday grogginess never leave many month quit absolutely withdrawal']
4000
['three week ago begin around 20 hot flash per day go night sweat 4 time per night 5 day ago doctor put pristiq father recently pass away several personal thin

In [7]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# TF-IDF features over unigrams, bigrams and trigrams, capped at 60000 features.
vectorizer = TfidfVectorizer(ngram_range=(1,3),max_features=60000)

# The vocabulary is fitted on the training split only and then re-used for the
# test and validation splits.
# Despite the `_count` suffix, these matrices hold TF-IDF weights, not counts.
train_count = vectorizer.fit_transform(train_lem_str)
test_count = vectorizer.transform(test_lem_str)

val_count = vectorizer.transform(val_lem_str)

In [8]:
# from nltk import bigrams
# from collections import Counter 

# def get_ngram_columns(processed_list,most_common_threshold = 1000):

#     #function that finds the ngrams of a BoW tokenised list, takes in the list, value of 
#     # n (bigrams/trigrams/4-grams...). returns the most common 1000 ngrams , and ngram_columns(which is redundant)

#     #dictionary of all tokens
#     ngram_counts = Counter()

#     #to find most common, find ngrams of entire corpus then update counts
#     for review_number in range(len(processed_list)):
#         sentence = processed_list[review_number]
#         ngrams = list(bigrams(sentence))
#         ngram_counts.update(ngrams)

#     #get the 1000 most common ones (decided by default variable most_common_threshold)
#     #these will be the columns in the TC matrix
#     most_common_list = list(ngram_counts.most_common(most_common_threshold))

#     return most_common_list

# get_ngram_columns(test_cleaned)

In [9]:
# Inspect the fitted vocabulary, the matrix shapes and the sparse TF-IDF row
# of the first training review.
print(vectorizer.get_feature_names_out())
print(train_count.shape)
print(test_count.shape)
print(train_count[0])

['002 mg' '002 mg next' '0025' ... 'zyprexa work' 'zyprexa work well'
 'zyrtec']
(4000, 60000)
(500, 60000)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 81 stored elements and shape (1, 60000)>
  Coords	Values
  (0, 50544)	0.10579050674660305
  (0, 57945)	0.07609240220141587
  (0, 53652)	0.048959601711478146
  (0, 41192)	0.18811509159758982
  (0, 47555)	0.0918217986838576
  (0, 49304)	0.06932999663071683
  (0, 45333)	0.05054767971570731
  (0, 38259)	0.07011211732995874
  (0, 34432)	0.13385947364282724
  (0, 34195)	0.06542240289377231
  (0, 51445)	0.08980352521653046
  (0, 35980)	0.08774028596511059
  (0, 40461)	0.10661638198189528
  (0, 48995)	0.13385947364282724
  (0, 37105)	0.08212426198550446
  (0, 57543)	0.09575166696628208
  (0, 47121)	0.13872909241374187
  (0, 43657)	0.1295341581436704
  (0, 42700)	0.05952658602208312
  (0, 34959)	0.07296746476751245
  (0, 53076)	0.03721261085439244
  (0, 54468)	0.05298190791851636
  (0, 1698)	0.05429404495100308
  (0, 19497)	0.0

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Baseline: multinomial naive Bayes on the TF-IDF features.
# The sentiment arrays are cast to int once and re-used below.
train_labels = train_raw_ratings.astype(int)
val_labels = val_raw_ratings.astype(int)

# NOTE(review): the name `model` is rebound to the BERT model in a later cell.
model = MultinomialNB()
model.fit(train_count, train_labels)

val_predictions = model.predict(val_count)

# These scores are computed on the validation split, so label them as such.
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_score(val_labels, val_predictions))
print("\nClassification Report:")
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(val_labels, val_predictions, zero_division=0))

Test Set Evaluation:
Accuracy: 0.69

Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       116
           0       0.00      0.00      0.00        39
           1       0.69      1.00      0.82       345

    accuracy                           0.69       500
   macro avg       0.23      0.33      0.27       500
weighted avg       0.48      0.69      0.56       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 # Now I use BERT to classify the reviews. This differs from the previous approach in that it is not feature-based, so there is no need to extract PoS tags, unigrams, bigrams, etc.

In [5]:
import os
# Print the CUDA_VISIBLE_DEVICES environment variable (None when it is unset).
print(os.environ.get("CUDA_VISIBLE_DEVICES"))

None


In [6]:
import torch
# Environment check: PyTorch version, whether CUDA is available, and the CUDA
# version reported by torch.version.cuda.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Version: {torch.version.cuda}")

PyTorch Version: 2.7.0.dev20250117+cu126
CUDA Available: True
CUDA Version: 12.6


In [4]:
import pandas as pd

def _build_split_frame(texts, sentiments):
    """Return a DataFrame with a "text" column and a 0-based "labels" column.

    Args:
        texts: array-like of review strings (any shape; flattened).
        sentiments: array-like of sentiment labels in {-1, 0, 1}.

    Returns:
        DataFrame whose "labels" are the sentiments shifted by +1, i.e. the
        class indices 0, 1, 2 that a 3-label classifier expects.
    """
    return pd.DataFrame(
        {
            "text": np.asarray(texts, dtype=object).ravel(),
            "labels": np.asarray(sentiments).astype(int).ravel() + 1,
        }
    )


# The *_raw_text / *_raw_ratings arrays are only read here, never rebound, so
# this cell can be re-run without shifting the labels a second time and the
# earlier cells keep seeing the {-1, 0, 1} sentiments.
train_df = _build_split_frame(train_raw_text, train_raw_ratings)
test_df = _build_split_frame(test_raw_text, test_raw_ratings)
val_df = _build_split_frame(val_raw_text, val_raw_ratings)

# (n, 2) [text, label] arrays, kept for backward compatibility.
train_concat_raw = train_df.to_numpy()
test_concat_raw = test_df.to_numpy()
val_concat_raw = val_df.to_numpy()

# Column-oriented dicts: {"text": [...], "labels": [...]}.
train_dict = train_df.to_dict(orient="list")
test_dict = test_df.to_dict(orient="list")
validation_dict = val_df.to_dict(orient="list")


In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,  TrainingArguments, Trainer
import os
import gc
import torch
from datasets import Dataset

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 looks like a debugging aid (it is
# normally used to make CUDA errors surface at the failing call and slows
# training) — consider removing once the pipeline is stable.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Release unreferenced Python objects and cached GPU memory left over from
# earlier runs before loading the model.
gc.collect()

torch.cuda.empty_cache()

train_dataset = Dataset.from_dict(train_dict)
test_dataset = Dataset.from_dict(test_dict)
validation_dataset = Dataset.from_dict(validation_dict)


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Three output classes: the sentiment labels 0, 1 and 2.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 3)

# Use the GPU when one is available instead of assuming CUDA, so the cell
# does not crash on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: mapping with a 'text' entry holding the review strings.

    Returns:
        The tokenizer output, truncated and padded to the maximum length.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Tokenize each split in batches, then drop the columns that are not passed to
# the model: the raw text and the token_type_ids.
dropped_columns = [ "text", "token_type_ids"]

train_dataset = train_dataset.map(preprocess_function, batched=True).remove_columns(dropped_columns)
test_dataset = test_dataset.map(preprocess_function, batched=True).remove_columns(dropped_columns)
validation_dataset = validation_dataset.map(preprocess_function, batched=True).remove_columns(dropped_columns)

print(train_dataset.column_names)

# Fine-tuning configuration: evaluate on the validation set after every epoch,
# train for 3 epochs with a learning rate of 7e-6 and a fixed seed.
# No batch size is set here; the recorded run printed batches of 8.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    num_train_epochs=3,
    learning_rate=7e-6,
    use_cpu=False,
    seed = 50)

# NOTE(review): no compute_metrics function is passed, and the recorded run
# shows only training/validation loss per epoch — consider adding
# accuracy / F1 if they should be tracked during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,


)
# Sanity check: print the tensor shapes of the first training batch only.
for batch in trainer.get_train_dataloader():
    print(f"Batch input_ids shape: {batch['input_ids'].shape}")
    print(f"Batch attention_mask shape: {batch['attention_mask'].shape}")
    print(f"Batch labels shape: {batch['labels'].shape}")
    break
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 4000/4000 [00:00<00:00, 6484.22 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6195.26 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6657.92 examples/s]


['labels', 'input_ids', 'attention_mask']
Batch input_ids shape: torch.Size([8, 512])
Batch attention_mask shape: torch.Size([8, 512])
Batch labels shape: torch.Size([8])


Epoch,Training Loss,Validation Loss
1,0.6898,0.517033
2,0.4849,0.512465
3,0.4022,0.531707


TrainOutput(global_step=1500, training_loss=0.5256244506835938, metrics={'train_runtime': 619.0946, 'train_samples_per_second': 19.383, 'train_steps_per_second': 2.423, 'total_flos': 3157361012736000.0, 'train_loss': 0.5256244506835938, 'epoch': 3.0})

In [1]:
# Save the tokenizer and the fine-tuned model into the same directory.
# This cell needs `tokenizer` and `trainer` from the training cell: the
# recorded run was executed as In [1], before they existed, and raised
# NameError.
tokenizer.save_pretrained("./trained_model")
trainer.save_model("./trained_model")

NameError: name 'tokenizer' is not defined

In [7]:
from sklearn.metrics import classification_report

# Evaluate the fine-tuned model on the test split: take the highest-scoring
# class per review and compare it with the true label (classes 0, 1, 2).
predictions = trainer.predict(test_dataset)
logits, labels = predictions.predictions, predictions.label_ids
predicted_classes = np.argmax(logits, axis=-1)

report = classification_report(labels, predicted_classes)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.78      0.76       129
           1       1.00      0.03      0.07        29
           2       0.88      0.92      0.90       342

    accuracy                           0.84       500
   macro avg       0.87      0.58      0.57       500
weighted avg       0.85      0.84      0.81       500

