This notebook uses the drug-review dataset from Kaggle, available at https://www.kaggle.com/datasets/mohamedabdelwahabali/drugreview?resource=download

In [23]:
import numpy as np
import pandas as pd


# Columns used downstream. Their order matters because later cells index the
# resulting array positionally (column 0 = review text, column 1 = rating).
REVIEW_COLUMNS = ["review", "rating"]

# `usecols` only filters columns: the frame keeps the CSV file's own column order.
# Re-selecting with REVIEW_COLUMNS pins the order before converting to NumPy.
train_raw = pd.read_csv("archive/drug_review_train.csv", usecols=REVIEW_COLUMNS)[REVIEW_COLUMNS].to_numpy()
test_raw = pd.read_csv("archive/drug_review_test.csv", usecols=REVIEW_COLUMNS)[REVIEW_COLUMNS].to_numpy()
val_raw = pd.read_csv("archive/drug_review_validation.csv", usecols=REVIEW_COLUMNS)[REVIEW_COLUMNS].to_numpy()


# Sanity check: (number of reviews, 2) for each split.
print(train_raw.shape)
print(test_raw.shape)
print(val_raw.shape)

(110811, 2)
(46108, 2)
(27703, 2)


In [29]:
def seperate_ratings_and_text(data):
    """Split a two-column array into its text column and its ratings column.

    Args:
        data: 2-D array; column 0 is returned as the text and column 1 as
            the ratings.

    Returns:
        A ``(text, ratings)`` tuple of 1-D column slices of ``data``.
    """
    reviews, scores = data[:, 0], data[:, 1]
    return reviews, scores


# Separate every split (train, test, validation) into review text and ratings.
(
    (train_raw_text, train_raw_ratings),
    (test_raw_text, test_raw_ratings),
    (val_raw_text, val_raw_ratings),
) = (seperate_ratings_and_text(split) for split in (train_raw, test_raw, val_raw))

# Preview the first five training reviews and their ratings.
print(train_raw_text[:5])
print(train_raw_ratings[:5])

['"i have used restasis for about a year now and have seen almost no progress.  for most of my life i\'ve had red and bothersome eyes. after trying various eye drops, my doctor recommended restasis.  he said it typically takes 3 to 6 months for it to really kick in but it never did kick in.  when i put the drops in it burns my eyes for the first 30 - 40 minutes.  i\'ve talked with my doctor about this and he said it is normal but should go away after some time, but it hasn\'t. every year around spring time my eyes get terrible irritated  and this year has been the same (maybe even worse than other years) even though i\'ve been using restasis for a year now. the only difference i notice was for the first couple weeks, but now i\'m ready to move on."'
 '"my experience has been somewhat mixed. i have been using implanon now for nearly 14 months and have decided to get it removed because i bleed every day, all day. i would occasionally stain my underwear and my sheets. it didn\'t start out

In [38]:
import re

def clean_review(review):
    """Normalise one raw review string for tokenisation.

    Processing steps, in order:
      1. Delete every character that is not an ASCII letter, a digit or
         whitespace (punctuation is removed, not replaced by a space, so
         "i've" becomes "ive").
      2. Collapse each run of whitespace into a single space.
      3. Strip leading and trailing whitespace, so that splitting the result
         on spaces never produces empty tokens at either end.
      4. Lower-case the text.

    Args:
        review: The raw review text (a string).

    Returns:
        The cleaned, lower-cased review as a string.
    """
    review = re.sub(r"[^a-zA-Z0-9\s]", "", review)
    # Strip after collapsing: a review that starts or ends with whitespace
    # would otherwise keep one stray space at that end.
    review = re.sub(r"\s+", " ", review).strip()
    return review.lower()

def clean_text(text_list):
    """Clean every review in ``text_list`` with ``clean_review``.

    Args:
        text_list: Iterable of raw review strings.

    Returns:
        A 1-D NumPy array of dtype ``object`` holding the cleaned strings,
        in the same order as the input.
    """
    cleaned_reviews = [clean_review(review) for review in text_list]
    # dtype=object stores references to the Python strings. Without it NumPy
    # builds a fixed-width unicode array in which every element is padded to
    # the length of the longest review, which wastes a lot of memory when
    # review lengths vary widely.
    return np.array(cleaned_reviews, dtype=object)



# Run the same cleaning step over the review text of every split.
train_cleaned, test_cleaned, val_cleaned = (
    clean_text(reviews) for reviews in (train_raw_text, test_raw_text, val_raw_text)
)

# Preview the first five cleaned training reviews.
print(train_cleaned[:5])

['i have used restasis for about a year now and have seen almost no progress for most of my life ive had red and bothersome eyes after trying various eye drops my doctor recommended restasis he said it typically takes 3 to 6 months for it to really kick in but it never did kick in when i put the drops in it burns my eyes for the first 30 40 minutes ive talked with my doctor about this and he said it is normal but should go away after some time but it hasnt every year around spring time my eyes get terrible irritated and this year has been the same maybe even worse than other years even though ive been using restasis for a year now the only difference i notice was for the first couple weeks but now im ready to move on'
 'my experience has been somewhat mixed i have been using implanon now for nearly 14 months and have decided to get it removed because i bleed every day all day i would occasionally stain my underwear and my sheets it didnt start out that way for the first month i didnt b

In [49]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance; lemmatize_sentence reads this module-level name.
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    """Tokenise a cleaned review on whitespace and lemmatize every token.

    Args:
        sentence: A cleaned review string.

    Returns:
        A list with one lemmatized token per word of ``sentence``; an empty
        or all-whitespace sentence yields an empty list.
    """
    # split() without an argument splits on any run of whitespace and drops
    # empty strings, so leading, trailing or repeated spaces never turn into
    # '' tokens (split(" ") would emit them, and [''] for an empty sentence).
    # NOTE(review): lemmatize() is called without a part-of-speech, so the
    # lemmatizer's default applies; the notebook output shows "has" becoming
    # "ha". Consider POS-aware lemmatization if that matters downstream.
    return [lemmatizer.lemmatize(token) for token in sentence.split()]

def lemmatize(data):
    """Lemmatize every sentence in ``data`` with ``lemmatize_sentence``.

    Args:
        data: Iterable of sentences.

    Returns:
        A plain Python list with one token list per sentence. The token lists
        vary in length, so the result is not packed into a NumPy array.
    """
    return list(map(lemmatize_sentence, data))

# Lemmatize the cleaned reviews of each split: train, then test, then validation.
train_lem, test_lem, val_lem = map(lemmatize, (train_cleaned, test_cleaned, val_cleaned))


In [52]:
# Show the first two lemmatized training reviews and the total number of reviews.
print(train_lem[:2])
print(len(train_lem))

[['i', 'have', 'used', 'restasis', 'for', 'about', 'a', 'year', 'now', 'and', 'have', 'seen', 'almost', 'no', 'progress', 'for', 'most', 'of', 'my', 'life', 'ive', 'had', 'red', 'and', 'bothersome', 'eye', 'after', 'trying', 'various', 'eye', 'drop', 'my', 'doctor', 'recommended', 'restasis', 'he', 'said', 'it', 'typically', 'take', '3', 'to', '6', 'month', 'for', 'it', 'to', 'really', 'kick', 'in', 'but', 'it', 'never', 'did', 'kick', 'in', 'when', 'i', 'put', 'the', 'drop', 'in', 'it', 'burn', 'my', 'eye', 'for', 'the', 'first', '30', '40', 'minute', 'ive', 'talked', 'with', 'my', 'doctor', 'about', 'this', 'and', 'he', 'said', 'it', 'is', 'normal', 'but', 'should', 'go', 'away', 'after', 'some', 'time', 'but', 'it', 'hasnt', 'every', 'year', 'around', 'spring', 'time', 'my', 'eye', 'get', 'terrible', 'irritated', 'and', 'this', 'year', 'ha', 'been', 'the', 'same', 'maybe', 'even', 'worse', 'than', 'other', 'year', 'even', 'though', 'ive', 'been', 'using', 'restasis', 'for', 'a', 'ye