## Data Loading

In [2]:
import pandas as pd     # for data preprocessing
from sklearn import metrics    # for calculating accuracy

# To show all the rows of pandas dataframe
pd.set_option("display.max_rows", None)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv("/content/drive/MyDrive/cl project files/cl colab dataset/drugsComTrain_raw.tsv", sep="\t") # read dataset

In [5]:
df_test = pd.read_csv("/content/drive/MyDrive/cl project files/cl colab dataset/drugsComTest_raw.tsv", sep="\t") # read dataset

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [7]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [8]:
df.head()["review"].iloc[0]

'"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"'

In [9]:
num_rows = df.shape[0]
print(num_rows)
print(df.shape)

num_rows = df_test.shape[0]
print(num_rows)
print(df_test.shape)

161297
(161297, 7)
53766
(53766, 7)


In [10]:
df.condition.value_counts()[:40]

Unnamed: 0_level_0,count
condition,Unnamed: 1_level_1
Birth Control,28788
Depression,9069
Pain,6145
Anxiety,5904
Acne,5588
Bipolar Disorde,4224
Insomnia,3673
Weight Loss,3609
Obesity,3568
ADHD,3383


In [11]:
medical_conditions = ['Birth Control',
 'Depression',
 'Pain',
 'Anxiety',
 'Acne',
 'Bipolar Disorde',
 'Insomnia',
 'Weight Loss',
 'Obesity',
 'ADHD',
 'Diabetes, Type 2',
 'Emergency Contraception',
 'High Blood Pressure',
 'Vaginal Yeast Infection',
 'Abnormal Uterine Bleeding',
 'Bowel Preparation',
 'ibromyalgia',
 'Smoking Cessation',
 'Migraine',
 'Anxiety and Stress',
'Major Depressive Disorde',
'Constipation',
'Panic Disorde',
'Chronic Pain',
'Migraine Prevention',
'Urinary Tract Infection',
'Muscle Spasm',
'Osteoarthritis',
'Generalized Anxiety Disorde',
'Erectile Dysfunction',
'Opiate Dependence',
'Irritable Bowel Syndrome',
'Rheumatoid Arthritis',
'Allergic Rhinitis',
'Bacterial Infection',
'Cough',
'Sinusitis',
'Nausea/Vomiting',
'GERD',
'Multiple Sclerosis']

In [12]:
df_train = df[df["condition"].isin(medical_conditions)]
# df_train = df[(df["condition"]=='Birth Control') | (df["condition"]=='Depression') | (df["condition"]=='Pain') | (df["condition"]=='High Blood Pressure')]

In [13]:
df_test_data = df_test[df_test["condition"].isin(medical_conditions)]

In [14]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1.0,"March 7, 2017",5


In [15]:
df_test_data.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4
6,215892,Copper,Birth Control,"""I&#039;ve had the copper coil for about 3 mon...",6.0,"June 6, 2016",1


In [16]:
df_train.shape

(116963, 7)

In [17]:
df_test_data.shape

(39117, 7)

In [18]:
Uncleaned_Dataset_test = df_test_data.drop(['Unnamed: 0','drugName', 'rating', 'date', 'usefulCount'], axis=1) # keep only medical condition and review column

In [19]:
Uncleaned_Dataset = df_train.drop(['Unnamed: 0','drugName', 'rating', 'date', 'usefulCount'], axis=1) # keep only medical condition and review column

In [20]:
Uncleaned_Dataset.head()

Unnamed: 0,condition,review
1,ADHD,"""My son is halfway through his fourth week of ..."
2,Birth Control,"""I used to take another oral contraceptive, wh..."
3,Birth Control,"""This is my first time using any form of birth..."
4,Opiate Dependence,"""Suboxone has completely turned my life around..."
6,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t..."


In [21]:
Uncleaned_Dataset_test.head()

Unnamed: 0,condition,review
0,Depression,"""I&#039;ve tried a few antidepressants over th..."
2,Urinary Tract Infection,"""Quick reduction of symptoms"""
3,Weight Loss,"""Contrave combines drugs that were used for al..."
4,Birth Control,"""I have been on this birth control for one cyc..."
6,Birth Control,"""I&#039;ve had the copper coil for about 3 mon..."


In [22]:
Uncleaned_Dataset["review"][2] # checking a review sample to get an idea about the word composition

'"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."'

In [23]:
Uncleaned_Dataset["review"][11]

'"I have taken anti-depressants for years, with some improvement but mostly moderate to severe side affects, which makes me go off them.\r\n\r\nI only take Cymbalta now mostly for pain.\r\n\r\nWhen I began Deplin, I noticed a major improvement overnight. More energy, better disposition, and no sinking to the low lows of major depression. I have been taking it for about 3 months now and feel like a normal person for the first time ever. Best thing, no side effects."'

In [24]:

for i, col in enumerate(Uncleaned_Dataset.columns): # removes ' " ' from all the reviews
  Uncleaned_Dataset.iloc[:,i]=Uncleaned_Dataset.iloc[:,i].str.replace('"','')

Removing stopwords

In [25]:
for i, col in enumerate(Uncleaned_Dataset_test.columns): # removes ' " ' from all the reviews
  Uncleaned_Dataset_test.iloc[:,i]=Uncleaned_Dataset_test.iloc[:,i].str.replace('"','')

In [26]:
import nltk
nltk.download('stopwords') # downloads dataset of stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
from nltk.corpus import stopwords
stop = stopwords.words("english") # collecting all the stop words in englsih language.

Lemmetization

In [28]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
porter = PorterStemmer() # stemmer object
lemmatizer = WordNetLemmatizer() # lemmetizer object


In [29]:
print(porter.stem("sportingly")) # example showing how stemming works

sportingli


In [30]:
nltk.download('wordnet')
print(lemmatizer.lemmatize("sportingly")) # example showing how lemmatization works

[nltk_data] Downloading package wordnet to /root/nltk_data...


sportingly


In [31]:
# import packages for further data cleaning
from bs4 import BeautifulSoup
import re

In [32]:
# function for applying on review column to get clean reviews
def review_to_words(raw_review):
  # 1 delete HTML
  review_text = BeautifulSoup(raw_review, "html.parser").get_text()
  # 2 Make a space
  letters_only = re.sub("[^a-zA-Z]", " ", review_text)
  # 2 Make a space
  words = letters_only.lower().split()
  # 4 Stopwords
  meaningful_words = [w for w in words if not (w in stop)]
  # 6 Lemmitization
  lemmitize_words = [lemmatizer.lemmatize(w) for w in meaningful_words]
  # 7 space join words
  return(' '.join(lemmitize_words))

In [33]:
Cleaned_Dataset = Uncleaned_Dataset.copy()  # Create a copy of the original dataset
Cleaned_Dataset["review_clean"] = Cleaned_Dataset["review"].apply(review_to_words)  # Apply the function on the copied dataset
Cleaned_Dataset = Cleaned_Dataset.drop(["review"], axis=1) # remove the review column

  review_text = BeautifulSoup(raw_review, "html.parser").get_text()


In [34]:
Cleaned_Dataset_test = Uncleaned_Dataset_test.copy()  # Create a copy of the original dataset
Cleaned_Dataset_test["review_clean"] = Cleaned_Dataset_test["review"].apply(review_to_words)  # Apply the function on the copied dataset
Cleaned_Dataset_test = Cleaned_Dataset_test.drop(["review"], axis=1) # remove the review column

  review_text = BeautifulSoup(raw_review, "html.parser").get_text()


In [35]:
Uncleaned_Dataset.head()

Unnamed: 0,condition,review
1,ADHD,My son is halfway through his fourth week of I...
2,Birth Control,"I used to take another oral contraceptive, whi..."
3,Birth Control,This is my first time using any form of birth ...
4,Opiate Dependence,Suboxone has completely turned my life around....
6,Emergency Contraception,"He pulled out, but he cummed a bit in me. I to..."


In [36]:
Cleaned_Dataset.head()

Unnamed: 0,condition,review_clean
1,ADHD,son halfway fourth week intuniv became concern...
2,Birth Control,used take another oral contraceptive pill cycl...
3,Birth Control,first time using form birth control glad went ...
4,Opiate Dependence,suboxone completely turned life around feel he...
6,Emergency Contraception,pulled cummed bit took plan b hour later took ...


In [37]:
Cleaned_Dataset_test.head()

Unnamed: 0,condition,review_clean
0,Depression,tried antidepressant year citalopram fluoxetin...
2,Urinary Tract Infection,quick reduction symptom
3,Weight Loss,contrave combine drug used alcohol smoking opi...
4,Birth Control,birth control one cycle reading review type si...
6,Birth Control,copper coil month really excited thought takin...


In [38]:
Cleaned_Dataset['review_clean'][1] # example showing how cleaned reviews look like.

'son halfway fourth week intuniv became concerned began last week started taking highest dose two day could hardly get bed cranky slept nearly hour drive home school vacation unusual called doctor monday morning said stick day see school getting morning last two day problem free much agreeable ever le emotional good thing le cranky remembering thing overall behavior better tried many different medication far effective'

In [39]:
X_feat = Cleaned_Dataset['review_clean']
y = Cleaned_Dataset['condition']
# dividing dataset in 80:20 to training:testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X_feat, y, stratify = y, test_size = 0.0, random_state=0)
X_train_Cleaned = X_feat
y_train_Cleaned = y

In [40]:
X_feat_test = Cleaned_Dataset_test['review_clean']
y_test = Cleaned_Dataset_test['condition']
X_test_Cleaned = X_feat_test
y_test_Cleaned = y_test

In [41]:
# Code for evaluating a model on cleaned data
def Evaluate_Cleaned(model, X_train, X_test):
  model.fit(X_train, y_train_Cleaned)
  pred = model.predict(X_test)
  score = metrics.accuracy_score(y_test_Cleaned, pred)
  return score

# Bag of Words : Cleaned

## Uni-Gram

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words = 'english')
count_train_uni = count_vectorizer.fit_transform(X_train_Cleaned)
count_test_uni = count_vectorizer.transform(X_test_Cleaned)

### Machine Learning Model : Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
score = Evaluate_Cleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)


accuracy: 0.825


### Machine Learning Model:Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
score = Evaluate_Cleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.756


### Machine Learning Model : Passive Aggressive Classifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
model = PassiveAggressiveClassifier()
score = Evaluate_Cleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.805


### Machine Learning Model : Support Vector Machine

In [None]:
from sklearn.svm import SVC
model = SVC()
score = Evaluate_Cleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.816


### Machine Learning Model : Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
score = Evaluate_Cleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.875


## Bi-Gram

In [43]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [45]:
count_vectorizer = CountVectorizer(stop_words = 'english', ngram_range=(1, 2))
count_train_bi = count_vectorizer.fit_transform(X_train_Cleaned)
count_test_bi = count_vectorizer.transform(X_test_Cleaned)

### Machine Learning Model : Decision Tree

In [None]:
model = DecisionTreeClassifier()
score = Evaluate_Cleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.831


### Machine Learning Model:Naive Bayes

In [None]:
model = MultinomialNB()
score = Evaluate_Cleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.663


### Machine Learning Model : Passive Aggressive Classifier

In [None]:
model = PassiveAggressiveClassifier()
score = Evaluate_Cleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.879


### Machine Learning Model : Support Vector Machine

In [48]:
model = SVC()
score = Evaluate_Cleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.827


### Machine Learning Model : Random Forest

In [46]:
model = RandomForestClassifier()
score = Evaluate_Cleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.866


# TF-IDF : Cleaned

## Uni-Gram

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df = 0.8)
tfidf_train_uni = tfidf_vectorizer.fit_transform(X_train_Cleaned)
tfidf_test_uni = tfidf_vectorizer.transform(X_test_Cleaned)

### Machine Learning Model : Decision Tree

In [48]:
model = DecisionTreeClassifier()
score = Evaluate_Cleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.822


### Machine Learning Model : Naive Bayes

In [49]:
model = MultinomialNB()
score = Evaluate_Cleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.533


### Machine Learing Model: Passive Aggressive Model

In [50]:
model = PassiveAggressiveClassifier()
score = Evaluate_Cleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.843


### Machine Learning Model : Support Vector Machine

In [51]:
model = SVC()
score = Evaluate_Cleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.866


### Machine Learning Model : Random Forest

In [52]:
model = RandomForestClassifier()
score = Evaluate_Cleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.875


## Bi-gram

In [53]:
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english', max_df = 0.8, ngram_range=(1,2))
tfidf_train_bi = tfidf_vectorizer2.fit_transform(X_train_Cleaned)
tfidf_test_bi = tfidf_vectorizer2.transform(X_test_Cleaned)

### Machine Learning Model : Decision Tree

In [54]:
model = DecisionTreeClassifier()
score = Evaluate_Cleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.822


### Machine Learning Model : Naive Bayes

In [55]:
model = MultinomialNB()
score = Evaluate_Cleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.352


### Machine Learing Model: Passive Aggressive Model

In [56]:
model = PassiveAggressiveClassifier()
score = Evaluate_Cleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.898


### Machine Learning Model : Support Vector Machine

In [2]:
model = SVC()
score = Evaluate_Cleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.876


### Machine Learning Model : Random Forest

In [3]:
model = RandomForestClassifier()
score = Evaluate_Cleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.862
