## Data Loading

In [1]:
import pandas as pd     # for data preprocessing
from sklearn import metrics    # for calculating accuracy

# To show all the rows of pandas dataframe
pd.set_option("display.max_rows", None)

In [2]:
df = pd.read_csv("/kaggle/input/medical-feedback/drugsComTrain_raw.tsv", sep="\t") # read dataset

In [3]:
df_test = pd.read_csv("/kaggle/input/medical-feedback/drugsComTest_raw.tsv", sep="\t") # read dataset

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [5]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [6]:
df.head()["review"].iloc[0]

'"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"'

In [7]:
num_rows = df.shape[0]
print(num_rows)
print(df.shape)

num_rows = df_test.shape[0]
print(num_rows)
print(df_test.shape)

161297
(161297, 7)
53766
(53766, 7)


In [8]:
df.condition.value_counts()[:40]

condition
Birth Control                  28788
Depression                      9069
Pain                            6145
Anxiety                         5904
Acne                            5588
Bipolar Disorde                 4224
Insomnia                        3673
Weight Loss                     3609
Obesity                         3568
ADHD                            3383
Diabetes, Type 2                2554
Emergency Contraception         2463
High Blood Pressure             2321
Vaginal Yeast Infection         2274
Abnormal Uterine Bleeding       2096
Bowel Preparation               1859
ibromyalgia                     1791
Smoking Cessation               1780
Migraine                        1694
Anxiety and Stress              1663
Major Depressive Disorde        1607
Constipation                    1595
Panic Disorde                   1463
Chronic Pain                    1455
Migraine Prevention             1413
Urinary Tract Infection         1316
Muscle Spasm                

In [9]:
medical_conditions = ['Birth Control',
 'Depression',
 'Pain',
 'Anxiety',
 'Acne',
 'Bipolar Disorde',
 'Insomnia',
 'Weight Loss',
 'Obesity',
 'ADHD',
 'Diabetes, Type 2',
 'Emergency Contraception',
 'High Blood Pressure',
 'Vaginal Yeast Infection',
 'Abnormal Uterine Bleeding',
 'Bowel Preparation',
 'ibromyalgia',
 'Smoking Cessation',
 'Migraine',
 'Anxiety and Stress',
'Major Depressive Disorde',
'Constipation',
'Panic Disorde',
'Chronic Pain',
'Migraine Prevention',
'Urinary Tract Infection',
'Muscle Spasm',
'Osteoarthritis',
'Generalized Anxiety Disorde',
'Erectile Dysfunction',
'Opiate Dependence',
'Irritable Bowel Syndrome',
'Rheumatoid Arthritis',
'Allergic Rhinitis',
'Bacterial Infection',
'Cough',
'Sinusitis',
'Nausea/Vomiting',
'GERD',
'Multiple Sclerosis']

In [10]:
df_train = df[df["condition"].isin(medical_conditions)]

In [11]:
df_test_data = df_test[df_test["condition"].isin(medical_conditions)]

In [12]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1.0,"March 7, 2017",5


In [13]:
df_test_data.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4
6,215892,Copper,Birth Control,"""I&#039;ve had the copper coil for about 3 mon...",6.0,"June 6, 2016",1


In [14]:
df_train.shape

(116963, 7)

In [15]:
df_test_data.shape

(39117, 7)

In [16]:
Uncleaned_Dataset_test = df_test_data.drop(['Unnamed: 0','drugName', 'rating', 'date', 'usefulCount'], axis=1) # keep only medical condition and review column

In [17]:
Uncleaned_Dataset = df_train.drop(['Unnamed: 0','drugName', 'rating', 'date', 'usefulCount'], axis=1) # keep only medical condition and review column

In [18]:
Uncleaned_Dataset.head()

Unnamed: 0,condition,review
1,ADHD,"""My son is halfway through his fourth week of ..."
2,Birth Control,"""I used to take another oral contraceptive, wh..."
3,Birth Control,"""This is my first time using any form of birth..."
4,Opiate Dependence,"""Suboxone has completely turned my life around..."
6,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t..."


In [19]:
Uncleaned_Dataset_test.head()

Unnamed: 0,condition,review
0,Depression,"""I&#039;ve tried a few antidepressants over th..."
2,Urinary Tract Infection,"""Quick reduction of symptoms"""
3,Weight Loss,"""Contrave combines drugs that were used for al..."
4,Birth Control,"""I have been on this birth control for one cyc..."
6,Birth Control,"""I&#039;ve had the copper coil for about 3 mon..."


In [20]:
Uncleaned_Dataset["review"][2] # checking a review sample to get an idea about the word composition

'"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."'

In [21]:
Uncleaned_Dataset["review"][11]

'"I have taken anti-depressants for years, with some improvement but mostly moderate to severe side affects, which makes me go off them.\r\n\r\nI only take Cymbalta now mostly for pain.\r\n\r\nWhen I began Deplin, I noticed a major improvement overnight. More energy, better disposition, and no sinking to the low lows of major depression. I have been taking it for about 3 months now and feel like a normal person for the first time ever. Best thing, no side effects."'

In [22]:
for i, col in enumerate(Uncleaned_Dataset.columns): # removes ' " ' from all the reviews
  Uncleaned_Dataset.iloc[:,i]=Uncleaned_Dataset.iloc[:,i].str.replace('"','')

Removing stopwords

In [23]:
for i, col in enumerate(Uncleaned_Dataset_test.columns): # removes ' " ' from all the reviews
  Uncleaned_Dataset_test.iloc[:,i]=Uncleaned_Dataset_test.iloc[:,i].str.replace('"','')

In [24]:
Uncleaned_Dataset.head()

Unnamed: 0,condition,review
1,ADHD,My son is halfway through his fourth week of I...
2,Birth Control,"I used to take another oral contraceptive, whi..."
3,Birth Control,This is my first time using any form of birth ...
4,Opiate Dependence,Suboxone has completely turned my life around....
6,Emergency Contraception,"He pulled out, but he cummed a bit in me. I to..."


In [25]:
Uncleaned_Dataset_test.head()

Unnamed: 0,condition,review
0,Depression,I&#039;ve tried a few antidepressants over the...
2,Urinary Tract Infection,Quick reduction of symptoms
3,Weight Loss,Contrave combines drugs that were used for alc...
4,Birth Control,I have been on this birth control for one cycl...
6,Birth Control,I&#039;ve had the copper coil for about 3 mont...


In [26]:
X_train_Uncleaned = Uncleaned_Dataset['review']
y_train_Uncleaned = Uncleaned_Dataset['condition']

In [27]:
X_test_Uncleaned = Uncleaned_Dataset['review']
y_test_Uncleaned = Uncleaned_Dataset['condition']

## Creating features and Target Variables

In [28]:
# Code for evaluating a model on uncleaned data
def Evaluate_Uncleaned(model, X_train, X_test):
  model.fit(X_train, y_train_Uncleaned)
  pred = model.predict(X_test)
  score = metrics.accuracy_score(y_test_Uncleaned, pred)
  return score

# Bag of Words : Uncleaned

## Uni-Gram

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words = 'english')
count_train_uni = count_vectorizer.fit_transform(X_train_Uncleaned)
count_test_uni = count_vectorizer.transform(X_test_Uncleaned)

### Machine Learning Model : Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
score = Evaluate_Uncleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)


accuracy: 0.997


### Machine Learning Model:Naive Bayes

In [31]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
score = Evaluate_Uncleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.788


### Machine Learning Model : Passive Aggressive Classifier

In [32]:
from sklearn.linear_model import PassiveAggressiveClassifier
model = PassiveAggressiveClassifier()
score = Evaluate_Uncleaned(model, count_train_uni, count_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.928


## Bi-Gram

In [33]:
count_vectorizer = CountVectorizer(stop_words = 'english', ngram_range=(1, 2))
count_train_bi = count_vectorizer.fit_transform(X_train_Uncleaned)
count_test_bi = count_vectorizer.transform(X_test_Uncleaned)

### Machine Learning Model : Decision Tree

In [34]:
model = DecisionTreeClassifier()
score = Evaluate_Uncleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.997


### Machine Learning Model:Naive Bayes

In [35]:
model = MultinomialNB()
score = Evaluate_Uncleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.757


### Machine Learning Model : Passive Aggressive Classifier

In [36]:
model = PassiveAggressiveClassifier()
score = Evaluate_Uncleaned(model, count_train_bi, count_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.996


# TF-IDF : Uncleaned

## Uni-Gram

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df = 0.8)
tfidf_train_uni = tfidf_vectorizer.fit_transform(X_train_Uncleaned)
tfidf_test_uni = tfidf_vectorizer.transform(X_test_Uncleaned)

### Machine Learning Model : Decision Tree

In [38]:
model = DecisionTreeClassifier()
score = Evaluate_Uncleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.997


### Machine Learning Model : Naive Bayes

In [39]:
model = MultinomialNB()
score = Evaluate_Uncleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.532


### Machine Learing Model: Passive Aggressive Model

In [40]:
model = PassiveAggressiveClassifier()
score = Evaluate_Uncleaned(model, tfidf_train_uni, tfidf_test_uni)
print("accuracy: %0.3f" % score)

accuracy: 0.972


## Bi-gram

In [41]:
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english', max_df = 0.8, ngram_range=(1,2))
tfidf_train_bi = tfidf_vectorizer2.fit_transform(X_train_Uncleaned)
tfidf_test_bi = tfidf_vectorizer2.transform(X_test_Uncleaned)

### Machine Learning Model : Decision Tree

In [42]:
model = DecisionTreeClassifier()
score = Evaluate_Uncleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.997


### Machine Learning Model : Naive Bayes

In [43]:
model = MultinomialNB()
score = Evaluate_Uncleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.357


### Machine Learing Model: Passive Aggressive Model

In [44]:
model = PassiveAggressiveClassifier()
score = Evaluate_Uncleaned(model, tfidf_train_bi, tfidf_test_bi)
print("accuracy: %0.3f" % score)

accuracy: 0.997
