In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

data from: https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29

In [2]:
drugs = pd.read_csv('data/drugsComTrain_raw.tsv', delimiter='\t')

drugs.date = pd.to_datetime(drugs.date)
drugs = drugs.rename({'Unnamed: 0':'id'}, axis=1)

drugs.head()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,2012-05-20,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,2010-04-27,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,2009-12-14,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,2015-11-03,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,2016-11-27,37


In [3]:
drugs.shape

(161297, 7)

In [4]:
drugs.drugName.value_counts()

Levonorgestrel                                     3657
Etonogestrel                                       3336
Ethinyl estradiol / norethindrone                  2850
Nexplanon                                          2156
Ethinyl estradiol / norgestimate                   2117
Ethinyl estradiol / levonorgestrel                 1888
Phentermine                                        1543
Sertraline                                         1360
Escitalopram                                       1292
Mirena                                             1242
Implanon                                           1102
Gabapentin                                         1047
Bupropion                                          1022
Venlafaxine                                        1016
Miconazole                                         1000
Citalopram                                          995
Medroxyprogesterone                                 995
Lexapro                                         

In [5]:
drugs.rating.value_counts()

10.0    50989
9.0     27531
1.0     21619
8.0     18890
7.0      9456
5.0      8013
2.0      6931
3.0      6513
6.0      6343
4.0      5012
Name: rating, dtype: int64

Most of the ratings are either a 8, 9 or 10 star. About 1/8 are 1 star.

In [6]:
drugs[['drugName','condition','rating','review']].loc[drugs.rating == 10]

Unnamed: 0,drugName,condition,rating,review
7,Aripiprazole,Bipolar Disorde,10.0,"""Abilify changed my life. There is hope. I was..."
11,L-methylfolate,Depression,10.0,"""I have taken anti-depressants for years, with..."
18,Lamotrigine,Bipolar Disorde,10.0,"""I&#039;ve been on every medicine under the su..."
19,Nilotinib,Chronic Myelogenous Leukemia,10.0,"""I have been on Tasigna for just over 3 years ..."
21,Trazodone,Insomnia,10.0,"""I have insomnia, it&#039;s horrible. My story..."
23,Etanercept,Rheumatoid Arthritis,10.0,"""I live in Western Australia and disturbed by ..."
26,Eflornithine,Hirsutism,10.0,"""I&#039;m writing a second review on Vaniqa. ..."
27,Daytrana,ADHD,10.0,"""Hi all, My son who is 12 was diagnosed when h..."
30,Azithromycin,,10.0,"""Very good response. It is so useful for me. """
32,Toradol,Pain,10.0,"""I am 30 years old. I had a multiple composite..."


In [7]:
drugs[['drugName','condition','rating','review']].loc[drugs.condition.isnull()]

Unnamed: 0,drugName,condition,rating,review
30,Azithromycin,,10.0,"""Very good response. It is so useful for me. """
148,Urea,,10.0,"""Accurate information."""
488,Doxepin,,10.0,"""So far so good. Good for me and I can take it..."
733,Ethinyl estradiol / norgestimate,,8.0,"""I haven&#039;t been on it for a long time and..."
851,Medroxyprogesterone,,6.0,"""I started the shot in July 2015 and ended in ..."
1014,Acetaminophen / caffeine,,10.0,"""I get migraine and have found out by taking e..."
1124,Tavaborole,,10.0,"""I have struggled with nail for 8 or ten years..."
1163,Acetaminophen / butalbital / caffeine / codeine,,5.0,"""I found that while this medicine does relieve..."
1253,Ethinyl estradiol / norethindrone,,4.0,"""I started Loestrin and within two months I ex..."
1267,Conjugated estrogens,,10.0,"""I had to have a total hysterectomy in 2009 in..."


In [8]:
drugs['drugName'].loc[drugs.drugName.str.contains('/')].value_counts()

Ethinyl estradiol / norethindrone                          2850
Ethinyl estradiol / norgestimate                           2117
Ethinyl estradiol / levonorgestrel                         1888
Bupropion / naltrexone                                      950
Drospirenone / ethinyl estradiol                            890
Ethinyl estradiol / etonogestrel                            635
Magnesium sulfate / potassium sulfate / sodium sulfate      626
Sulfamethoxazole / trimethoprim                             527
Desogestrel / ethinyl estradiol                             522
Acetaminophen / hydrocodone                                 498
Buprenorphine / naloxone                                    437
Amphetamine / dextroamphetamine                             419
Adapalene / benzoyl peroxide                                399
Microgestin Fe 1 / 20                                       392
Ethinyl estradiol / norelgestromin                          374
Acetaminophen / oxycodone               

In [9]:
# Check the null values and put into dataframe for readability. 
nullvals = pd.DataFrame(drugs.isna().sum())
nullvals

Unnamed: 0,0
id,0
drugName,0
condition,899
review,0
rating,0
date,0
usefulCount,0


In [10]:
drugs = drugs.dropna()

In [11]:
drugs['condition'] = drugs['condition'].apply(lambda text: re.sub("(Disorde)", "Disorder", text))

# pull out the values that contain a span tag
span_values = pd.DataFrame(drugs['drugName'].loc[drugs.condition.str.contains('</span>')].value_counts())
span_values['name'] = span_values.index

# replace the spans with the most common condition for it. 
for name in span_values['name']:
    drugs['condition'].loc[(drugs.drugName==name)&
                           (drugs.condition.str.contains('</span>'))] = drugs['condition'].loc[drugs.drugName == name].value_counts().idxmax()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
# these should probably be the drugs where there only value contains the span tag
drugs['drugName'].loc[drugs.condition.str.contains('</span>')].value_counts()

Taztia XT                              1
Pepcid AC Chewable Tablets             1
Nor-QD                                 1
Fleet Phospho Soda                     1
Uroqid-Acid No2                        1
Fluoridex                              1
Aerobid-M                              1
Dantrium                               1
Hair Regrowth Treatment for Women      1
Lescol                                 1
Fragmin                                1
Alavert D-12 Hour Allergy and Sinus    1
Nutropin                               1
Lotrimin AF Athlete's Foot Powder      1
Nystop                                 1
Orapred                                1
Spectracef                             1
Blephamide                             1
Rogaine Men's Extra Strength           1
Maxidex                                1
Regimex                                1
Gadavist                               1
Fluzone                                1
Name: drugName, dtype: int64

In [13]:
drugs = drugs.loc[drugs.condition != drugs.condition.str.contains('</span>')]

In [14]:
drugs

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,2012-05-20,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,2010-04-27,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,2009-12-14,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,2015-11-03,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,2016-11-27,37
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,2015-11-28,43
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1.0,2017-03-07,5
7,102654,Aripiprazole,Bipolar Disorder,"""Abilify changed my life. There is hope. I was...",10.0,2015-03-14,32
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1.0,2016-08-09,11
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8.0,2016-12-08,1


In [15]:
drugs[['drugName','condition','review']].loc[(drugs.drugName=='Mirena')]

Unnamed: 0,drugName,condition,review
162,Mirena,Birth Control,"""I am 29 and just had my second Mirena inserte..."
190,Mirena,Birth Control,"""I love my Mirena. I&#039;m due in February to..."
547,Mirena,Birth Control,"""I am 22, no prior children, I have endometrio..."
656,Mirena,Abnormal Uterine Bleeding,"""Just got my IUD placed today! I was honestly ..."
742,Mirena,Birth Control,"""I have had a wonderful experience with mirena..."
1126,Mirena,Birth Control,"""I&#039;m 18 and I&#039;ve had the Mirena for ..."
1218,Mirena,Birth Control,"""I&#039;ve had the Mirena for more than a year..."
1918,Mirena,Abnormal Uterine Bleeding,"""I was put on blood thinners due to a PE and h..."
2043,Mirena,Birth Control,"""Mirena has been greaat for me. We have one ch..."
2676,Mirena,Birth Control,"""I had hair loss and lack of appetite when I h..."


In [16]:
drugs[['drugName','condition','rating','review']].loc[drugs.drugName.str.contains('Ethinyl estradiol')]

Unnamed: 0,drugName,condition,rating,review
9,Ethinyl estradiol / levonorgestrel,Birth Control,8.0,"""I had been on the pill for many years. When m..."
69,Ethinyl estradiol / norgestimate,Acne,9.0,"""Best treatment for acne I have used! I&#039;v..."
117,Ethinyl estradiol / levonorgestrel,Birth Control,9.0,"""I love Lutera. I am very sensitive to other f..."
127,Ethinyl estradiol / levonorgestrel,Birth Control,10.0,"""I&#039;ve been on Jolessa for 6 months. I de..."
154,Ethinyl estradiol / norgestimate,Birth Control,5.0,"""I wrote my expirence with this pill before, b..."
163,Ethinyl estradiol / norelgestromin,Birth Control,10.0,"""This is absolutely the best birth control I h..."
186,Ethinyl estradiol / norgestimate,Birth Control,3.0,"""I don&#039;t think I noticed this at first or..."
225,Ethinyl estradiol / norelgestromin,Birth Control,10.0,"""The first day I used it, I felt terribly sick..."
226,Ethinyl estradiol / norethindrone,Birth Control,1.0,"""This birthcontrol is terrible! I am 28 years ..."
239,Ethinyl estradiol / norethindrone,Birth Control,9.0,"""No side effects, very light periods. On my 3r..."


In [17]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub("(&#039;)", "'", text)
    text = re.sub("(&amp;)", "&", text)
    text = ' '.join(text.split())
    return text

drugs.review = drugs.review.apply(lambda x: text_cleaner(x))

In [18]:
print(drugs.review[533])

"Have been on the patch for 5 months now & it's just awful, I feel nauseous all the time & get headaches and the dizziness that comes along with it. My periods are heavy & have painful cramping, some in between bleeding"


### Balance the Data

In [19]:
drugs['positive'] = np.where(drugs['rating']>7,1,0)

In [20]:
drugs['positive'].value_counts()

1    96915
0    63483
Name: positive, dtype: int64

In [21]:
drugs_s = drugs.sample(frac=1, random_state=40)

positive = drugs_s.loc[drugs_s.positive==1][:63483]
not_pos = drugs_s.loc[drugs_s.positive==0][:63483]

drugs = pd.concat([positive, not_pos])
drugs = drugs.sample(frac=1, random_state=40)

compare = pd.DataFrame()
compare['five'] = drugs.positive.loc[drugs.positive==1].describe()
compare['not'] = drugs.positive.loc[drugs.positive==0].describe()
compare

Unnamed: 0,five,not
count,63483.0,63483.0
mean,1.0,0.0
std,0.0,0.0
min,1.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,1.0,0.0


In [22]:
# how long it will take and hopefully speed some stuff up
from tqdm import tqdm
tqdm.pandas()

# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
# add parsed reviews into new column
drugs['review_parsed'] = drugs.review.progress_apply(nlp)

100%|██████████| 126966/126966 [42:12<00:00, 50.13it/s] 


In [23]:
nlp = spacy.load('en')

positive_doc1 = ' '.join(drugs.review.loc[drugs.positive==1][0:2500].astype(str))
positive_doc2 = ' '.join(drugs.review.loc[drugs.positive==1][2501:5000].astype(str))
positive_doc3 = ' '.join(drugs.review.loc[drugs.positive==1][5001:7500].astype(str))
positive_doc4 = ' '.join(drugs.review.loc[drugs.positive==1][7501:10000].astype(str))
nlp.max_length = 1500000
positive_doc1 = nlp(positive_doc1)
positive_doc2 = nlp(positive_doc2)
positive_doc3 = nlp(positive_doc3)
positive_doc4 = nlp(positive_doc4)

In [24]:
not_pos_doc1 = ' '.join(drugs.review.loc[drugs.positive==0][0:2500].astype(str))
not_pos_doc2 = ' '.join(drugs.review.loc[drugs.positive==0][2501:5000].astype(str))
not_pos_doc3 = ' '.join(drugs.review.loc[drugs.positive==0][5001:7500].astype(str))
not_pos_doc4 = ' '.join(drugs.review.loc[drugs.positive==0][7501:10000].astype(str))
nlp.max_length = 1500000
not_pos_doc1 = nlp(not_pos_doc1)
not_pos_doc2 = nlp(not_pos_doc2)
not_pos_doc3 = nlp(not_pos_doc3)
not_pos_doc4 = nlp(not_pos_doc4)

In [25]:
drugs.head()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,positive,review_parsed
77903,139941,Climara Pro,Postmenopausal Symptoms,"""The patch works great - no more debilitating ...",8.0,2017-11-07,0,1,"("", The, patch, works, great, -, no, more, deb..."
102796,196782,Paroxetine,Anxiety,"""Here is my experience with 10+ years of takin...",1.0,2017-05-27,20,0,"("", Here, is, my, experience, with, 10, +, yea..."
38606,13204,Vitamin e,Dietary Supplementation,"""I am highly allergic to Vitamin E. I had a fu...",1.0,2011-02-07,3,0,"("", I, am, highly, allergic, to, Vitamin, E., ..."
65280,12969,Vardenafil,Erectile Dysfunction,"""Works very well. Using 20mg it usually starts...",9.0,2013-05-05,64,1,"("", Works, very, well, ., Using, 20, mg, it, u..."
14500,214206,Tioconazole,Vaginal Yeast Infection,"""Had mild symptoms that a yeast infection was ...",8.0,2016-02-21,1,1,"("", Had, mild, symptoms, that, a, yeast, infec..."


In [67]:
from collections import Counter

# Utility function to create a list of the 200 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(300)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(drugs, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = drugs['review_parsed']
    df['rating'] = drugs['rating']
    df['positive'] = drugs['positive']
    df['date'] = drugs['date']
    df['usefulCount'] = drugs['usefulCount']
    df['drugName'] = drugs['drugName']
    df['condition'] = drugs['condition']
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 5000 == 0:
            print("Processing row {}".format(i))
            
    return df

In [63]:
common_words_pos1 = bag_of_words(positive_doc1)
common_words_pos2 = bag_of_words(positive_doc2)
common_words_pos3 = bag_of_words(positive_doc3)
common_words_pos4 = bag_of_words(positive_doc4)

common_words_neg1 = bag_of_words(not_pos_doc1)
common_words_neg2 = bag_of_words(not_pos_doc2)
common_words_neg3 = bag_of_words(not_pos_doc3)
common_words_neg4 = bag_of_words(not_pos_doc4)

common_words = (common_words_pos1+common_words_pos2+common_words_pos3+common_words_pos4+
               common_words_neg1+common_words_neg2+common_words_neg3+common_words_neg4)

common_words = set(common_words)

print(len(common_words))

402


In [65]:
print(common_words)

{'diagnose', 'break', 'painful', 'wonderful', 'mild', 'bc', 'pain', 'make', '3', 'stomach', 'call', '2015', 'pressure', 'drug', 'emotional', 'low', 'birth', 'last', 'suppose', 'love', 'month', '8', '20', 'lb', 'need', 'infection', 'crazy', 'after', 'red', 'end', 'shot', 'hope', 'sweat', 'anymore', 'recently', 'and', 'test', 'mood', 'cough', 'everyday', 'hair', 'sex', 'point', 'definitely', 'change', 'mind', 'zoloft', 'improvement', 'pill', 'reaction', 'deal', 'tell', '10', 'withdrawal', 'itching', 'treatment', 'thought', 'itch', 'iud', 'sore', 'night', 'suggest', 'loss', '6', 'highly', 'pound', 'notice', 'cramping', 'mirena', 'severe', 'constipation', 'reduce', 'new', 'pretty', 'level', 'fatigue', 'bit', 'know', 'big', 'decide', 'hard', 'get', 'dizziness', 'spot', 'happen', 'awful', 'asleep', 'depressed', 'wait', 'etc', 'generic', 'literally', 'constantly', 'wear', 'calm', 'horrible', 'take', '1', 'minute', 'symptom', 'switch', 'insomnia', 'taste', 'drop', 'worry', 'have', 'terrible', 

In [75]:
drugs = drugs.reset_index(drop=True)

In [76]:
wc = bow_features(drugs, common_words)

Processing row 0
Processing row 5000
Processing row 10000
Processing row 15000
Processing row 20000
Processing row 25000
Processing row 30000
Processing row 35000
Processing row 40000
Processing row 45000
Processing row 50000
Processing row 55000
Processing row 60000
Processing row 65000
Processing row 70000
Processing row 75000
Processing row 80000
Processing row 85000
Processing row 90000
Processing row 95000
Processing row 100000
Processing row 105000
Processing row 110000
Processing row 115000
Processing row 120000
Processing row 125000


In [87]:
wc['positive'] = np.where(wc['rating'] > 7,1,0)

In [90]:
wc.head()

Unnamed: 0,diagnose,break,painful,wonderful,mild,bc,pain,make,3,stomach,...,find,fall,anxiety,life,text_sentence,rating,date,usefulCount,drugName,condition
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,"("", The, patch, works, great, -, no, more, deb...",8.0,2017-11-07,0,Climara Pro,Postmenopausal Symptoms
1,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,"("", Here, is, my, experience, with, 10, +, yea...",1.0,2017-05-27,20,Paroxetine,Anxiety
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"("", I, am, highly, allergic, to, Vitamin, E., ...",1.0,2011-02-07,3,Vitamin e,Dietary Supplementation
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"("", Works, very, well, ., Using, 20, mg, it, u...",9.0,2013-05-05,64,Vardenafil,Erectile Dysfunction
4,0,0,1,0,1,0,1,0,1,0,...,1,0,0,0,"("", Had, mild, symptoms, that, a, yeast, infec...",8.0,2016-02-21,1,Tioconazole,Vaginal Yeast Infection


In [85]:
wc.columns

Index(['diagnose', 'break', 'painful', 'wonderful', 'mild', 'bc', 'pain',
       'make', '3', 'stomach',
       ...
       'find', 'fall', 'anxiety', 'life', 'text_sentence', 'rating', 'date',
       'usefulCount', 'drugName', 'condition'],
      dtype='object', length=408)

In [92]:
X = wc.drop(['text_sentence', 'rating', 'date','usefulCount', 'drugName', 'condition','positive'], 1)
y = wc['positive']

In [93]:
# to split a training and test sample
from sklearn.model_selection import train_test_split, cross_val_score
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [94]:
from sklearn.metrics import roc_auc_score

# How long will it take
from datetime import datetime
start_time = datetime.now()

# get rid of the warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1e9, max_iter = 150)
lr.fit(x_train, y_train)

# Set Up Predictor
ypred_lr = lr.predict(x_test)

print('Train Percentage accuracy:')
print(lr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lr.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(lr, x_train, y_train, cv = 5))

auc = roc_auc_score(y_test, ypred_lr)
print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

from sklearn.metrics import classification_report
print('\nClassification Report:\n')
print(classification_report(y_test, ypred_lr))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

Train Percentage accuracy:
0.7357388893556246

Test Percentage accuracy:
0.7403755276920169

Cross Validation:
[0.73079548 0.73415595 0.73063796 0.7381465  0.73020374]

Area Under Curve:
AUC: 0.740

Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.74      0.74     16048
           1       0.74      0.74      0.74     15694

   micro avg       0.74      0.74      0.74     31742
   macro avg       0.74      0.74      0.74     31742
weighted avg       0.74      0.74      0.74     31742


Duration: 0:00:10.750878


In [96]:
start_time = datetime.now()
# get rid of the warnings
import warnings
warnings.filterwarnings("ignore")

# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

#GridSearchCV for random forest 
param_grid = {'C':[1e9,.1,.5,1,3], 'max_iter':[25,50,100,200,300], 'penalty':['l1','l2']}

# Start the grid search again
grid_DT = GridSearchCV(lr, param_grid, cv=3, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of the grid search
# View the accuracy score
print('Best score for data:')
print(grid_DT.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.5min finished


Best score for data:
{'C': 0.5, 'max_iter': 100, 'penalty': 'l1'}


In [97]:
from sklearn.metrics import roc_auc_score

# How long will it take
from datetime import datetime
start_time = datetime.now()

# get rid of the warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=.5, max_iter=100, penalty='l1')
lr.fit(x_train, y_train)

# Set Up Predictor
ypred_lr = lr.predict(x_test)

print('Train Percentage accuracy:')
print(lr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lr.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(lr, x_train, y_train, cv = 5))

auc = roc_auc_score(y_test, ypred_lr)
print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

from sklearn.metrics import classification_report
print('\nClassification Report:\n')
print(classification_report(y_test, ypred_lr))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

Train Percentage accuracy:
0.7357598924640847

Test Percentage accuracy:
0.7399974796799194

Cross Validation:
[0.7309005  0.734576   0.73074298 0.73804148 0.72978366]

Area Under Curve:
AUC: 0.740

Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.74      0.74     16048
           1       0.74      0.74      0.74     15694

   micro avg       0.74      0.74      0.74     31742
   macro avg       0.74      0.74      0.74     31742
weighted avg       0.74      0.74      0.74     31742


Duration: 0:00:09.517911


### KNN

In [98]:
start_time = datetime.now()

from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=30)
knn.fit(x_train, y_train)
# Set Up Predictor
ypred_knn = knn.predict(x_test)

print('\nTrain Percentage accuracy:')
print(knn.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(knn.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(knn, x_train, y_train, cv = 5))

print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

print('\nClassification Report:\n')
print(classification_report(y_test, ypred_knn))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.6977862723683105

Test Percentage accuracy:
0.6574885010396321

Cross Validation:
[0.65182463 0.65035442 0.65271725 0.650932   0.65574459]

Area Under Curve:
AUC: 0.740

Classification Report:

              precision    recall  f1-score   support

           0       0.68      0.61      0.64     16048
           1       0.64      0.71      0.67     15694

   micro avg       0.66      0.66      0.66     31742
   macro avg       0.66      0.66      0.66     31742
weighted avg       0.66      0.66      0.66     31742


Duration: 3:08:58.201262


In [99]:
start_time = datetime.now()
from sklearn import tree

dec_tr = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=20,
    random_state = 1337
)

dec_tr.fit(x_train, y_train)
# Set Up Predictor
ypred_dec_tr = dec_tr.predict(x_test)

print('\nTrain Percentage accuracy:')
print(dec_tr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(dec_tr.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(dec_tr, x_train, y_train, cv = 5))

print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

print('\nClassification Report:\n')
print(classification_report(y_test, ypred_dec_tr))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.560898512979921

Test Percentage accuracy:
0.560046625921492

Cross Validation:
[0.55526385 0.55515883 0.5439748  0.55573641 0.5484142 ]

Area Under Curve:
AUC: 0.740

Classification Report:

              precision    recall  f1-score   support

           0       0.55      0.78      0.64     16048
           1       0.60      0.34      0.43     15694

   micro avg       0.56      0.56      0.56     31742
   macro avg       0.57      0.56      0.54     31742
weighted avg       0.57      0.56      0.54     31742


Duration: 0:00:02.898640


In [101]:
start_time = datetime.now()
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier(n_estimators=50, max_depth=8)
rfc.fit(x_train, y_train)

ypred_rfc = rfc.predict(x_test)

print('\nTrain Percentage accuracy:')
print(rfc.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(rfc.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(rfc, x_train, y_train, cv = 5))

print('\nClassification Report:')
print(classification_report(y_test, ypred_rfc))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.7064920608250022

Test Percentage accuracy:
0.6989792703673366

Cross Validation:
[0.69440798 0.69209766 0.69782095 0.70191651 0.69649233]

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.71      0.70     16048
           1       0.70      0.69      0.69     15694

   micro avg       0.70      0.70      0.70     31742
   macro avg       0.70      0.70      0.70     31742
weighted avg       0.70      0.70      0.70     31742


Duration: 0:00:20.795433


In [102]:
start_time = datetime.now()
from sklearn import ensemble

clf = ensemble.GradientBoostingClassifier(loss='exponential', max_depth=10 , n_estimators=50 )
clf.fit(x_train, y_train)

ypred_clf = clf.predict(x_test)

print('\nTrain Percentage accuracy:')
print(clf.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(clf.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(clf, x_train, y_train, cv = 5))

print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

print('\nClassification Report:')
print(classification_report(y_test, ypred_clf))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.7976140468789381

Test Percentage accuracy:
0.7451956398462605

Cross Validation:
[0.73856655 0.74171699 0.73982673 0.74166448 0.73571729]

Area Under Curve:
AUC: 0.740

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.77      0.75     16048
           1       0.75      0.72      0.74     15694

   micro avg       0.75      0.75      0.75     31742
   macro avg       0.75      0.74      0.74     31742
weighted avg       0.75      0.75      0.75     31742


Duration: 1 day, 0:20:44.608515


In [104]:
# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

# Set the params
param_grid = {'n_estimators':[50,100,150,200], 'max_depth':[4,6,8,10],
              'loss':['deviance','exponential']}

# Start the grid search again
grid_DT = GridSearchCV(clf, param_grid, cv=2, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of the grid search
# View the accuracy score
print('Best score for data:', grid_DT.best_params_)

Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 89.6min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed: 184.2min finished


Best score for data: {'loss': 'exponential', 'max_depth': 10, 'n_estimators': 200}


In [105]:
start_time = datetime.now()
from sklearn import ensemble

clf = ensemble.GradientBoostingClassifier(loss='exponential', max_depth=10 , n_estimators=200 )
clf.fit(x_train, y_train)

ypred_clf = clf.predict(x_test)

print('\nTrain Percentage accuracy:')
print(clf.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(clf.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(clf, x_train, y_train, cv = 5))

print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

print('\nClassification Report:')
print(classification_report(y_test, ypred_clf))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.8784445097874486

Test Percentage accuracy:
0.7879780732152983

Cross Validation:
[0.77642426 0.77757942 0.78025729 0.7818325  0.77830288]

Area Under Curve:
AUC: 0.740

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.80      0.79     16048
           1       0.79      0.78      0.78     15694

   micro avg       0.79      0.79      0.79     31742
   macro avg       0.79      0.79      0.79     31742
weighted avg       0.79      0.79      0.79     31742


Duration: 2:50:44.450230


____

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary=common_words)

In [51]:
df = pd.DataFrame()
df['text_sentence'] = drugs['review_parsed']
df['str_sentence'] = [_.text for _ in df.text_sentence]
df['rating'] = drugs['rating']
df['positive'] = drugs['positive']
df['date'] = drugs['date']
df['usefulCount'] = drugs['usefulCount']
df['condition'] = drugs['condition']
df['drugName'] = drugs['drugName']

In [52]:
X = vectorizer.fit_transform(df['str_sentence'])

In [58]:
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [59]:
word_counts = pd.concat([df,df2], axis=1)
print(word_counts.shape)
word_counts.head()

(154139, 410)


Unnamed: 0,text_sentence,str_sentence,rating,positive,date,usefulCount,condition,drugName,diagnose,break,...,care,develop,medication,suffer,today,walk,find,fall,anxiety,life
0,,,,,NaT,,,,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"("", My, son, is, halfway, through, his, fourth...","""My son is halfway through his fourth week of ...",8.0,1.0,2010-04-27,192.0,ADHD,Guanfacine,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
2,"("", I, used, to, take, another, oral, contrace...","""I used to take another oral contraceptive, wh...",5.0,0.0,2009-12-14,17.0,Birth Control,Lybrel,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"("", This, is, my, first, time, using, any, for...","""This is my first time using any form of birth...",8.0,1.0,2015-11-03,10.0,Birth Control,Ortho Evra,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"("", Suboxone, has, completely, turned, my, lif...","""Suboxone has completely turned my life around...",9.0,1.0,2016-11-27,37.0,Opiate Dependence,Buprenorphine / naloxone,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [60]:
#CHECK
word_counts.loc[1,'str_sentence']

'"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. We have tried many different medications and so far this is the most effective."'

In [57]:
df2.head()

Unnamed: 0,diagnose,break,painful,wonderful,mild,bc,pain,make,3,stomach,...,develop,medication,suffer,today,walk,find,fall,anxiety,life,str_sentence
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,"""My son is halfway through his fourth week of ..."
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"""I used to take another oral contraceptive, wh..."
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"""This is my first time using any form of birth..."
4,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,"""Suboxone has completely turned my life around..."


In [37]:
df.head()

Unnamed: 0,text_sentence,str_sentence,rating,positive,date,usefulCount,condition,drugName
77903,"("", The, patch, works, great, -, no, more, deb...","""The patch works great - no more debilitating ...",8.0,1,2017-11-07,0,Postmenopausal Symptoms,Climara Pro
102796,"("", Here, is, my, experience, with, 10, +, yea...","""Here is my experience with 10+ years of takin...",1.0,0,2017-05-27,20,Anxiety,Paroxetine
38606,"("", I, am, highly, allergic, to, Vitamin, E., ...","""I am highly allergic to Vitamin E. I had a fu...",1.0,0,2011-02-07,3,Dietary Supplementation,Vitamin e
65280,"("", Works, very, well, ., Using, 20, mg, it, u...","""Works very well. Using 20mg it usually starts...",9.0,1,2013-05-05,64,Erectile Dysfunction,Vardenafil
14500,"("", Had, mild, symptoms, that, a, yeast, infec...","""Had mild symptoms that a yeast infection was ...",8.0,1,2016-02-21,1,Vaginal Yeast Infection,Tioconazole


In [51]:
df2.shape

(126966, 402)

In [40]:
word_counts = df.merge(df2, on='text_sentence')
print(word_counts.shape)
word_counts.head()

(99793, 410)


Unnamed: 0,text_sentence,str_sentence,rating,positive_x,date,usefulCount,condition,drugName,diagnose,break,...,care,develop,medication,suffer,today,walk,find,fall,anxiety,life
0,"("", The, patch, works, great, -, no, more, deb...","""The patch works great - no more debilitating ...",8.0,1,2017-11-07,0,Postmenopausal Symptoms,Climara Pro,0,0,...,0,0,0,0,0,0,0,0,0,1
1,"("", Here, is, my, experience, with, 10, +, yea...","""Here is my experience with 10+ years of takin...",1.0,0,2017-05-27,20,Anxiety,Paroxetine,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"("", I, am, highly, allergic, to, Vitamin, E., ...","""I am highly allergic to Vitamin E. I had a fu...",1.0,0,2011-02-07,3,Dietary Supplementation,Vitamin e,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"("", Works, very, well, ., Using, 20, mg, it, u...","""Works very well. Using 20mg it usually starts...",9.0,1,2013-05-05,64,Erectile Dysfunction,Vardenafil,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"("", Had, mild, symptoms, that, a, yeast, infec...","""Had mild symptoms that a yeast infection was ...",8.0,1,2016-02-21,1,Vaginal Yeast Infection,Tioconazole,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
#CHECK
word_counts.loc[0,'str_sentence']

'"The patch works great - no more debilitating hot flashes. But, I almost gave up on the medication because of the difficulty and frustration removing the plastic backing. I finally figured out how to get it off. I take a paring knife and in the middle, tease one side slowly off the adhesive until it is clear it will separate, but I don\'t remove it yet. Then I do the same thing on the other side. Once it is clear that both sides will come off easily, I remove one side completely, stick it in place, then remove the other side. Make sure your skin is stretched out with no wrinkles where you apply it. This has completely removed my frustration with the patch! If it weren\'t for the flawed design, I would rate it a 10, so instead it gets an 8"'

In [42]:
word_counts.loc[3].describe()

count     410
unique     11
top         0
freq      381
Name: 3, dtype: int64

In [48]:
word_counts['life'].head()

0    1
1    0
2    0
3    0
4    0
Name: life, dtype: int64