In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

data from: https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29

In [2]:
drugs = pd.read_csv('data/drugsComTrain_raw.tsv', delimiter='\t')

drugs.date = pd.to_datetime(drugs.date)
drugs = drugs.rename({'Unnamed: 0':'id'}, axis=1)

drugs.head()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,2012-05-20,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,2010-04-27,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,2009-12-14,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,2015-11-03,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,2016-11-27,37


In [3]:
drugs.shape

(161297, 7)

In [4]:
drugs.drugName.value_counts()

Levonorgestrel                        3657
Etonogestrel                          3336
Ethinyl estradiol / norethindrone     2850
Nexplanon                             2156
Ethinyl estradiol / norgestimate      2117
Ethinyl estradiol / levonorgestrel    1888
Phentermine                           1543
Sertraline                            1360
Escitalopram                          1292
Mirena                                1242
Implanon                              1102
Gabapentin                            1047
Bupropion                             1022
Venlafaxine                           1016
Miconazole                            1000
Medroxyprogesterone                    995
Citalopram                             995
Lexapro                                952
Bupropion / naltrexone                 950
Duloxetine                             934
Metronidazole                          922
Contrave                               920
Drospirenone / ethinyl estradiol       890
Depo-Prover

In [5]:
drugs.rating.value_counts()

10.0    50989
9.0     27531
1.0     21619
8.0     18890
7.0      9456
5.0      8013
2.0      6931
3.0      6513
6.0      6343
4.0      5012
Name: rating, dtype: int64

Most of the ratings are either a 8, 9 or 10 star. About 1/8 are 1 star.

In [6]:
drugs.condition.value_counts()

Birth Control                                            28788
Depression                                                9069
Pain                                                      6145
Anxiety                                                   5904
Acne                                                      5588
Bipolar Disorde                                           4224
Insomnia                                                  3673
Weight Loss                                               3609
Obesity                                                   3568
ADHD                                                      3383
Diabetes, Type 2                                          2554
Emergency Contraception                                   2463
High Blood Pressure                                       2321
Vaginal Yeast Infection                                   2274
Abnormal Uterine Bleeding                                 2096
Bowel Preparation                                      

In [7]:
drugs[['drugName','condition','rating','review']].loc[drugs.rating == 10]

Unnamed: 0,drugName,condition,rating,review
7,Aripiprazole,Bipolar Disorde,10.0,"""Abilify changed my life. There is hope. I was..."
11,L-methylfolate,Depression,10.0,"""I have taken anti-depressants for years, with..."
18,Lamotrigine,Bipolar Disorde,10.0,"""I&#039;ve been on every medicine under the su..."
19,Nilotinib,Chronic Myelogenous Leukemia,10.0,"""I have been on Tasigna for just over 3 years ..."
21,Trazodone,Insomnia,10.0,"""I have insomnia, it&#039;s horrible. My story..."
23,Etanercept,Rheumatoid Arthritis,10.0,"""I live in Western Australia and disturbed by ..."
26,Eflornithine,Hirsutism,10.0,"""I&#039;m writing a second review on Vaniqa. ..."
27,Daytrana,ADHD,10.0,"""Hi all, My son who is 12 was diagnosed when h..."
30,Azithromycin,,10.0,"""Very good response. It is so useful for me. """
32,Toradol,Pain,10.0,"""I am 30 years old. I had a multiple composite..."


In [8]:
drugs[['drugName','condition','rating','review']].loc[drugs.condition.isnull()]

Unnamed: 0,drugName,condition,rating,review
30,Azithromycin,,10.0,"""Very good response. It is so useful for me. """
148,Urea,,10.0,"""Accurate information."""
488,Doxepin,,10.0,"""So far so good. Good for me and I can take it..."
733,Ethinyl estradiol / norgestimate,,8.0,"""I haven&#039;t been on it for a long time and..."
851,Medroxyprogesterone,,6.0,"""I started the shot in July 2015 and ended in ..."
1014,Acetaminophen / caffeine,,10.0,"""I get migraine and have found out by taking e..."
1124,Tavaborole,,10.0,"""I have struggled with nail for 8 or ten years..."
1163,Acetaminophen / butalbital / caffeine / codeine,,5.0,"""I found that while this medicine does relieve..."
1253,Ethinyl estradiol / norethindrone,,4.0,"""I started Loestrin and within two months I ex..."
1267,Conjugated estrogens,,10.0,"""I had to have a total hysterectomy in 2009 in..."


In [9]:
drugs['drugName'].loc[drugs.drugName.str.contains('/')].value_counts()

Ethinyl estradiol / norethindrone                          2850
Ethinyl estradiol / norgestimate                           2117
Ethinyl estradiol / levonorgestrel                         1888
Bupropion / naltrexone                                      950
Drospirenone / ethinyl estradiol                            890
Ethinyl estradiol / etonogestrel                            635
Magnesium sulfate / potassium sulfate / sodium sulfate      626
Sulfamethoxazole / trimethoprim                             527
Desogestrel / ethinyl estradiol                             522
Acetaminophen / hydrocodone                                 498
Buprenorphine / naloxone                                    437
Amphetamine / dextroamphetamine                             419
Adapalene / benzoyl peroxide                                399
Microgestin Fe 1 / 20                                       392
Ethinyl estradiol / norelgestromin                          374
Acetaminophen / oxycodone               

In [10]:
# Check the null values and put into dataframe for readability. 
nullvals = pd.DataFrame(drugs.isna().sum())
nullvals

Unnamed: 0,0
id,0
drugName,0
condition,899
review,0
rating,0
date,0
usefulCount,0


In [11]:
drugs = drugs.dropna()

In [12]:
drugs['condition'] = drugs['condition'].apply(lambda text: re.sub("(Disorde)", "Disorder", text))

# pull out the values that contain a span tag
span_values = pd.DataFrame(drugs['drugName'].loc[drugs.condition.str.contains('</span>')].value_counts())
span_values['name'] = span_values.index

# replace the spans with the most common condition for it. 
for name in span_values['name']:
    drugs['condition'].loc[(drugs.drugName==name)&
                           (drugs.condition.str.contains('</span>'))] = drugs['condition'].loc[drugs.drugName == name].value_counts().idxmax()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
# these should probably be the drugs where there only value contains the span tag
drugs['drugName'].loc[drugs.condition.str.contains('</span>')].value_counts()

femhrt                                 1
Nor-QD                                 1
Lotrimin AF Athlete's Foot Powder      1
Fluoridex                              1
Hair Regrowth Treatment for Women      1
Lescol                                 1
Uroqid-Acid No2                        1
Pepcid AC Chewable Tablets             1
Darvocet-N 50                          1
Nystop                                 1
Spectracef                             1
Alavert D-12 Hour Allergy and Sinus    1
Taztia XT                              1
Blephamide                             1
Latisse                                1
Dantrium                               1
Regimex                                1
Nutropin                               1
Maxidex                                1
Rogaine Men's Extra Strength           1
Aerobid-M                              1
Fragmin                                1
Fleet Phospho Soda                     1
Gadavist                               1
Orapred         

In [14]:
drugs = drugs.loc[drugs.condition != drugs.condition.str.contains('</span>')]

In [15]:
drugs

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,2012-05-20,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,2010-04-27,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,2009-12-14,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,2015-11-03,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,2016-11-27,37
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,2015-11-28,43
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1.0,2017-03-07,5
7,102654,Aripiprazole,Bipolar Disorder,"""Abilify changed my life. There is hope. I was...",10.0,2015-03-14,32
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1.0,2016-08-09,11
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8.0,2016-12-08,1


In [16]:
drugs[['drugName','condition','review']].loc[(drugs.drugName=='Mirena')]

Unnamed: 0,drugName,condition,review
162,Mirena,Birth Control,"""I am 29 and just had my second Mirena inserte..."
190,Mirena,Birth Control,"""I love my Mirena. I&#039;m due in February to..."
547,Mirena,Birth Control,"""I am 22, no prior children, I have endometrio..."
656,Mirena,Abnormal Uterine Bleeding,"""Just got my IUD placed today! I was honestly ..."
742,Mirena,Birth Control,"""I have had a wonderful experience with mirena..."
1126,Mirena,Birth Control,"""I&#039;m 18 and I&#039;ve had the Mirena for ..."
1218,Mirena,Birth Control,"""I&#039;ve had the Mirena for more than a year..."
1918,Mirena,Abnormal Uterine Bleeding,"""I was put on blood thinners due to a PE and h..."
2043,Mirena,Birth Control,"""Mirena has been greaat for me. We have one ch..."
2676,Mirena,Birth Control,"""I had hair loss and lack of appetite when I h..."


In [17]:
drugs[['drugName','condition','rating','review']].loc[drugs.drugName.str.contains('Ethinyl estradiol')]

Unnamed: 0,drugName,condition,rating,review
9,Ethinyl estradiol / levonorgestrel,Birth Control,8.0,"""I had been on the pill for many years. When m..."
69,Ethinyl estradiol / norgestimate,Acne,9.0,"""Best treatment for acne I have used! I&#039;v..."
117,Ethinyl estradiol / levonorgestrel,Birth Control,9.0,"""I love Lutera. I am very sensitive to other f..."
127,Ethinyl estradiol / levonorgestrel,Birth Control,10.0,"""I&#039;ve been on Jolessa for 6 months. I de..."
154,Ethinyl estradiol / norgestimate,Birth Control,5.0,"""I wrote my expirence with this pill before, b..."
163,Ethinyl estradiol / norelgestromin,Birth Control,10.0,"""This is absolutely the best birth control I h..."
186,Ethinyl estradiol / norgestimate,Birth Control,3.0,"""I don&#039;t think I noticed this at first or..."
225,Ethinyl estradiol / norelgestromin,Birth Control,10.0,"""The first day I used it, I felt terribly sick..."
226,Ethinyl estradiol / norethindrone,Birth Control,1.0,"""This birthcontrol is terrible! I am 28 years ..."
239,Ethinyl estradiol / norethindrone,Birth Control,9.0,"""No side effects, very light periods. On my 3r..."


In [18]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub("(&#039;)", "'", text)
    text = re.sub("(&amp;)", "&", text)
    text = ' '.join(text.split())
    return text

drugs.review = drugs.review.apply(lambda x: text_cleaner(x))

In [19]:
print(drugs.review[533])

"Have been on the patch for 5 months now & it's just awful, I feel nauseous all the time & get headaches and the dizziness that comes along with it. My periods are heavy & have painful cramping, some in between bleeding"


### Balance the Data

In [20]:
drugs['positive'] = np.where(drugs['rating']>7,1,0)

In [21]:
drugs['positive'].value_counts()

1    96915
0    63483
Name: positive, dtype: int64

In [22]:
drugs.rating.value_counts()

10.0    50745
9.0     27379
1.0     21504
8.0     18791
7.0      9395
5.0      7959
2.0      6879
3.0      6465
6.0      6301
4.0      4980
Name: rating, dtype: int64

In [23]:
drugs.condition.value_counts().head(15)

Birth Control                29195
Depression                    9114
Pain                          6170
Anxiety                       5928
Acne                          5597
Bipolar Disorder              4255
Insomnia                      3685
Weight Loss                   3616
Obesity                       3584
ADHD                          3408
Diabetes, Type 2              2558
Emergency Contraception       2467
High Blood Pressure           2327
Vaginal Yeast Infection       2274
Abnormal Uterine Bleeding     2098
Name: condition, dtype: int64

In [24]:
drugs_s = drugs.sample(frac=1, random_state=40)

conditions = pd.DataFrame(drugs.condition.value_counts().head(15))

drugs = pd.DataFrame()
for i in conditions.index:
    drugs = drugs.append(drugs_s.loc[drugs_s.condition==i][:4980])

#drugs = pd.concat([positive, not_pos])
#drugs = drugs.sample(frac=1, random_state=40)

#compare = pd.DataFrame()
#compare['five'] = drugs.positive.loc[drugs.positive==1].describe()
#compare['not'] = drugs.positive.loc[drugs.positive==0].describe()
#compare

drugs.condition.value_counts()

Pain                         4980
Anxiety                      4980
Acne                         4980
Depression                   4980
Birth Control                4980
Bipolar Disorder             4255
Insomnia                     3685
Weight Loss                  3616
Obesity                      3584
ADHD                         3408
Diabetes, Type 2             2558
Emergency Contraception      2467
High Blood Pressure          2327
Vaginal Yeast Infection      2274
Abnormal Uterine Bleeding    2098
Name: condition, dtype: int64

In [25]:
# how long it will take and hopefully speed some stuff up
from tqdm import tqdm
tqdm.pandas()

# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
# add parsed reviews into new column
drugs['review_parsed'] = drugs.review.progress_apply(nlp)

100%|██████████| 55172/55172 [17:33<00:00, 52.36it/s]


#### Create the Bag of Words

In [26]:
nlp = spacy.load('en')
from collections import Counter

# Utility function to create a list of the 200 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(300)]

common_words = pd.DataFrame()
nlp.max_length = 1500000

# loop to run NLP through all conditions / create bag of words
for i in conditions.index:
    doc = ' '.join(drugs.review.loc[drugs.condition==i][0:2500].astype(str))
    doc = nlp(doc)
    words = bag_of_words(doc)
    common_words = common_words.append(words)

common_words.head()

Unnamed: 0,0
0,-PRON-
1,period
2,be
3,not
4,month


In [27]:
common_list = set(common_words[0])

len(common_list)

1092

#### Create features from the common words

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=common_list)

In [29]:
drugs.head(2)

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,positive,review_parsed
152677,148521,Mirena,Birth Control,"""First one worked fine, no periods and no preg...",3.0,2017-05-10,8,0,"("", First, one, worked, fine, ,, no, periods, ..."
66136,73701,Ethinyl estradiol / norethindrone,Birth Control,"""I was on Microgestin Fe for 8 months. At firs...",3.0,2012-11-07,10,0,"("", I, was, on, Microgestin, Fe, for, 8, month..."


In [30]:
drugs = drugs.reset_index(drop=True)

df = pd.DataFrame()
df['text_sentence'] = drugs['review_parsed']
df['text_source'] = drugs['condition']
df['usefulCount'] = drugs['usefulCount']
df['rating'] = drugs['rating']
#String version of the spacy objects
df['str_sentence'] = [_.text for _ in df.text_sentence]

In [31]:
X = vectorizer.fit_transform(df['str_sentence'])

In [32]:
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [33]:
word_counts = pd.concat([df,df2], axis=1)
print(word_counts.shape)
word_counts.head()

(55172, 1097)


Unnamed: 0,text_sentence,text_source,usefulCount,rating,str_sentence,$,'s,-PRON-,.75,0.5,...,yes,yesterday,yi,yogurt,yr,zit,zoloft,zolpidem,zombie,zyprexa
0,"("", First, one, worked, fine, ,, no, periods, ...",Birth Control,8,3.0,"""First one worked fine, no periods and no preg...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"("", I, was, on, Microgestin, Fe, for, 8, month...",Birth Control,10,3.0,"""I was on Microgestin Fe for 8 months. At firs...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"("", A, pro, of, this, BC, is, that, it, helped...",Birth Control,3,8.0,"""A pro of this BC is that it helped clear my a...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"("", I, started, Norethindrone, back, in, Janua...",Birth Control,0,5.0,"""I started Norethindrone back in January, here...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"("", I, have, had, the, Mirena, for, 5, months,...",Birth Control,8,1.0,"""I have had the Mirena for 5 months. I HATE IT...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
word_counts.text_source.value_counts()

Pain                         4980
Anxiety                      4980
Acne                         4980
Depression                   4980
Birth Control                4980
Bipolar Disorder             4255
Insomnia                     3685
Weight Loss                  3616
Obesity                      3584
ADHD                         3408
Diabetes, Type 2             2558
Emergency Contraception      2467
High Blood Pressure          2327
Vaginal Yeast Infection      2274
Abnormal Uterine Bleeding    2098
Name: text_source, dtype: int64

In [25]:
drugs.head()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,positive,review_parsed
77903,139941,Climara Pro,Postmenopausal Symptoms,"""The patch works great - no more debilitating ...",8.0,2017-11-07,0,1,"("", The, patch, works, great, -, no, more, deb..."
102796,196782,Paroxetine,Anxiety,"""Here is my experience with 10+ years of takin...",1.0,2017-05-27,20,0,"("", Here, is, my, experience, with, 10, +, yea..."
38606,13204,Vitamin e,Dietary Supplementation,"""I am highly allergic to Vitamin E. I had a fu...",1.0,2011-02-07,3,0,"("", I, am, highly, allergic, to, Vitamin, E., ..."
65280,12969,Vardenafil,Erectile Dysfunction,"""Works very well. Using 20mg it usually starts...",9.0,2013-05-05,64,1,"("", Works, very, well, ., Using, 20, mg, it, u..."
14500,214206,Tioconazole,Vaginal Yeast Infection,"""Had mild symptoms that a yeast infection was ...",8.0,2016-02-21,1,1,"("", Had, mild, symptoms, that, a, yeast, infec..."


## Modeling

#### Set up extra features

In [36]:
# How many of each type of word is there?
word_counts['cnt_verbs'] = [sum([1 for x in j if x.pos_ == 'VERB']) for j in word_counts.text_sentence]
word_counts['cnt_adj'] = [sum([1 for x in j if x.pos_ == 'ADJ']) for j in word_counts.text_sentence]
word_counts['cnt_prop'] = [sum([1 for x in j if x.pos_ == 'PROPN']) for j in word_counts.text_sentence]
word_counts['cnt_punct'] = [sum([1 for x in j if x.pos_ == 'PUNCT']) for j in word_counts.text_sentence]
word_counts['cnt_adv'] = [sum([1 for x in j if x.pos_ == 'ADV']) for j in word_counts.text_sentence]
word_counts['cnt_nouns'] = [sum([1 for x in j if x.pos_ == 'NOUN']) for j in word_counts.text_sentence]
word_counts['crude_sentiment'] = [j.sentiment for j in word_counts.text_sentence]

In [70]:
word_counts.columns

Index(['text_sentence', 'text_source', 'usefulCount', 'rating', 'str_sentence',
       '$', ''s', '-PRON-', '.75', '0.5',
       ...
       'zolpidem', 'zombie', 'zyprexa', 'cnt_verbs', 'cnt_adj', 'cnt_prop',
       'cnt_punct', 'cnt_adv', 'cnt_nouns', 'crude_sentiment'],
      dtype='object', length=1104)

In [37]:
word_counts['text_source'] = word_counts['text_source'].astype('category')

#### Declare and Fit the models

In [41]:
X = word_counts.drop(['text_sentence','str_sentence', 'text_source'], 1)
y = word_counts['text_source']

In [42]:
# to split a training and test sample
from sklearn.model_selection import train_test_split, cross_val_score
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [47]:
from sklearn.metrics import roc_auc_score
from datetime import datetime

start_time = datetime.now()
from sklearn import tree

dec_tr = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=20,
    max_depth=60,
    random_state = 1337
)

dec_tr.fit(x_train, y_train)
# Set Up Predictor
ypred_dec_tr = dec_tr.predict(x_test)

print('\nTrain Percentage accuracy:')
print(dec_tr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(dec_tr.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(dec_tr, x_train, y_train, cv = 5))

#auc = roc_auc_score(y_test, ypred_dec_tr)
#print('\nArea Under Curve:')
#print('AUC: %.3f' % auc)

from sklearn.metrics import classification_report
print('\nClassification Report:\n')
print(classification_report(y_test, ypred_dec_tr))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.935982019865149

Test Percentage accuracy:
0.6169071268034511

Cross Validation:
[0.55555556 0.58019324 0.55570324 0.56642089 0.56614268]

Classification Report:

                           precision    recall  f1-score   support

                     ADHD       0.72      0.69      0.70       851
Abnormal Uterine Bleeding       0.64      0.65      0.64       534
                     Acne       0.72      0.75      0.73      1217
                  Anxiety       0.54      0.53      0.54      1270
         Bipolar Disorder       0.57      0.58      0.57      1088
            Birth Control       0.56      0.52      0.54      1231
               Depression       0.47      0.46      0.47      1238
         Diabetes, Type 2       0.62      0.61      0.61       644
  Emergency Contraception       0.82      0.82      0.82       637
      High Blood Pressure       0.61      0.57      0.59       539
                 Insomnia       0.59      0.62      0.60       960
  

In [75]:
start_time = datetime.now()
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier(n_estimators=50, max_depth=8)
rfc.fit(x_train, y_train)

ypred_rfc = rfc.predict(x_test)

print('\nTrain Percentage accuracy:')
print(rfc.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(rfc.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(rfc, x_train, y_train, cv = 5))

print('\nClassification Report:')
print(classification_report(y_test, ypred_rfc))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.6596742230170622

Test Percentage accuracy:
0.6500398752990647

Cross Validation:
[0.64359903 0.65483092 0.62960725 0.64885773 0.65151149]

Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.92      0.68      0.78       851
Abnormal Uterine Bleeding       0.97      0.05      0.10       539
                     Acne       0.81      0.93      0.87      1217
                  Anxiety       0.69      0.68      0.68      1270
         Bipolar Disorder       0.87      0.50      0.64      1088
            Birth Control       0.55      0.77      0.64      1231
               Depression       0.53      0.53      0.53      1238
         Diabetes, Type 2       0.89      0.46      0.61       644
  Emergency Contraception       0.96      0.83      0.89       637
      High Blood Pressure       0.74      0.59      0.65       535
                 Insomnia       0.69      0.72      0.70       960
  

In [78]:
# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

#GridSearchCV for random forest 
param_grid = {'n_estimators':[30,50,100,150,300,500,700,1000], 'max_depth':[4,6,8,10,15,20]}

# Start the grid search again
grid_DT = GridSearchCV(rfc, param_grid, cv=3, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of the grid search
# View the accuracy score
print('Best score for data:', grid_DT.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 18.9min finished


Best score for data: {'max_depth': 20, 'n_estimators': 1000}


In [79]:
# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

#GridSearchCV for random forest 
param_grid = {'n_estimators':[1000,1200,1500], 'max_depth':[20,25,30,50]}

# Start the grid search again
grid_DT = GridSearchCV(rfc, param_grid, cv=3, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of the grid search
# View the accuracy score
print('Best score for data:', grid_DT.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 45.6min finished


Best score for data: {'max_depth': 50, 'n_estimators': 1500}


In [80]:
# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

#GridSearchCV for random forest 
param_grid = {'n_estimators':[1500,1700,2000], 'max_depth':[50,75,100]}

# Start the grid search again
grid_DT = GridSearchCV(rfc, param_grid, cv=3, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of the grid search
# View the accuracy score
print('Best score for data:', grid_DT.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 777.0min finished


Best score for data: {'max_depth': 100, 'n_estimators': 2000}


In [81]:
# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

#GridSearchCV for random forest 
param_grid = {'n_estimators':[2000,2500,3000], 'max_depth':[100,200,500]}

# Start the grid search again
grid_DT = GridSearchCV(rfc, param_grid, cv=3, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of  the grid search
# View the accuracy score
print('Best score for data:', grid_DT.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 133.6min finished


Best score for data: {'max_depth': 100, 'n_estimators': 2000}


In [82]:
start_time = datetime.now()
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier(n_estimators=2000, max_depth=100)
rfc.fit(x_train, y_train)

ypred_rfc = rfc.predict(x_test)

print('\nTrain Percentage accuracy:')
print(rfc.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(rfc.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(rfc, x_train, y_train, cv = 5))

print('\nClassification Report:')
print(classification_report(y_test, ypred_rfc))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.9999758325680314

Test Percentage accuracy:
0.845138838541289

Cross Validation:
[0.82971014 0.82246377 0.81691843 0.82908256 0.82636034]

Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.93      0.90      0.92       851
Abnormal Uterine Bleeding       0.93      0.75      0.83       539
                     Acne       0.92      0.94      0.93      1217
                  Anxiety       0.82      0.80      0.81      1270
         Bipolar Disorder       0.86      0.82      0.84      1088
            Birth Control       0.81      0.88      0.84      1231
               Depression       0.72      0.75      0.74      1238
         Diabetes, Type 2       0.93      0.88      0.90       644
  Emergency Contraception       0.97      0.96      0.96       637
      High Blood Pressure       0.88      0.81      0.85       535
                 Insomnia       0.81      0.85      0.83       960
   

In [83]:
start_time = datetime.now()
from sklearn import ensemble

clf = ensemble.GradientBoostingClassifier(loss='deviance', max_depth=10 , n_estimators=50 )
clf.fit(x_train, y_train)

ypred_clf = clf.predict(x_test)

print('\nTrain Percentage accuracy:')
print(clf.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(clf.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(clf, x_train, y_train, cv = 5))

#print('\nArea Under Curve:')
#print('AUC: %.3f' % auc)

print('\nClassification Report:')
print(classification_report(y_test, ypred_clf))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.9528735076610759

Test Percentage accuracy:
0.8271587036902777

Cross Validation:
[0.81557971 0.80615942 0.80024169 0.81119304 0.81100363]

Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.93      0.86      0.89       851
Abnormal Uterine Bleeding       0.84      0.77      0.81       539
                     Acne       0.92      0.94      0.93      1217
                  Anxiety       0.80      0.78      0.79      1270
         Bipolar Disorder       0.70      0.80      0.75      1088
            Birth Control       0.83      0.82      0.82      1231
               Depression       0.72      0.76      0.74      1238
         Diabetes, Type 2       0.93      0.87      0.90       644
  Emergency Contraception       0.97      0.95      0.96       637
      High Blood Pressure       0.92      0.81      0.86       535
                 Insomnia       0.83      0.82      0.83       960
  

## Unsupervised

In [89]:
word_counts.columns

Index(['text_sentence', 'text_source', 'usefulCount', 'rating', 'str_sentence',
       '$', ''s', '-PRON-', '.75', '0.5',
       ...
       'zolpidem', 'zombie', 'zyprexa', 'cnt_verbs', 'cnt_adj', 'cnt_prop',
       'cnt_punct', 'cnt_adv', 'cnt_nouns', 'crude_sentiment'],
      dtype='object', length=1104)

In [103]:
X = word_counts.drop(['text_sentence', 'text_source', 'usefulCount', 'str_sentence'],axis=1)

In [104]:
X_norm = normalize(X)
# Create the two-feature PCA for graphing purposes.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_norm)

# Split the data into four equally-sized samples. First we break it in half:
X_half1, X_half2, X_pcahalf1, X_pcahalf2 = train_test_split(
    X_norm,
    X_pca,
    test_size=0.5,
    random_state=42)

# Then we halve the halves.
X1, X2, X_pca1, X_pca2 = train_test_split(
    X_half1,
    X_pcahalf1,
    test_size=0.5,
    random_state=42)

X3, X4, X_pca3, X_pca4 = train_test_split(
    X_half2,
    X_pcahalf2,
    test_size=0.5,
    random_state=42)

# Check the length of each set
print(len(X1), len(X_pca1))
print(len(X2), len(X_pca2))
print(len(X3), len(X_pca3))
print(len(X4), len(X_pca4))

13793 13793
13793 13793
13793 13793
13793 13793


## K Means

In [107]:
# Initialize data frames
ypred_df = pd.DataFrame()
scores_df = pd.DataFrame(columns=['cluster_pred','sil_score'])

for counter, data in enumerate([
    (X1, X_pca1),
    (X2, X_pca2),
    (X3, X_pca3),
    (X4, X_pca4)]):
    
    # Put the features into ypred.
    ypred_df['pca_f1' + '_sample' + str(counter)] = data[1][:, 0]
    ypred_df['pca_f2' + '_sample' + str(counter)] = data[1][:, 1]

    
    # Creating a list of possible number of clusters to test in kmeans.
    for n_cluster in range(2, 10): 
        # Instantiating and fit_predicting model to then add to data frame
        kmeans = KMeans(n_clusters=n_cluster, random_state=42)
        pred = kmeans.fit_predict(data[0])
        ypred_df['cluster ' + str(n_cluster) + '_sample ' + str(counter)] = pred
        # Calculating silhouette scores for the data and adding that to the shilouette score
        labels = kmeans.labels_
        sscore = metrics.silhouette_score(data[0], labels, metric='euclidean')
        scores_df = scores_df.append({'cluster_pred':'cluster ' + str(n_cluster) + '_sample ' + str(counter), 
                              'sil_score':sscore}, ignore_index=True)

In [108]:
# Sorting sihoilette scores
scores_df.sort_values(by='sil_score', ascending=False)

Unnamed: 0,cluster_pred,sil_score
24,cluster 2_sample 3,0.352351
8,cluster 2_sample 1,0.350675
0,cluster 2_sample 0,0.348169
16,cluster 2_sample 2,0.34326
17,cluster 3_sample 2,0.118702
25,cluster 3_sample 3,0.117703
1,cluster 3_sample 0,0.117622
9,cluster 3_sample 1,0.11623
18,cluster 4_sample 2,0.111788
10,cluster 4_sample 1,0.109754


Silhouette scores here are disappointingly low. It might be the large amount of features not explaining enough of variance in the data. It could also be the preprocessing of the data not working efficiently enough

## Mean Shift

In [109]:
from sklearn import metrics
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics import pairwise_distances

ypred_ms_df = pd.DataFrame()
score_ms_df = pd.DataFrame(columns=['cluster_pred','sil_score', 'quantile'])

# Keep track of counts of the models and use data from the different folds
for counter, data in enumerate([X1, X2, X3, X4]):
    # Creating a list of possible quantiles to test in mean shift.
    for n in [0.1, 0.2, 0.3]:
        # Estimating number of clusters for data
        bandwidth = estimate_bandwidth(data, quantile=n, n_samples=500)
        
        # Instantiating and fit_predicting model to then add to data frame
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        pred = ms.fit_predict(data)
        labels = ms.labels_
        cntrs = len(np.unique(labels))
        ypred_ms_df['cluster ' + str(cntrs) + '_sample ' + str(counter)] = pred
        # Calculating silhouette scores for the data and adding that to the shilouette score
        sscore = metrics.silhouette_score(data, labels, metric='euclidean')
        score_ms_df = score_ms_df.append({'cluster_pred':'cluster ' + str(cntrs) + '_sample ' + str(counter), 
                              'sil_score':sscore, 'quantile':n}, ignore_index=True)


In [110]:
score_ms_df.sort_values(by='sil_score', ascending=False)

Unnamed: 0,cluster_pred,sil_score,quantile
2,cluster 9_sample 0,0.302578,0.3
1,cluster 10_sample 0,0.282475,0.2
8,cluster 13_sample 2,0.251911,0.3
11,cluster 6_sample 3,0.245346,0.3
5,cluster 16_sample 1,0.223858,0.3
7,cluster 21_sample 2,0.220973,0.2
10,cluster 12_sample 3,0.219041,0.2
4,cluster 23_sample 1,0.189063,0.2
6,cluster 26_sample 2,0.1777,0.1
0,cluster 30_sample 0,0.166414,0.1


## Spectral Clustering

In [111]:
from sklearn.cluster import SpectralClustering
# Initialize data frames
ypred_sc = pd.DataFrame()
score_sc = pd.DataFrame(columns=['cluster_pred','silhouette_score'])

# Keep track of counts of the models and use data from the different folds
for counter, data in enumerate([
    (X1, X_pca1),
    (X2, X_pca2),
    (X3, X_pca3),
    (X4, X_pca4)]):
    
    # Put the features into ypred.
    ypred_sc['pca_f1' + '_sample' + str(counter)] = data[1][:, 0]
    ypred_sc['pca_f2' + '_sample' + str(counter)] = data[1][:, 1]
    
    # Creating a list of possible number of clusters to test in kmeans.
    for nclust in range(2, 6):
        # Instantiating and fit_predicting model to then add to data frame
        sc = SpectralClustering(n_clusters=nclust)
        pred = sc.fit_predict(data[0])
        ypred_sc['clust' + str(nclust) + '_sample' + str(counter)] = pred
        # Calculating silhouette scores for the data and adding that to the shilouette score
        labels = sc.labels_
        sscore_sc = metrics.silhouette_score(data[0], labels, metric='euclidean')
        score_sc = score_sc.append({'cluster_pred':'clust' + str(nclust) + '_sample' + str(counter), 
                              'silhouette_score':sscore_sc}, ignore_index=True)

In [112]:
score_sc.sort_values(by='silhouette_score', ascending=False)

Unnamed: 0,cluster_pred,silhouette_score
4,clust2_sample1,0.406995
12,clust2_sample3,0.405195
8,clust2_sample2,0.401914
0,clust2_sample0,0.400405
1,clust3_sample0,0.118673
9,clust3_sample2,0.117901
5,clust3_sample1,0.11739
13,clust3_sample3,0.117285
6,clust4_sample1,0.095074
14,clust4_sample3,0.094831


In [None]:
# Calculate predicted values.
y_pred = SpectralClustering(n_clusters=2).fit_predict(X_norm)

In [None]:
#Creating the dataframe for the cluster
df_clust = pd.DataFrame(y_pred)
df_clust.columns = ['Cluster']

In [None]:
# Combining the cluster to X values
df_comb = X.join(df_clust, how='inner')
df_comb.head()

## TF-IDF

In [38]:
word_counts.head()

Unnamed: 0,text_sentence,text_source,usefulCount,rating,str_sentence,$,'s,-PRON-,.75,0.5,...,zolpidem,zombie,zyprexa,cnt_verbs,cnt_adj,cnt_prop,cnt_punct,cnt_adv,cnt_nouns,crude_sentiment
0,"("", First, one, worked, fine, ,, no, periods, ...",Birth Control,8,3.0,"""First one worked fine, no periods and no preg...",0,0,0,0,0,...,0,0,0,16,6,1,14,7,12,0.0
1,"("", I, was, on, Microgestin, Fe, for, 8, month...",Birth Control,10,3.0,"""I was on Microgestin Fe for 8 months. At firs...",0,0,0,0,0,...,0,0,0,15,3,2,9,10,13,0.0
2,"("", A, pro, of, this, BC, is, that, it, helped...",Birth Control,3,8.0,"""A pro of this BC is that it helped clear my a...",0,0,0,0,0,...,0,0,0,19,17,3,13,9,25,0.0
3,"("", I, started, Norethindrone, back, in, Janua...",Birth Control,0,5.0,"""I started Norethindrone back in January, here...",0,0,0,0,0,...,0,0,0,29,6,6,27,16,18,0.0
4,"("", I, have, had, the, Mirena, for, 5, months,...",Birth Control,8,1.0,"""I have had the Mirena for 5 months. I HATE IT...",0,0,0,0,0,...,0,0,0,20,10,1,19,7,25,0.0


In [48]:
tf_df = pd.DataFrame()
tf_df['str_sentence'] = [i.text for i in word_counts.text_sentence]
tf_df['text_source'] = word_counts.text_source
tf_df['usefulCount'] = word_counts.usefulCount
tf_df['rating'] = word_counts.rating

tf_df.head()

Unnamed: 0,str_sentence,text_source,usefulCount,rating
0,"""First one worked fine, no periods and no preg...",Birth Control,8,3.0
1,"""I was on Microgestin Fe for 8 months. At firs...",Birth Control,10,3.0
2,"""A pro of this BC is that it helped clear my a...",Birth Control,3,8.0
3,"""I started Norethindrone back in January, here...",Birth Control,0,5.0
4,"""I have had the Mirena for 5 months. I HATE IT...",Birth Control,8,1.0


In [80]:
X = tf_df['str_sentence']
y = tf_df['text_source']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=.75, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
speeches_tfidf=vectorizer.fit_transform(tf_df['str_sentence'])
print("Number of features: %d" % speeches_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf= train_test_split(speeches_tfidf, y, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]

#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

#List of features
terms = vectorizer.get_feature_names()

#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

# Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present 
# once in that sentence.
print('Original sentence:', X_train)
print('Tf_idf vector:', tfidf_bypara[5])

Number of features: 20220
Original sentence: 1070     "I've been taking this bc for a few years now....
36216    "I started phentermine on 2/10/2015... weighed...
3746     "Dr suggested for heavy periods and almost ana...
38654    "I am just about at the one month mark using S...
47383    "Hey everyone, unlike everyone else on here, I...
44497    "I have been on Trulicity now for a month, and...
45753    "My doctor put me on Jardiance and I had an al...
34800    "Lost 9 lbs the first week but I am very irrit...
31213    "I've used ambien for a couple months, probabl...
44634    "It's this simple... Invokana was great at red...
16873    "Avoid Benzos. I'm an alcoholic now, thanks to...
49495    "Coughing and runny nose driving me crazy. I'm...
47482    "Well of course the condom broke inside of me....
48853    "I tried some others before getting this one a...
49785    "Vasotec controlled my blood pressure but the ...
7260     "I know many women taking this had good result...
18148    "S

In [81]:
y_train

1070                 Birth Control
36216                  Weight Loss
3746                 Birth Control
38654                      Obesity
47383      Emergency Contraception
44497             Diabetes, Type 2
45753             Diabetes, Type 2
34800                  Weight Loss
31213                     Insomnia
44634             Diabetes, Type 2
16873                      Anxiety
49495          High Blood Pressure
47482      Emergency Contraception
48853          High Blood Pressure
49785          High Blood Pressure
7260                    Depression
18148                      Anxiety
27050             Bipolar Disorder
53826    Abnormal Uterine Bleeding
3882                 Birth Control
36017                  Weight Loss
34611                  Weight Loss
25483             Bipolar Disorder
27735             Bipolar Disorder
17634                      Anxiety
47737      Emergency Contraception
2417                 Birth Control
2887                 Birth Control
25692             Bi

In [82]:
X_train

1070     "I've been taking this bc for a few years now....
36216    "I started phentermine on 2/10/2015... weighed...
3746     "Dr suggested for heavy periods and almost ana...
38654    "I am just about at the one month mark using S...
47383    "Hey everyone, unlike everyone else on here, I...
44497    "I have been on Trulicity now for a month, and...
45753    "My doctor put me on Jardiance and I had an al...
34800    "Lost 9 lbs the first week but I am very irrit...
31213    "I've used ambien for a couple months, probabl...
44634    "It's this simple... Invokana was great at red...
16873    "Avoid Benzos. I'm an alcoholic now, thanks to...
49495    "Coughing and runny nose driving me crazy. I'm...
47482    "Well of course the condom broke inside of me....
48853    "I tried some others before getting this one a...
49785    "Vasotec controlled my blood pressure but the ...
7260     "I know many women taking this had good result...
18148    "Since I was put on 10mg =Valium 3x a day it h.

In [83]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#X_train = X_train.reset_index()
#y_train = y_train.reset_index()

abs_by_component=pd.DataFrame(X_train_lsa)
abs_by_component['str_sentence'] = X_train
abs_by_component['text_source'] = y_train

abs_by_component.head()
#for i in range(5):
#    print('Component {}:'.format(i))
#    print(abs_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 22.935606683193438


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,122,123,124,125,126,127,128,129,str_sentence,text_source
0,0.485838,0.24278,-0.081554,0.158113,0.006388,-0.053998,0.006522,-0.027098,-0.059111,-0.006935,...,-0.064232,0.003035,0.144092,0.014379,-0.0106,-0.026813,0.05334,-0.046622,,
1,0.22802,0.037843,-0.120486,-0.305882,0.231735,0.085303,0.005628,-0.025811,-0.078615,-0.046621,...,0.016521,-0.057428,0.079453,-0.02209,-0.020547,0.09788,-0.001871,0.02324,,
2,0.294598,0.283966,-0.101724,0.053427,0.054746,-0.19259,0.046771,-0.242145,0.140978,0.158333,...,0.002947,-0.081501,-0.035217,0.082399,0.007594,0.095018,0.032486,0.003985,"""A pro of this BC is that it helped clear my a...",Birth Control
3,0.439202,-0.137371,-0.101727,-0.221112,0.104614,0.216871,0.019983,-0.003614,-0.023045,-0.030498,...,0.002282,-0.022671,-0.005625,0.005452,-0.053984,0.076172,0.028266,0.000401,,
4,0.304555,0.337294,-0.244157,0.248174,-0.162094,0.099875,-0.117023,0.215177,-0.01009,-0.076966,...,-0.135408,0.066197,-0.114636,-0.053537,-0.067428,-0.037156,0.009484,-0.102314,"""I have had the Mirena for 5 months. I HATE IT...",Birth Control


In [86]:
nullvals = pd.DataFrame(abs_by_component.isna().sum())
nullvals

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [88]:
abs_by_component.shape

(33103, 132)

In [75]:
y_train

1070                 Birth Control
36216                  Weight Loss
3746                 Birth Control
38654                      Obesity
47383      Emergency Contraception
44497             Diabetes, Type 2
45753             Diabetes, Type 2
34800                  Weight Loss
31213                     Insomnia
44634             Diabetes, Type 2
16873                      Anxiety
49495          High Blood Pressure
47482      Emergency Contraception
48853          High Blood Pressure
49785          High Blood Pressure
7260                    Depression
18148                      Anxiety
27050             Bipolar Disorder
53826    Abnormal Uterine Bleeding
3882                 Birth Control
36017                  Weight Loss
34611                  Weight Loss
25483             Bipolar Disorder
27735             Bipolar Disorder
17634                      Anxiety
47737      Emergency Contraception
2417                 Birth Control
2887                 Birth Control
25692             Bi

In [76]:
abs_by_component['text_source'] = y_train
#abs_by_component = abs_by_component.reset_index()

abs_by_component.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,121,122,123,124,125,126,127,128,129,text_source
str_sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""I've been taking this bc for a few years now. Other pills made me really sick so I was excited to find a pill that worked. My boobs got ginormous (one more than the other haha) but that goes away after a while. When I started I would always get my period in the middle of the pack... I tried stopping the pills and restarting to get the period at the right time but nope it stayed in the middle. I do have to say it was really really light and maybe two days max. No cramping whatsoever. I was such a happy camper bc without bc I'd feel like I was gonna pass out from the pain. So after a year or so my periods completely stopped. That was alarming at first and I kept taking tests too make sure I wasn't pregnant. I don't worry any more.""",0.492877,0.246296,-0.08273,0.16041,0.006524,-0.054771,0.006702,-0.027496,-0.060258,-0.007057,...,-0.032664,0.058732,-0.108265,0.017336,0.04094,-0.017114,0.007623,-0.097376,0.170308,
"""I started phentermine on 2/10/2015... weighed in at a whopping 282 pounds...first week and I've dropped 9 pounds.... I love it only 93 more pounds to go to reach my goal. will post again in two weeks!""",0.229938,0.038161,-0.121509,-0.308463,0.233681,0.086017,0.005574,-0.025828,-0.079311,-0.047297,...,-0.076578,0.068034,0.037008,0.009919,0.061315,-0.016194,0.00117,-0.024848,-0.081295,
"""Dr suggested for heavy periods and almost anaemic. One month no bleed but then the bleed started and has now been two months of on/off light to heavy with minor to major clotting. Had ultrasound and have two fibroids. Tummy upset, cramping, fatigue and loads of money spent on liners, tampons and pads. As I took this for heavy bleeding and low iron levels, it has done nothing but cause trouble. Seeing a Gyno today for info on surgery as a permanent solution... but then I am 47 and blessed with children. BIG RISKS to consider if you are deciding on this horrible shot!""",0.285805,0.275487,-0.098693,0.051836,0.053118,-0.186823,0.045291,-0.234814,0.136804,0.153531,...,-0.15177,-0.088656,-0.05543,-0.040554,-0.031019,0.06149,-0.055299,0.017934,-0.219887,
"""I am just about at the one month mark using Saxenda, started April 1. My start weight was 243 and I currently am 229.5, a loss of 13.5 lbs. I am happy to report no major side effects with the exception of very minor constipation and slight headache once in a blue moon! I started the 3 mg. dose last night and feel great. I have a protein shake for breakfast and usually take a little bit of grilled vegetables to work for lunch that my wife makes in the QVC air fryer. I have a piece of fish with some more veggies for dinner. This drug totally wipes out your appetite! You crave nothing. I also mix in 15 mins. of walking each morning on the treadmill and 1 hour at night at 3 mph. You must exercise because the drug is not magic. A definite 10.""",0.432189,-0.13518,-0.100116,-0.217597,0.102935,0.213442,0.019646,-0.003442,-0.022791,-0.030155,...,0.04165,-0.048737,-0.007884,0.011005,-0.079052,-0.008347,0.005311,-0.070273,0.063459,
"""Hey everyone, unlike everyone else on here, I'm a guy and I'd like to tell you ladies to RELAX. My girlfriend and I had sex while she was ovulating and the condom broke and I came completely inside her. We got plan B about 45 minutes later, and obviously we were both freaking out because we're still youngish. I felt like the odds were against us, because we both had teen parents so I was guessing we were both pretty fertile. But, she got her period today (6 days later). It was lighter than normal and early but I know those are both symptoms of plan B. This pill is 95% effective so stop worrying, you aren't pregnant :)""",0.306523,0.33947,-0.245724,0.249784,-0.163142,0.100546,-0.117827,0.216369,-0.010122,-0.077185,...,0.047579,0.143419,-0.087656,-0.070312,0.001406,0.107697,0.04098,-0.04216,0.073387,


In [60]:
abs_by_component

In [None]:
# Compute document similarity using LSA components
similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)
#Only taking the first 10 sentences
sim_matrix=pd.DataFrame(similarity,index=X_train).iloc[0:10,0:10]
#Making a plot
ax = sns.heatmap(sim_matrix,yticklabels=range(10))
plt.show()

#Generating a key for the plot.
print('Key:')
for i in range(10):
    print(i,sim_matrix.index[i])

In [45]:
start_time = datetime.now()
from sklearn import ensemble

clf = ensemble.GradientBoostingClassifier(loss='deviance', max_depth=10 , n_estimators=50 )
clf.fit(x_train, y_train)

ypred_clf = clf.predict(x_test)

print('\nTrain Percentage accuracy:')
print(clf.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(clf.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(clf, x_train, y_train, cv = 5))

#print('\nArea Under Curve:')
#print('AUC: %.3f' % auc)

print('\nClassification Report:')
print(classification_report(y_test, ypred_clf))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

KeyboardInterrupt: 

In [104]:
# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

# Set the params
param_grid = {'n_estimators':[50,100,150,200], 'max_depth':[4,6,8,10]}

# Start the grid search again
grid_DT = GridSearchCV(clf, param_grid, cv=2, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of the grid search
# View the accuracy score
print('Best score for data:', grid_DT.best_params_)

Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 89.6min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed: 184.2min finished


Best score for data: {'loss': 'exponential', 'max_depth': 10, 'n_estimators': 200}


In [105]:
start_time = datetime.now()
from sklearn import ensemble

clf = ensemble.GradientBoostingClassifier(loss='exponential', max_depth=10 , n_estimators=200 )
clf.fit(x_train, y_train)

ypred_clf = clf.predict(x_test)

print('\nTrain Percentage accuracy:')
print(clf.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(clf.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(clf, x_train, y_train, cv = 5))

print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

print('\nClassification Report:')
print(classification_report(y_test, ypred_clf))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))


Train Percentage accuracy:
0.8784445097874486

Test Percentage accuracy:
0.7879780732152983

Cross Validation:
[0.77642426 0.77757942 0.78025729 0.7818325  0.77830288]

Area Under Curve:
AUC: 0.740

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.80      0.79     16048
           1       0.79      0.78      0.78     15694

   micro avg       0.79      0.79      0.79     31742
   macro avg       0.79      0.79      0.79     31742
weighted avg       0.79      0.79      0.79     31742


Duration: 2:50:44.450230


In [109]:
from sklearn.metrics import roc_auc_score

# How long will it take
from datetime import datetime
start_time = datetime.now()

# get rid of the warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1e9, max_iter = 150)
lr.fit(x_train, y_train)

# Set Up Predictor
ypred_lr = lr.predict(x_test)

print('Train Percentage accuracy:')
print(lr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lr.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(lr, x_train, y_train, cv = 5))

auc = roc_auc_score(y_test, ypred_lr)
print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

from sklearn.metrics import classification_report
print('\nClassification Report:\n')
print(classification_report(y_test, ypred_lr))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

KeyboardInterrupt: 

In [96]:
start_time = datetime.now()
# get rid of the warnings
import warnings
warnings.filterwarnings("ignore")

# Grid Search CV for decision tree
from sklearn.model_selection import GridSearchCV

#GridSearchCV for random forest 
param_grid = {'C':[1e9,.1,.5,1,3], 'max_iter':[25,50,100,200,300], 'penalty':['l1','l2']}

# Start the grid search again
grid_DT = GridSearchCV(lr, param_grid, cv=3, verbose=1, n_jobs=-1)

grid_DT.fit(x_train, y_train)

# summarize the results of the grid search
# View the accuracy score
print('Best score for data:')
print(grid_DT.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.5min finished


Best score for data:
{'C': 0.5, 'max_iter': 100, 'penalty': 'l1'}


In [97]:
from sklearn.metrics import roc_auc_score
from datetime import datetime

# How long will it take
start_time = datetime.now()

# get rid of the warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=.5, max_iter=100, penalty='l1')
lr.fit(x_train, y_train)

# Set Up Predictor
ypred_lr = lr.predict(x_test)

print('Train Percentage accuracy:')
print(lr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lr.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(lr, x_train, y_train, cv = 5))

auc = roc_auc_score(y_test, ypred_lr)
print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

from sklearn.metrics import classification_report
print('\nClassification Report:\n')
print(classification_report(y_test, ypred_lr))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

Train Percentage accuracy:
0.7357598924640847

Test Percentage accuracy:
0.7399974796799194

Cross Validation:
[0.7309005  0.734576   0.73074298 0.73804148 0.72978366]

Area Under Curve:
AUC: 0.740

Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.74      0.74     16048
           1       0.74      0.74      0.74     15694

   micro avg       0.74      0.74      0.74     31742
   macro avg       0.74      0.74      0.74     31742
weighted avg       0.74      0.74      0.74     31742


Duration: 0:00:09.517911


### KNN

In [111]:
start_time = datetime.now()

from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=30)
knn.fit(x_train, y_train)
# Set Up Predictor
ypred_knn = knn.predict(x_test)

print('\nTrain Percentage accuracy:')
print(knn.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(knn.score(x_test, y_test))

print('\nCross Validation:')
print(cross_val_score(knn, x_train, y_train, cv = 5))

print('\nArea Under Curve:')
print('AUC: %.3f' % auc)

print('\nClassification Report:\n')
print(classification_report(y_test, ypred_knn))

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

KeyboardInterrupt: 

____