In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
df = pd.read_csv('./drug_reviews_cleaned.csv')
df.head(2)

Unnamed: 0,User_ID,drugName,condition,review,rating,date,usefulCount,ratings_simplified
0,164952,Phentermine / topiramate,Weight Loss,"""I have been on the Qysmia for 3 weeks now. I...",8.0,"December 25, 2015",38,3
1,145900,Qsymia,Weight Loss,"""My Dr agreed to over see putting me on Qsymia...",9.0,"February 24, 2013",46,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3609 entries, 0 to 3608
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   User_ID             3609 non-null   int64  
 1   drugName            3609 non-null   object 
 2   condition           3609 non-null   object 
 3   review              3609 non-null   object 
 4   rating              3609 non-null   float64
 5   date                3609 non-null   object 
 6   usefulCount         3609 non-null   int64  
 7   ratings_simplified  3609 non-null   int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 225.7+ KB


## Basic Model

In [5]:
X = df['review']
y = df['ratings_simplified']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, stratify=y)

In [7]:
y_train.value_counts(normalize=True)

4    0.638581
3    0.159645
1    0.115299
2    0.086475
Name: ratings_simplified, dtype: float64

Our models will have to beat about 64% to beat a null model (given stratify=y, that's how well predicting 4 every time would do in terms of the accuracy score).

In [8]:
pipe_nb = make_pipeline(CountVectorizer(), MultinomialNB())
pipe_nb.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [9]:
pipe_nb.score(X_test, y_test)

0.743078626799557

## Let's GridSearch for the best hyperparameters

In [10]:
grid = {
    "countvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "countvectorizer__lowercase": [True, False]
}

In [12]:
gs = GridSearchCV(pipe_nb, grid)
gs.fit(X_train, y_train)
gs.score(X_test, y_test)

0.7796234772978959

In [13]:
gs.best_params_

{'countvectorizer__lowercase': False, 'countvectorizer__ngram_range': (1, 3)}

## Make a model with TFIDF

In [14]:
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [15]:
pipe.score(X_test, y_test)

0.6400885935769657

That didn't work well, let's abandon it for now.

## Let's Lemmatize

In [23]:
# install if needed
# %pip install  -U nltk
# install data if needed - opens interactive installer see http://www.nltk.org/data.html
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [26]:
from nltk.stem import WordNetLemmatizer

In [27]:
def split_into_lemmas(text):
    '''return lowercased, lemmatizeed list of words as a string from a document passed in '''
   
    text = text.lower()
    lemmer = WordNetLemmatizer()
    return ' '.join([lemmer.lemmatize(word) for word in text.split() ])

In [28]:
pipe = make_pipeline(CountVectorizer(preprocessor=split_into_lemmas), MultinomialNB())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7353266888150609

Looks promising. Let's gridsearch

In [29]:
grid = {
    "countvectorizer__ngram_range": [ (1, 3)],
    "countvectorizer__lowercase": [True, False],
    "multinomialnb__alpha": [.1, .5, 1, 10]
}

In [30]:
gs = GridSearchCV(pipe, grid)
gs.fit(X_train, y_train)
gs.score(X_test, y_test)

0.805094130675526

In [31]:
gs.best_params_

{'countvectorizer__lowercase': True,
 'countvectorizer__ngram_range': (1, 3),
 'multinomialnb__alpha': 0.1}

In [32]:
gs.best_estimator_

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(ngram_range=(1, 3),
                                 preprocessor=<function split_into_lemmas at 0x1295cbdc0>)),
                ('multinomialnb', MultinomialNB(alpha=0.1))])

Note that lemmatization is helping.

## Let's try with LogisticRegression

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
pipe_lgr = make_pipeline(CountVectorizer(), MultinomialNB())
pipe_lgr.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [35]:
pipe_lgr.score(X_test, y_test)

0.743078626799557

### Let's try with BoostingClassifier

In [36]:
from sklearn.ensemble import GradientBoostingClassifier

In [37]:
pipe_gb = make_pipeline(CountVectorizer(), GradientBoostingClassifier()) # random forest
pipe_gb.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('gradientboostingclassifier', GradientBoostingClassifier())])

In [38]:
pipe_gb.score(X_test, y_test)

0.7043189368770764

### Let's try a VotingClassifier 
with BoostingClassifier, LogisticRegression, and MultinomialNB

In [39]:
from sklearn.ensemble import VotingClassifier

In [40]:
vc = VotingClassifier([
    ('pgb', pipe_gb),
    ('plgr', pipe_lgr),
    ('pnb', gs)
])

In [41]:
vc.fit(X_train, y_train)
vc.score(X_test, y_test)

0.7585825027685493

## Let's add the other two columns

In [42]:
X_2 = df[['review', 'drugName', 'condition']]
y_2 = df['ratings_simplified']

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, random_state=123, stratify=y)

In [44]:
X_2.head()

Unnamed: 0,review,drugName,condition
0,"""I have been on the Qysmia for 3 weeks now. I...",Phentermine / topiramate,Weight Loss
1,"""My Dr agreed to over see putting me on Qsymia...",Qsymia,Weight Loss
2,"""I just started Adipex-P 37.5mg on March 6th, ...",Adipex-P,Weight Loss
3,"""Began taking Qsymia 3.75 in June 2013. Its on...",Qsymia,Weight Loss
4,"""Made me jittery and could not sleep. Lost wei...",Qsymia,Weight Loss


In [45]:
X_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3609 entries, 0 to 3608
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     3609 non-null   object
 1   drugName   3609 non-null   object
 2   condition  3609 non-null   object
dtypes: object(3)
memory usage: 84.7+ KB


In [46]:
X_2['condition'].value_counts()

Weight Loss    3609
Name: condition, dtype: int64

Condition has no variance. so forget about it.

In [47]:
X_2['drugName'].value_counts()

Phentermine                 1211
Lorcaserin                   387
Belviq                       381
Bupropion / naltrexone       267
Contrave                     263
Adipex-P                     255
Phentermine / topiramate     249
Qsymia                       207
Liraglutide                  166
Saxenda                       95
Victoza                       63
Fastin                        35
Megestrol                     11
Ionamin                        7
Belviq XR                      3
Megace ES                      3
Phentercot                     2
T-Diet                         1
Phentride                      1
Megace                         1
Lomaira                        1
Name: drugName, dtype: int64

In [48]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

In [49]:
ct = make_column_transformer(
    (CountVectorizer(preprocessor=split_into_lemmas, ngram_range = (1,1)), 'review'),
    (OneHotEncoder(handle_unknown='ignore', sparse=False), ['drugName'])
)

In [50]:
pipe = make_pipeline(ct, MultinomialNB())

In [51]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('countvectorizer',
                                                  CountVectorizer(preprocessor=<function split_into_lemmas at 0x1295cbdc0>),
                                                  'review'),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['drugName'])])),
                ('multinomialnb', MultinomialNB())])

In [52]:
pipe.score(X_test, y_test)

0.7419712070874862

No benefit from the `drugName` column.

## Next Steps

- GridSearch more over LogisticRegression and GradientBoostingClassifier - or maybe use RandomizedSearchCV for faster iteration. 
- Add KNN and other models, potentially. 
- Add a stermmer maybe.
- Could look at metrics beyond accuracy. 
- Could oversample the minority class

## Summary

Lemmatization with a Naive Bayes classifier with these params did best on accuracy:

  ```
  'countvectorizer__ngram_range': (1, 3),
  'multinomialnb__alpha': 0.1
  ```
    
- scored about 80% on the test set.    