# UCI Medicine Review Analysis


1. The goal of this project is a hands-on analysis of the UCI ML Drug Review dataset, available through Kaggle. In particular, I want to address the following questions (partly because they resemble a problem I am currently working on, for which I lack proper data to learn from):
* What insights can we gain from exploring and visualizing our data?
* How does sentiment play into rating and usefulness of reviews?
* Can we create a way for people to find the best medication for their illness?
* What machine learning models work best for predicting the sentiment or rating based on review?
* Is this problem better suited for classification or regression? In other words, should we be trying to sort the reviews into categories based on sentiment or predict the actual rating of the review?
* What vectorization methods for the reviews are the most efficient and preserve the most data as well as allowing for the most accuracy? 
* Can we somehow find insight into what features or words are most important for predicting review rating?

## 1. Getting Started: Basic EDA

1. Read in the data sets
2. What are the columns, dimensions, and missing data?
3. Why was the data pre-split into test and train sets? Are there any major differences between them?

In [None]:
#import libraries
import numpy as np
import pandas as pd
#for NLP:
import spacy
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")
# Word tokenization #lemma
from spacy.lang.en import English,stop_words 
#word viz
from spacy import displacy 
import string
#for the Pipiline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
from gensim.models import Word2Vec

from matplotlib import pyplot as plt # viz
%matplotlib inline

#to display entire text 
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the pre-split train and test CSVs.
# The "date" column is parsed into datetimes while reading.
train, test = (
    pd.read_csv(csv_path, parse_dates=["date"])
    for csv_path in (
        '../input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv',
        '../input/kuc-hackathon-winter-2018/drugsComTest_raw.csv',
    )
)

In [None]:
train.head()

In [None]:
#shape of train data, columns
print(train.shape)
train.columns

In [None]:
#shape of test data, columns
print(test.shape)
test.columns

In [None]:
#train and test share same columns
list(train) == list(test)

In [None]:
# Fraction of missing values per column, shown as a (train, test) pair
train.isnull().mean(), test.isnull().mean()

# Only about 0.5% of rows in both sets are missing, all in the condition feature.
# If the drug appears elsewhere in the data, the condition might be extrapolated;
# otherwise, with such a small share, the rows can be excluded.

In [None]:
# Which drugs have reviews with a missing condition, and can the gap be filled
# from other reviews of the same drug?
missing_cond = train.condition.isnull()
drugNoCond = train[missing_cond].drugName.unique()
# Drugs that have at least one review where the condition IS recorded
drugs_with_cond = set(train.loc[~missing_cond, 'drugName'])
print('Number of drugs with no condition description: ',len(drugNoCond))
# Subtracting the set of ALL train drugs (as before) always gave 0; the meaningful
# comparison is against the drugs that have a known condition somewhere.
print('Number of drugs that have no records with conditions at all:',
len(set(drugNoCond) - drugs_with_cond))

In [None]:
#are there drugs with multiple conditions?
train.groupby('drugName').condition.nunique().mean()

In [None]:
# Distinct (drug, condition) pairs seen in the train set
drugCond = train[['drugName','condition']].drop_duplicates()
# For the drugs that have at least one review without a condition:
# the average number of known conditions recorded per drug
affected_pairs = drugCond.loc[drugCond.drugName.isin(drugNoCond)]
affected_pairs.groupby('drugName')['condition'].count().mean()

# conclusion: the missing condition cannot be reliably replaced

In [None]:
#3
# Compare drug coverage between the pre-made train and test splits
train_drugs = set(train.drugName.unique())
test_drugs = set(test.drugName.unique())

#what drugs are available in each set
print('Drugs in the train set: {}, test set: {}'.format(train.drugName.nunique(),test.drugName.nunique()))
#how many reviews per drug on average
print('average reviews per drug')
print(round(train.groupby('drugName').review.count().mean(),2),',',round(test.groupby('drugName').review.count().mean(),2))

#how many drugs appear in only one of the sets
only_in_train = train_drugs - test_drugs
only_in_test = test_drugs - train_drugs
print('Drugs in train not in test: ',len(only_in_train))
# ':.2%' multiplies by 100, so the printed number really is a percentage
print('It is {:.2%} of train'.format(len(only_in_train)/len(train_drugs)))
print('Drugs in test not in train: ',len(only_in_test))
print('It is {:.2%} of test'.format(len(only_in_test)/len(test_drugs)))


In [None]:
#are there any time differences
print('train date range: ', train.date.min(), train.date.max())
print('test date range: ', test.date.min(), test.date.max())
#same

In [None]:
# The provided split is odd, so stitch both parts back together.
# ignore_index=True gives every row a unique label; without it the train and
# test row labels overlap, which makes label-based selection ambiguous.
full_set = pd.concat([train,test], ignore_index=True)
# drop the rows with a missing condition
full_set = full_set.dropna(subset=['condition'])
full_set.shape

In [None]:
#how many drugs in total
print('Total drugs in the set ',full_set.drugName.nunique())
#how many conditiosn in total
print('Total conditions in the set ',full_set.condition.nunique())

In [None]:
# Number of distinct drugs reviewed for each condition
drugs_per_condition = full_set.groupby('condition').drugName.nunique()
print('Average drugs per condition ',drugs_per_condition.mean())
# the 20 conditions with the widest choice of drugs
drugs_per_condition.sort_values(ascending = False).head(20)

In [None]:
# Look for condition / drug names containing unexpected special characters.
# A value counts as "error text" when it is not purely alphanumeric and does
# not contain any of the tolerated separators: '/', space, '-' or ','.
# (the condition column has stray ')' and '<' characters, about 173 rows)
n_rows = full_set.shape[0]
tolerated = '/| |-|,'

odd_condition = (~full_set.condition.str.isalnum())&(~full_set.condition.str.contains(tolerated))
print('Percent of error text in condition: ', full_set[odd_condition].shape[0]/n_rows)

not_listed = full_set.condition=="Not Listed / Othe"
print('Percent of "NOT LISTED" condition: ', full_set[not_listed].shape[0]/n_rows)

odd_drug = (~full_set.drugName.str.isalnum())&(~full_set.drugName.str.contains(tolerated))
print('Percent of error text in drug: ', full_set[odd_drug].shape[0]/n_rows)

In [None]:
# The issues found above would distort the later analysis and cannot be repaired.
# They make up to about 2% of the data, so the affected rows are dropped:
#   - conditions containing '<', '?', ')' or '('
#   - conditions equal to the "Not Listed / Othe" placeholder

print(full_set.shape[0])
# Raw string: in a normal string literal '\<', '\?', '\)' and '\(' are invalid
# escape sequences (a warning today, an error in future Python versions).
full_set = full_set[~full_set.condition.str.contains(r'\<|\?|\)|\(')]
print(full_set.shape[0])
full_set = full_set[full_set.condition!="Not Listed / Othe"]
print(full_set.shape[0])
print('final set to initial data is ',full_set.shape[0]/(train.shape[0]+test.shape[0]))

In [None]:
# Helper columns for the visualizations and the later modelling.
# rating_group buckets the rating into three string labels:
#   '-1' for rating < 4, '0' for 4 <= rating < 8, '1' for rating >= 8
scores = full_set.rating
full_set.loc[scores < 4, 'rating_group'] = '-1'
full_set.loc[(scores >= 4) & (scores < 8), 'rating_group'] = '0'
full_set.loc[scores >= 8, 'rating_group'] = '1'

# calendar year of the review
full_set['year'] = full_set.date.dt.year

## 2. Data Visualization


In [None]:
# Per-drug review volume and mean rating
top_drugs = (
    full_set.groupby('drugName')['rating']
    .agg(rating_count='count', rating_mean='mean')
    .reset_index()
)
# the 10 drugs with the most reviews (ascending, so the largest is last)
top_reviewed = top_drugs.sort_values(by='rating_count').tail(10)
# the 10 best-rated drugs among those with more than 50 reviews
well_covered_drugs = top_drugs[top_drugs.rating_count > 50]
top_rated = well_covered_drugs.sort_values(by=['rating_mean']).tail(10)

In [None]:
top_reviewed.plot(kind='barh',y='rating_count',x='drugName',color='r',\
                 title = 'Top most reviewed drugs');

In [None]:
top_rated.plot(kind='barh',y='rating_mean',x='drugName',color='b',\
              title = 'Top rated drugs with more than 50 reviews');

In [None]:
# Per-condition review volume and mean rating
top_cond = (
    full_set.groupby('condition')['rating']
    .agg(rating_count='count', rating_mean='mean')
    .reset_index()
)
# the 10 conditions with the most reviews (ascending, so the largest is last)
top_reviewed_cond = top_cond.sort_values(by='rating_count').tail(10)
# the 10 best-rated conditions among those with more than 50 reviews
well_covered_cond = top_cond[top_cond.rating_count > 50]
top_rated_cond = well_covered_cond.sort_values(by=['rating_mean']).tail(10)

In [None]:
top_reviewed_cond.plot(kind='barh',y='rating_count',x='condition',color='r',\
                 title = 'Top conditions with most drug reviews');

In [None]:
top_rated_cond.plot(kind='barh',y='rating_mean',x='condition',color='r',\
                 title = 'Top conditions with highest treatment ratings');

In [None]:
#do people tend to leave more negative or positive reviews
full_set.groupby('rating_group').review.count()/full_set.shape[0]

In [None]:
#rating distro
full_set.rating.plot(kind = 'hist');

In [None]:
# For every rating 1..10 build a [rating, average usefulCount] pair:
# total usefulCount of the reviews with that rating divided by how many there are.
use_ls = [
    [score,
     full_set.loc[full_set.rating == score, 'usefulCount'].sum() / (full_set.rating == score).sum()]
    for score in range(1, 11)
]

# 2-D array: column 0 is the rating, column 1 the average useful count
use_arr = np.asarray(use_ls)

In [None]:
# #correlation btween usefulness and rating
plt.scatter(use_arr[:, 0], use_arr[:, 1], c=use_arr[:, 0], cmap='tab10', s=200)
plt.title('Average Useful Count vs Rating')
plt.xlabel('Rating')
plt.ylabel('Average Useful Count')
plt.xticks([i for i in range(1, 11)]);

#positive reviews have higher chanse to be considered useful

In [None]:
#correlation btween usefulness and review length
x = full_set['review'].str.len()
y = full_set['usefulCount']
size = full_set['rating']
plt.scatter(x,y,s = size , alpha=0.5); #, c = full_set['rating_group']
plt.xlabel("log of review char length");
plt.ylabel("usefulness");
plt.xscale('log');
#seems like too long or too short reviews are less useful
#reviews between 100 - 1000 char are most useful - or at least most readable

In [None]:
# #reviews by year
counts = full_set.groupby('year').review.count().reset_index()
plt.bar(x = counts['year'], height = counts['review']);
plt.title('Review count by year');

## 3. Sentiment analysis

#### 3.1 Predict rating based on review
* Built the model for sentiment analysis
*  Evaluating the Model
* Check few models:
1. One vs Rest
2. SVC
3. Simple NN



After the model is trained, run the test data through the pipeline to produce predictions.

<!-- Check performance of the model using such metrics as model’s accuracy, precision, and recall.

* *Accuracy* refers to the percentage of the total predictions our model makes that are completely correct.
* *Precision* describes the ratio of true positives to true positives plus false positives in our predictions.
* *Recall* describes the ratio of true positives to true positives plus false negatives in our predictions. -->

In [None]:
#Load stop words
stopwords = stop_words.STOP_WORDS
# Create our list of punctuation marks
punctuations = string.punctuation
# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

In [None]:
#since we are doing sentiment analysis on review positivity, it makes sence to keep negative stop words
stopwords = stopwords - set(["n't",'none','not', 'nothing','n‘t', 'n’t', 'no'])

In [None]:
full_set.review.head(3)
# looks like ' is replaced by &#039; - need to clean

In [None]:
#create custom tokennizer
def spacy_tokennizer(sent):
    '''
    Turn one raw review into a list of normalised lemmas.

    input:
    sent - review text (str)

    output - list of lower-cased, stripped lemmas with stop words, punctuation,
             pronouns and empty tokens removed

    Relies on the module-level `nlp` pipeline, `stopwords` set and
    `punctuations` string.
    '''
    # The reviews contain HTML entities: restore the apostrophe, drop quotes and ampersands
    sent = sent.replace('&#039;',"'").replace('&quot;', '').replace('&amp;', '')
    token_list = []
    for word in nlp(sent):
        if word.pos_ == 'PRON':
            continue
        # Normalise first so the stop-word / punctuation checks see the same
        # string that would be emitted (the stop-word list is lower case).
        lemma = word.lemma_.lower().strip()
        # Whitespace-only tokens become '' after strip() and must not be emitted.
        # `in punctuations` is a substring test against string.punctuation.
        if not lemma or lemma in stopwords or lemma in punctuations:
            continue
        token_list.append(lemma)

    # return preprocessed list of tokens
    return token_list

In [None]:
#split data into test and train
X = full_set['review']
y = full_set['rating_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 33)

In [None]:
#1 One vs Rest
pipeline_lg = Pipeline([
            ('vectorizer', CountVectorizer(tokenizer=spacy_tokennizer, ngram_range=(3,5))),
            ('tfidf', TfidfTransformer()),
            ('classifier', OneVsRestClassifier(SGDClassifier(loss='modified_huber', penalty='elasticnet',
                                          alpha=1e-4, random_state=42,
                                          shuffle=True, n_jobs=-1) )),
        ])

In [None]:
pipeline_lg.fit(X_train,y_train)

In [None]:
# Score the One-vs-Rest pipeline fitted above on the held-out reviews.
# The fitted object is `pipeline_lg`; no variable named `pipeline` is defined in this notebook.
y_pred  = pipeline_lg.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
#2 SVC
# param_grid = {'C':np.arange(0.01,100,10)}
# clf = GridSearchCV(
#     svm.LinearSVC((), param_grid,cv =5)

pipeline_svm = Pipeline([
            ('vectorizer', CountVectorizer(tokenizer=spacy_tokennizer, ngram_range=(1,4))),
            ('tfidf', TfidfTransformer()),
            ('classifier', svm.LinearSVC()),
        ])
#

In [None]:
pipeline_svm.fit(X_train,y_train)
#print("Best estimator found by grid search:")
#print(pipeline_svm.best_estimator_)

In [None]:
y_pred_svm  = pipeline_svm.predict(X_test)
print(classification_report(y_test,y_pred_svm))

In [None]:
#3 NN
# The Keras model is not run through the sklearn Pipeline, so the reviews are vectorised by hand.
# Train and test rows are stacked so one vocabulary covers both; tr_shape
# remembers where the train rows end so they can be split apart again.
X_full = pd.concat([X_train,X_test])
tr_shape = X_train.shape[0]
print(X_train.shape[0])
print(X_full.shape[0])
y_full = pd.concat([y_train,y_test])

# Binary bag of words over the 5000 most frequent terms.
# stop_words is passed as a list: scikit-learn >= 1.2 validates the argument
# and rejects a set.
vectorizer = CountVectorizer(binary=True, stop_words=list(stopwords),
                             lowercase=True, max_features=5000)
X_onehot = vectorizer.fit_transform(X_full)
# Preview a few rows only: densifying the full matrix just to print it
# allocates (number of reviews x 5000) cells.
print(X_onehot[:5].toarray())

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out()
if hasattr(vectorizer, 'get_feature_names_out'):
    names_list = list(vectorizer.get_feature_names_out())
else:
    names_list = vectorizer.get_feature_names()
names = [[i] for i in names_list]
names = Word2Vec(names, min_count=1)
# gensim 4 replaced KeyedVectors.vocab with key_to_index
if hasattr(names.wv, 'key_to_index'):
    w2v_vocab = list(names.wv.key_to_index)
else:
    w2v_vocab = list(names.wv.vocab)
print(len(w2v_vocab))
print(w2v_vocab[:5])

# One-hot encode the labels. '-1'/'0'/'1' are mapped to columns 2/0/1
# explicitly (-1 % 3 == 2), i.e. [neutral, positive, negative]. This is the
# layout that negative indexing on the raw labels would otherwise produce.
y_onehot= keras.utils.to_categorical(y_full.astype(int).to_numpy() % 3,3)
print(y_onehot)

In [None]:
#check which position is which type of rating
print(y_full[-2:-1])
print(y_onehot[-2:-1])
print("---")
print(y_full[-1:])
print(y_onehot[-1:])
print("---")
print(y_full[-4:-3])
print(y_onehot[-4:-3])

In [None]:

# Split the stacked feature / label matrices back into train and test parts and
# build the feed-forward classifier.
# NOTE: X_train, X_test, y_train and y_test are rebound here and no longer refer
# to the raw review text / string labels produced by train_test_split.
X_train = X_onehot[:tr_shape]
X_test = X_onehot[tr_shape:]
y_train = y_onehot[:tr_shape]
y_test = y_onehot[tr_shape:]

# NOTE(review): these four arrays are not used anywhere in this cell. X_onehot
# comes from CountVectorizer and looks like a scipy sparse matrix (it exposes
# .toarray()), so np.array() probably does not densify it. Confirm before use.
features = np.array(X_train)
targets = np.array(y_train)
features_test = np.array(X_test)
targets_test = np.array(y_test)

# Network: input (one unit per vocabulary term) -> 64 relu -> dropout 0.2
#          -> 32 relu -> dropout 0.1 -> 3-way softmax (one unit per rating group)
model = Sequential()
model.add(Dense(64,  input_shape=(X_train.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(3, activation='softmax'))

# Compiling the model
# categorical cross-entropy matches the one-hot encoded targets
model.compile(loss = 'categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
model.summary()

In [None]:
# Training the model
model.fit(X_train, y_train, epochs=200, batch_size=5000, verbose=0)

score = model.evaluate(X_train, y_train, verbose=0)
print("TRAIN Accuracy: ", score[1])

 ## 6. Evaluating the model
 #This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?

score = model.evaluate(X_test, y_test, verbose=0)
print("TEST Accuracy: ", score[1])

####  Evaluating the Model
After the model is trained, run the test data through the pipeline to produce predictions. Check the performance of the model using metrics such as accuracy, precision, and recall.

* *Accuracy* refers to the percentage of the total predictions our model makes that are completely correct.
* *Precision* describes the ratio of true positives to true positives plus false positives in our predictions.
* *Recall* describes the ratio of true positives to true positives plus false negatives in our predictions.

### Medicine Recommendation System
* The original idea was to find another dataset with medicine ingredient lists and provide a personalised recommendation based on the user's previous history (allergies, side effects) or on the possible negative side effects of adding a new drug to existing prescriptions. However, I was not able to find a dataset with enough information to implement this.
* This is a simple recommender system based on medicine rating and prevalence.

The data has the columns rating, usefulCount and date. I want to use all three to sort the drugs for recommendation. To create a list of drugs for each condition:
    * a small coefficient is applied to the rating depending on how old the review is, to favor recent reviews
    * another coefficient is applied to the rating based on the usefulness, to favor more useful reviews
    * the mean adjusted rating is calculated for each drug and condition



In [None]:
def simple_recommender_with_adj(c, n = 10, df = None):
    '''
    Rank the drugs reviewed for one condition by an adjusted mean rating.

    input:
    c - illness/disorder/condition
    n - number of requested drugs
    df - data set to work with; needs the columns condition, drugName, review,
         rating, usefulCount and year. Defaults to the notebook-level full_set,
         looked up when the function is called.

    output - Series with the names of the top n drugs, best first, indexed
             0..k-1 by rank. Empty when the condition is unknown or filtered out.

    The input frame is not modified.
    '''
    if df is None:
        # resolved at call time so a re-created full_set is picked up
        df = full_set

    # recency weight: review year relative to the newest review of the same condition
    alpha = df.year/df.groupby('condition').year.transform('max')
    # usefulness bonus in [0, 1]; 0/0 (no useful votes for the condition) counts as 0
    beta = (df.usefulCount/df.groupby('condition').usefulCount.transform('max')).fillna(0)

    raw_score = df.rating*alpha + beta
    # normalizing back to 10 point scale; assign() works on a copy so the
    # caller's frame does not silently gain a rating_adj column
    scored = df.assign(rating_adj = raw_score*10/raw_score.max())

    # keep conditions with at least three distinct drugs and drugs with more than 5 reviews
    drugs_per_cond = scored.groupby('condition').drugName.nunique()
    reviews_per_drug = scored.groupby('drugName').review.count()
    rec_set = scored[(scored.condition.isin(drugs_per_cond[drugs_per_cond>2].index))
                     &(scored.drugName.isin(reviews_per_drug[reviews_per_drug>5].index))]

    # mean adjusted rating per drug for the requested condition, best first
    ranked = rec_set[rec_set.condition == c].groupby('drugName').rating_adj.mean().\
                sort_values(ascending = False).reset_index()
    return ranked.drugName.head(n)


In [None]:
simple_recommender_with_adj('Acne',10)

In [None]:
def sentiment_recommender(c,d,r, df = full_set, vectorizer = vectorizer):
    '''
    Recommend alternatives based on how the patient feels about the current drug.

    input:
    c - illness/disorder/condition
    d - drug the patient is currently taking
    r - patient review (free text)
    df - full set from earlier
    vectorizer - fitted vectorizer from the sentiment analysis

    output - numpy array of recommended drug names

    Uses the notebook-level Keras `model` to classify the review and
    simple_recommender_with_adj to rank the drugs for the condition.
    '''
    # classify the review; the vectorizer expects an iterable of documents
    new_one_hot = vectorizer.transform(pd.Series(r))
    # class columns, per the label encoding used for training: [neutral, positive, negative]
    pos = np.argmax(model.predict(new_one_hot))
    rec = simple_recommender_with_adj(c, df = df)

    # rank of the current drug in the list; None when it is not ranked at all
    # (unknown drug, too few reviews, or outside the returned top list)
    matches = rec.index[rec == d]
    ind = matches[0] if len(matches) else None

    if pos == 1:
        #positive review
        print('It is wonderful you are happy with your meds.')
        if ind is None or ind > 0:
            print("Here are some other great meds to keep in mind")
        #exclude the current drug from the list
        rec = rec[rec != d].head(3)
    else:
        #negative/ neutral review
        print("Looks like the medicine didn't satisfy your needs." )
        print("Here is what you can consider")
        if ind is None:
            #current drug is not ranked: offer the top of the list
            rec = rec.head(5)
        elif ind > 0:
            #recommend only drugs ranked higher than the current one, at most 5
            rec = rec[rec.index < ind].head(5)
        else:
            #current drug is already the best ranked: recommend the next two
            rec = rec[rec != d].head(2)
    return rec.values

In [None]:
r = 'Not working'#'wonderful amazing works like a charm'
d = 'Bactrim'
c = 'Acne'

sentiment_recommender(c,d,r)

## Medicine similarity
Which medicines are most similar to each other?
Can this be wrapped into another recommender?


In [None]:
#there are total 3635 drugs - that is quite a lot. how many only have less than 10 reviews?
med_counts = full_set.groupby('drugName').review.nunique().\
                    sort_values(ascending = False).reset_index().rename(columns = {'review':'COUNTS'})

med_counts.shape[0] - med_counts[med_counts.COUNTS<=10].shape[0]

In [None]:
# Keep drugs with more than 20 distinct reviews,
# and of their reviews only those with usefulCount > 50
review_counts = full_set.groupby('drugName').review.nunique()
keep = review_counts[review_counts > 20].index.values

all_revs =full_set[(full_set.drugName.isin(keep))&(full_set.usefulCount>50)]
drugs= all_revs['drugName'].unique()
print('drug ',drugs.shape[0])
# One row per drug holding all of its reviews as a single document.
# Joined with a space: with '' the last word of one review fuses with the
# first word of the next and produces tokens that never occurred.
reviews = all_revs.groupby(['drugName'])['review'].agg(lambda col: ' '.join(col)).reset_index()
print('reviews ',reviews.shape[0])

In [None]:
#Create a similarity matrix for drugs
def create_similarity_matrix( df = reviews):
    '''
    Build a drug-by-drug text similarity matrix from the combined reviews.

    input:
    df - frame with one row per drug: drugName and review (all reviews of the
         drug combined into one document)

    output - square DataFrame indexed and labelled by drug name; each cell is
             the linear kernel (dot product) of the TF-IDF vectors of the two
             drugs' review documents
    '''
    drugs = df.drugName.unique()
    # stop_words as a list: scikit-learn >= 1.2 rejects a set for this argument
    vectorizer = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,4))
    X = vectorizer.fit_transform(df['review'].tolist())

    similarity_matrix = pd.DataFrame(linear_kernel(X, X), index = drugs, columns =drugs)
    return similarity_matrix

In [None]:
def similarity_matrix_recommender(d,c,n=10,df = reviews, all_revs = all_revs):
    '''
    Recommend drugs whose reviews read most like those of a given drug.

    input:
    d - drug to find look-alikes for
    c - illness/disorder/condition
    n - number of most similar drugs to consider
    df - one row per drug with all of its reviews combined (input of the similarity matrix)
    all_revs - individual reviews filtered by review count & useful count criteria from earlier

    output - DataFrame (drugName, condition, rating) with the mean rating of
             those similar drugs that were reviewed for condition c, best first;
             may hold fewer than n rows

    Raises KeyError when d is not part of the similarity matrix.
    '''
    similarity = create_similarity_matrix(df)[d]
    # Drop the queried drug explicitly rather than assuming it sorts first,
    # then keep the n closest matches
    closest = similarity.drop(labels = d).sort_values(ascending = False).head(n).index
    candidates = all_revs[(all_revs.drugName.isin(closest))&(all_revs.condition==c)]
    final = candidates.groupby(['drugName','condition']).rating.mean()
    final = final.sort_values(ascending = False).reset_index()
    return final

In [None]:
d = 'Accutane'
c = 'Acne'
n = 10
similarity_matrix_recommender(d,c,n)

In [None]:
d = None

d==None

In [None]:
#combine all together
def medicine_recommender(c, d = None, r= None, n = 3, df = full_set):
    '''
    Single entry point that combines the recommenders.

    input:
    c - condition patient is diagnosed with
    d - drug name, if currently using any
    r - patients review/ feedback
    n - number of drugs to return for a new diagnosis (the sentiment
        recommender chooses its own list length)
    df - data set to work with

    output: drugs to try
    '''
    #case 1 - new diagnosis, or no feedback to analyse:
    #use simple recommender
    if d is None or r is None:
        return simple_recommender_with_adj(c, n, df)

    #case 2 - patient has a history with a drug and left a review (d,r):
    #use sentiment recommender
    return sentiment_recommender(c, d, r, df)

In [None]:
c = 'Acne'
d = 'Bactrim'
r = 'Slowly working. Terrible side effects'
medicine_recommender(c, d = d, r= r, n = n)