In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
df = pd.read_csv('data/webmd.csv')
df['Date'] = df['Date'].astype('datetime64[ns]')
df = df.drop_duplicates('Reviews')
df_ost = df.loc[df['Condition'].str.contains("Osteoporosis", case=False)]
df_ost.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1357 entries, 4147 to 360371
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Age            1357 non-null   object        
 1   Condition      1357 non-null   object        
 2   Date           1357 non-null   datetime64[ns]
 3   Drug           1357 non-null   object        
 4   DrugId         1357 non-null   int64         
 5   EaseofUse      1357 non-null   int64         
 6   Effectiveness  1357 non-null   int64         
 7   Reviews        1357 non-null   object        
 8   Satisfaction   1357 non-null   int64         
 9   Sex            1357 non-null   object        
 10  Sides          1357 non-null   object        
 11  UsefulCount    1357 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(6)
memory usage: 137.8+ KB


In [3]:
bin_age = []

for age in df_ost['Age']:
    if age == '7-12':
        bin_age.append('0-44')
    elif age == '13-18':
        bin_age.append('0-44')
    elif age == '19-24':
        bin_age.append('0-44')
    elif age == '25-34':
        bin_age.append('0-44')
    elif age == '35-44':
        bin_age.append('0-44')
    elif age == '45-54':
        bin_age.append('45-54')
    elif age == '55-64':
        bin_age.append('55-64')
    elif age == '65-74':
        bin_age.append('65-74')
    elif age == '75 or over':
        bin_age.append('75+')
    else:
        #Combine missing values into the most frequent bin
        bin_age.append('')

df_ost['Age'] = bin_age


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ost['Age'] = bin_age


# Injectables and Non-Injectables

In [4]:
injectable_drugs = ['prolia syringe','reclast bottle, infusion','zometa vial','zoledronic acid vial', 'tymlos pen injector']



infused_drugs = ['liquid calcium + vitamin d', 'oyster shell + d', 'atelvia','actonel tablet osteoporosis agents','actonel 35 mg tablet','calcium citrate 200 mg (950 mg) tablet',

       'calcium 600 mg (1,500 mg) tablet', 'calcium 600 + vitamin d',

       'calcitrate + vit d', 'citracal-vitamin d tablet',

       'citracal + d', 'citracal + d maximum', 'c calcium',

       'calcium-magnesium-zinc 333 mg-133 mg-5 mg tablet', 'vitamin d3 tablet',

 'vitamin d3', 'os-cal 500-vit d3', 'risedronate sodium', 'bio-d-mulsion forte drops', 'risedronate sodium', 'replesta',

 'duavee','viactiv', 'actical','calcitrate', 'drisdol', 'calcium 600-vit d3', 'calcium 500 + vitamin d', 'calcidol']



In [5]:
drugs_type = []
for drugs in df_ost['Drug']:
    if drugs in injectable_drugs:
        drugs_type.append("injectable")
    elif drugs in infused_drugs:
        drugs_type.append("non_injectable")
    else:
        drugs_type.append("")

df_ost["Drug_Type"] = drugs_type

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ost["Drug_Type"] = drugs_type


# Repeat Rows as many times as usefulcount

In [6]:
print('Reviews with useful count = 0: ', np.sum(df_ost['UsefulCount'] == 0))

Reviews with useful count = 0:  135


In [7]:
df_ost = df_ost.loc[df_ost.index.repeat(df_ost.UsefulCount)]

# Breakdown of Good and Bad Reviews

In [8]:
bad_df = df_ost[df_ost['Satisfaction']<3]
bad_df.sort_values(['Age'], ascending=[True])

good_df = df_ost[df_ost['Satisfaction'] >3]
good_df.sort_values(['Age'], ascending=[True])

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount,Drug_Type
109259,,Osteoporosis,2015-05-05,prolia syringe,154218,5,5,I am very satisfied with Prolia which I have b...,5,Female,,43,injectable
109259,,Osteoporosis,2015-05-05,prolia syringe,154218,5,5,I am very satisfied with Prolia which I have b...,5,Female,,43,injectable
109259,,Osteoporosis,2015-05-05,prolia syringe,154218,5,5,I am very satisfied with Prolia which I have b...,5,Female,,43,injectable
277630,,Post-Menopausal Osteoporosis Prevention,2009-03-31,fosamax,1273,5,5,I have had no problems with this medication ho...,5,,"Stomach pain , constipation , diarrhea , gas...",8,
277630,,Post-Menopausal Osteoporosis Prevention,2009-03-31,fosamax,1273,5,5,I have had no problems with this medication ho...,5,,"Stomach pain , constipation , diarrhea , gas...",8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245291,75+,Osteoporosis,2009-05-12,actonel tablet osteoporosis agents,16383,5,5,A MALE WITH OSTEOPENIA\r\nHAVE BEEN TAKING ACT...,5,Male,Upset stomach may occur.,8,non_injectable
245291,75+,Osteoporosis,2009-05-12,actonel tablet osteoporosis agents,16383,5,5,A MALE WITH OSTEOPENIA\r\nHAVE BEEN TAKING ACT...,5,Male,Upset stomach may occur.,8,non_injectable
245291,75+,Osteoporosis,2009-05-12,actonel tablet osteoporosis agents,16383,5,5,A MALE WITH OSTEOPENIA\r\nHAVE BEEN TAKING ACT...,5,Male,Upset stomach may occur.,8,non_injectable
245291,75+,Osteoporosis,2009-05-12,actonel tablet osteoporosis agents,16383,5,5,A MALE WITH OSTEOPENIA\r\nHAVE BEEN TAKING ACT...,5,Male,Upset stomach may occur.,8,non_injectable


# Unigram

In [9]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
more_stopwords = {'!','"','#','$','%','&','(',')','*','+','-','.',',','/',':',';','<','=','>','?','@','[',']','^','_','{','|','}','~', 'went', 'go', 'one', 'two', '2', '3'}
stopwords = stopwords.union(more_stopwords)

In [10]:
import nltk

In [11]:
from collections import defaultdict
from nltk.tokenize import word_tokenize
#nltk.word_tokenize(df_ost['Reviews'].iloc[0])

tokenizer = nltk.RegexpTokenizer(r"\w+")
#new_words = tokenizer.tokenize(df_ost['Reviews'].iloc[0])

In [12]:
def remove_punc(string):
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    for ele in string:  
        if ele in punc:  
            string = string.replace(ele, "") 
    return string

In [13]:
def generate_ngrams(text, n_gram=1):
    from nltk.tokenize import word_tokenize
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    #text = tokenizer.tokenize(text)

    token = [token for token in text.lower().split(" ") if token != "" if token not in stopwords]
    token = [remove_punc(i) for i in token]
    #token = [token for token in tokenizer.tokenize(text) if token != "" if token not in STOPWORDS]
    
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [" ".join(ngram) for ngram in ngrams]

In [14]:
## custom function for horizontal bar chart ##
def horizontal_bar_chart(df, color):
    trace = go.Bar(
        y=df["word"].values[::-1],
        x=df["wordcount"].values[::-1],
        showlegend=False,
        orientation = 'h',
        marker=dict(
            color=color,
        ),
    )
    return trace

In [15]:
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [16]:
## Get the bar chart from rating  8 to 10 Reviews ##
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(50), 'blue')

## Get the bar chart from rating  4 to 7 Reviews ##
freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'blue')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Frequent words of rating in Bad Reviews", 
                                          "Frequent words of rating in Good Reviews"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots')


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



# Bigram

In [17]:
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'orange')

freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(50), 'orange')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,horizontal_spacing=0.15,
                          subplot_titles=["Frequent biagrams of Bad Reviews", 
                                          "Frequent biagrams of Good Reviews"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=1200, width=1000, paper_bgcolor='rgb(233,233,233)', title="Bigram Count Plots")
py.iplot(fig, filename='word-plots')


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



# Trigram

In [18]:
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'green')

freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(50), 'green')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,horizontal_spacing=0.15,
                          subplot_titles=["Frequent trigrams of Bad Reviews", 
                                          "Frequent trigrams of Good Reviews"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=1200, width=1600, paper_bgcolor='rgb(233,233,233)', title="Trigram Count Plots")
py.iplot(fig, filename='word-plots')

# 4-Gram

In [19]:
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent,4):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'red')

freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent,4):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(50), 'red')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,horizontal_spacing=0.15,
                          subplot_titles=["Frequent 4-grams of Bad Reviews", 
                                          "Frequent 4-grams of Good Reviews"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=1200, width=1600, paper_bgcolor='rgb(233,233,233)', title="4-grams Count Plots")
py.iplot(fig, filename='word-plots')

# Zero-Shot Classification --- Work in Progress

In [30]:
df_ost.reset_index()

Unnamed: 0,index,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount,Drug_Type,infusion,side_effect,efficacy
0,6207,45-54,Post-Menopausal Osteoporosis Prevention,2016-12-29,lopreeza,167327,5,5,I have taken this drug for almost 7 years with...,5,Female,"Stomach upset, nausea / vomiting , bloating...",1,,0.847132,0.097637,0.055231
1,18993,65-74,Osteoporosis,2010-05-26,liquid calcium + vitamin d,93396,5,5,since I have a hard time swallowing this hs wo...,5,Female,Constipation or stomach upset may occur.,1,non_injectable,0.735927,0.235236,0.028837
2,61145,45-54,Osteoporosis,2012-01-19,oyster shell + d,94390,1,1,I have severe pain in my hand and muscle joint...,1,Female,Constipation or stomach upset may occur.,2,non_injectable,,,
3,61145,45-54,Osteoporosis,2012-01-19,oyster shell + d,94390,1,1,I have severe pain in my hand and muscle joint...,1,Female,Constipation or stomach upset may occur.,2,non_injectable,,,
4,72313,75+,Osteoporosis,2015-02-23,os-cal 500-vit d3,16527,1,3,Food dyes and talc...large pill for a newly re...,1,Female,Constipation or stomach upset may occur.,1,non_injectable,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27944,360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,
27945,360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,
27946,360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,
27947,360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,


In [21]:
from transformers import pipeline

df_ost['infusion'] = ''
df_ost['side_effect'] = ''
df_ost['efficacy'] = ''


classifier = pipeline("zero-shot-classification")

for reviews, row in df_ost.iterrows():
    a = classifier(
        df_ost['Reviews'][reviews],
        candidate_labels=["infusion", "side effect", "efficacy", ],)
        
    a = pd.DataFrame.from_dict(a)

    df_ost['infusion'][reviews] = a.iloc[0,2]
    df_ost['side_effect'][reviews] = a.iloc[1,2]
    df_ost['efficacy'][reviews] = a.iloc[2,2]

No model was supplied, defaulted to roberta-large-mnli (https://huggingface.co/roberta-large-mnli)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the do

KeyError: 0

In [22]:
df_ost

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount,Drug_Type,infusion,side_effect,efficacy
6207,45-54,Post-Menopausal Osteoporosis Prevention,2016-12-29,lopreeza,167327,5,5,I have taken this drug for almost 7 years with...,5,Female,"Stomach upset, nausea / vomiting , bloating...",1,,0.847132,0.097637,0.055231
18993,65-74,Osteoporosis,2010-05-26,liquid calcium + vitamin d,93396,5,5,since I have a hard time swallowing this hs wo...,5,Female,Constipation or stomach upset may occur.,1,non_injectable,0.735927,0.235236,0.028837
61145,45-54,Osteoporosis,2012-01-19,oyster shell + d,94390,1,1,I have severe pain in my hand and muscle joint...,1,Female,Constipation or stomach upset may occur.,2,non_injectable,,,
61145,45-54,Osteoporosis,2012-01-19,oyster shell + d,94390,1,1,I have severe pain in my hand and muscle joint...,1,Female,Constipation or stomach upset may occur.,2,non_injectable,,,
72313,75+,Osteoporosis,2015-02-23,os-cal 500-vit d3,16527,1,3,Food dyes and talc...large pill for a newly re...,1,Female,Constipation or stomach upset may occur.,1,non_injectable,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,
360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,
360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,
360371,45-54,Osteoporosis,2008-12-08,calcidol,149942,5,4,My father is using Calcidol Drops (10 drops/da...,4,Male,Vitamin D at normal doses usually has no side...,8,non_injectable,,,
