In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
df = pd.read_csv('data/webmd.csv')
df['Date'] = df['Date'].astype('datetime64[ns]')
df = df.drop_duplicates('Reviews')
df_ost = df.loc[df['Condition'].str.contains("Osteoporosis", case=False)]
df_ost.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1357 entries, 4147 to 360371
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Age            1357 non-null   object        
 1   Condition      1357 non-null   object        
 2   Date           1357 non-null   datetime64[ns]
 3   Drug           1357 non-null   object        
 4   DrugId         1357 non-null   int64         
 5   EaseofUse      1357 non-null   int64         
 6   Effectiveness  1357 non-null   int64         
 7   Reviews        1357 non-null   object        
 8   Satisfaction   1357 non-null   int64         
 9   Sex            1357 non-null   object        
 10  Sides          1357 non-null   object        
 11  UsefulCount    1357 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(6)
memory usage: 137.8+ KB


In [3]:
bin_age = []

for age in df_ost['Age']:
    if age == '7-12':
        bin_age.append('0-44')
    elif age == '13-18':
        bin_age.append('0-44')
    elif age == '19-24':
        bin_age.append('0-44')
    elif age == '25-34':
        bin_age.append('0-44')
    elif age == '35-44':
        bin_age.append('0-44')
    elif age == '45-54':
        bin_age.append('45-54')
    elif age == '55-64':
        bin_age.append('55-64')
    elif age == '65-74':
        bin_age.append('65-74')
    elif age == '75 or over':
        bin_age.append('75+')
    else:
        #Combine missing values into the most frequent bin
        bin_age.append('')

df_ost['Age'] = bin_age


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ost['Age'] = bin_age


# Breakdown of Good and Bad Reviews

In [4]:
bad_df = df_ost[df_ost['Satisfaction']<3]
bad_df.sort_values(['Age'], ascending=[True])

good_df = df_ost[df_ost['Satisfaction'] >3]
good_df.sort_values(['Age'], ascending=[True])

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
109259,,Osteoporosis,2015-05-05,prolia syringe,154218,5,5,I am very satisfied with Prolia which I have b...,5,Female,,43
277630,,Post-Menopausal Osteoporosis Prevention,2009-03-31,fosamax,1273,5,5,I have had no problems with this medication ho...,5,,"Stomach pain , constipation , diarrhea , gas...",8
147452,0-44,Osteoporosis,2010-11-22,"reclast bottle, infusion",148235,5,5,I too almost canceled my infusion after readin...,5,,"Nausea , tiredness, flu -like symptoms (e.g.,...",50
145437,0-44,Osteoporosis caused by Glucocorticoid Drugs,2015-10-26,"reclast bottle, infusion",148235,5,5,"I had this done in March 2015 It was painless,...",5,Female,"Nausea , tiredness, flu -like symptoms (e.g.,...",2
146418,0-44,Osteoporosis,2011-12-14,"reclast bottle, infusion",148235,5,4,"After being diagnosed with osteoporosis at 40,...",4,Female,"Nausea , tiredness, flu -like symptoms (e.g.,...",27
...,...,...,...,...,...,...,...,...,...,...,...,...
244651,75+,Osteoporosis,2012-08-21,actonel tablet osteoporosis agents,16383,4,5,"Male now 80, zero testosterone from(orchiectom...",5,Male,Upset stomach may occur.,8
244736,75+,Osteoporosis,2011-03-23,actonel tablet osteoporosis agents,16383,4,3,I began taking Foxamax ten years ago. When I w...,4,Female,Upset stomach may occur.,16
147557,75+,Osteoporosis,2010-06-30,"reclast bottle, infusion",148235,5,5,Had difficulty with the pills...mostly having ...,5,Female,"Nausea , tiredness, flu -like symptoms (e.g.,...",27
145627,75+,Osteoporosis,2015-01-17,"reclast bottle, infusion",148235,5,5,I am 78 years old and received this infusion 9...,5,Female,"Nausea , tiredness, flu -like symptoms (e.g.,...",33


# Unigram

In [5]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
more_stopwords = {'!','"','#','$','%','&','(',')','*','+','-','.',',','/',':',';','<','=','>','?','@','[',']','^','_','{','|','}','~', 'went', 'go', 'one', 'two', '2', '3'}
stopwords = stopwords.union(more_stopwords)

In [6]:
import nltk

In [7]:
from collections import defaultdict
from nltk.tokenize import word_tokenize
#nltk.word_tokenize(df_ost['Reviews'].iloc[0])

tokenizer = nltk.RegexpTokenizer(r"\w+")
#new_words = tokenizer.tokenize(df_ost['Reviews'].iloc[0])

In [8]:
def remove_punc(string):
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    for ele in string:  
        if ele in punc:  
            string = string.replace(ele, "") 
    return string

In [9]:
def generate_ngrams(text, n_gram=1):
    from nltk.tokenize import word_tokenize
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    #text = tokenizer.tokenize(text)

    token = [token for token in text.lower().split(" ") if token != "" if token not in stopwords]
    token = [remove_punc(i) for i in token]
    #token = [token for token in tokenizer.tokenize(text) if token != "" if token not in STOPWORDS]
    
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [" ".join(ngram) for ngram in ngrams]

In [10]:
## custom function for horizontal bar chart ##
def horizontal_bar_chart(df, color):
    trace = go.Bar(
        y=df["word"].values[::-1],
        x=df["wordcount"].values[::-1],
        showlegend=False,
        orientation = 'h',
        marker=dict(
            color=color,
        ),
    )
    return trace

In [11]:
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [12]:
## Get the bar chart from rating  8 to 10 Reviews ##
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(50), 'blue')

## Get the bar chart from rating  4 to 7 Reviews ##
freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'blue')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Frequent words of rating in Bad Reviews", 
                                          "Frequent words of rating in Good Reviews"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots')


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



# Bigram

In [13]:
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'orange')

freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(50), 'orange')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,horizontal_spacing=0.15,
                          subplot_titles=["Frequent biagrams of Bad Reviews", 
                                          "Frequent biagrams of Good Reviews"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=1200, width=1000, paper_bgcolor='rgb(233,233,233)', title="Bigram Count Plots")
py.iplot(fig, filename='word-plots')


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



# Trigram

In [14]:
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'green')

freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(50), 'green')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,horizontal_spacing=0.15,
                          subplot_titles=["Frequent trigrams of Bad Reviews", 
                                          "Frequent trigrams of Good Reviews"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=1200, width=1600, paper_bgcolor='rgb(233,233,233)', title="Trigram Count Plots")
py.iplot(fig, filename='word-plots')

# 4-Gram

In [15]:
freq_dict = defaultdict(int)
for sent in bad_df["Reviews"]:
    for word in generate_ngrams(sent,4):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'red')

freq_dict = defaultdict(int)
for sent in good_df["Reviews"]:
    for word in generate_ngrams(sent,4):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(50), 'red')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,horizontal_spacing=0.15,
                          subplot_titles=["Frequent 4-grams of Bad Reviews", 
                                          "Frequent 4-grams of Good Reviews"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=1200, width=1600, paper_bgcolor='rgb(233,233,233)', title="4-grams Count Plots")
py.iplot(fig, filename='word-plots')

# Zero-Shot Classification --- Work in Progress

In [17]:
from transformers import pipeline

df_ost['cost'] = ''
df_ost['side_effect'] = ''
df_ost['efficacy'] = ''


classifier = pipeline("zero-shot-classification")

for reviews, row in df_ost.iterrows():
    a = classifier(
        df_ost['Reviews'][reviews],
        candidate_labels=["cost", "side effect", "efficacy", ],)
        
    a = pd.DataFrame.from_dict(a)

    df_ost['cost'][reviews] = a.iloc[0,2]
    df_ost['side_effect'][reviews] = a.iloc[1,2]
    df_ost['efficacy'][reviews] = a.iloc[2,2]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

No model was supplied, defaulted to roberta-large-mnli (https://huggingface.co/roberta-large-mnli)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the la

KeyboardInterrupt: 