In [None]:
#%pip install transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import transformers
from transformers import pipeline

In [None]:
# Load the review data set.
df = pd.read_csv('data/updated_df.csv')

### From Other Notebook
# Parse the review dates and keep only the first occurrence of each review text.
df['Date'] = df['Date'].astype('datetime64[ns]')
df = df.drop_duplicates('Reviews')
# All Osteoporosis Illnesses in Female Patients
# NOTE(review): no sex filter is applied here; presumably the input file is already
# restricted to female patients - confirm.
# na=False: rows with a missing Condition count as "no match" instead of putting NaN
# into the boolean mask (which pandas refuses to index with).
# .copy(): later cells add score columns to df, so it must own its data rather than
# being a slice of the unfiltered frame.
df = df[df['Condition'].str.contains("osteoporosis", case=False, na=False)].copy()

In [None]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
sample_review = "This osteoporosis medicine worked, but it gave me a nasty stomach ache. It was easy to swallow."

# Score the sample review against the candidate side-effect labels.
# `multi_label` is the current name of the deprecated `multi_class` argument.
res = classifier(
    sample_review,
    candidate_labels=['menstruation', 'digestive', 'joint pain', 'respiratory', 'heart', 'skin'],
    multi_label=True,
)
res

The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.


{'sequence': 'This osteoporosis medicine worked, but it gave me a nasty stomach ache. It was easy to swallow.',
 'labels': ['digestive',
  'joint pain',
  'skin',
  'respiratory',
  'menstruation',
  'heart'],
 'scores': [0.27356016635894775,
  0.0006058795261196792,
  0.0001192762065329589,
  7.208560418803245e-05,
  5.108026016387157e-05,
  4.157235161983408e-05]}

# Experimenting with the Model

In [None]:
# Sample review covering three aspects: (a) whether the medicine worked,
# (b) the type of side effect and (c) the delivery method (pill).
sample_review = (
    "This osteoporosis medicine worked, but it gave me a nasty stomach ache. "
    "It was easy to swallow."
)


###  Categories for types of Side-Effects: 

Definite: Digestive (most of the side effects listed are digestive), Respiratory (some of the reviews mention a cough)

Still tweaking "pain": perhaps "joint pain" would work better, since it does not also capture, for example, a stomach ache, but then headaches would be missed.

"heart", "skin" and "menstruation" are important but do not show up in the data as much.

In [None]:
# Score the sample review against the two general topics.
# `multi_label` is the current name of the deprecated `multi_class` argument.
res = classifier(
    sample_review,
    candidate_labels=['medicine side effects', 'medicine effectiveness'],
    multi_label=True,
)
res

The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.


{'sequence': 'This osteoporosis medicine worked, but it gave me a nasty stomach ache. It was easy to swallow.',
 'labels': ['medicine side effects', 'medicine effectiveness'],
 'scores': [0.9826341867446899, 0.6695783138275146]}

In [None]:
# Score the sample review against the delivery-method labels.
# `multi_label` is the current name of the deprecated `multi_class` argument.
res = classifier(
    sample_review,
    candidate_labels=['pill', 'injection', 'patch'],
    multi_label=True,
)
res

# Adding Classification to the DataFrame

Label values for each category. This is a first run to get some output, which can then be mined for more label words.

In [None]:
# Earlier label sets, kept for reference:
#sides = ['menstruation', 'digestive', 'joint pain', 'respiratory' , 'heart', 'skin']
#sides_min = ['disgestive', 'joint pain', 'respiratory']

# Detailed side-effect labels.
sides = [
    'menstruation',
    'gastrointestinal problems',
    'joint pain',
    'respiratory issues',
    'cardiac problems',
    'dermatology issues',
]
# Coarse side-effect labels.
sides_min = [
    'pain',
    'sickness',
    'death',
    'discomfort',
]
# Delivery-method labels.
method = [
    'pill',
    'injection',
    'patch',
]
# General review-topic labels.
general = [
    'medicine side effects',
    'medicine effectiveness',
]

Multi-class zero-shot of Different Side Effects

In [None]:
# Multi-label zero-shot scores for the `sides` labels, stored in "<label>_m" columns.
# -1.0 marks rows that have not been scored yet; using a float placeholder keeps the
# column dtype float from the start instead of upcasting on the first score.
for label in sides:
    df[label + '_m'] = -1.0

for j, review in enumerate(df['Reviews']):
    #counter for progress/debugging
    if j % 50 == 0:
        print(j)
    #running the classifier on the review
    res = classifier(
        review,
        candidate_labels=sides,
        multi_label=True,
    )
    # Match each score to its column by label ("_m" = multi-label).
    # One positional .iloc[row, col] write replaces the chained
    # df[col].iloc[j] = ..., which pandas does not guarantee to write through to df.
    for label, score in zip(res['labels'], res['scores']):
        df.iloc[j, df.columns.get_loc(label + '_m')] = score

In [None]:
# Multi-label zero-shot scores for the `sides_min` labels, stored in "<label>_m" columns.
# -1.0 marks rows that have not been scored yet; using a float placeholder keeps the
# column dtype float from the start instead of upcasting on the first score.
for label in sides_min:
    df[label + '_m'] = -1.0

for j, review in enumerate(df['Reviews']):
    #counter for progress/debugging
    if j % 50 == 0:
        print(j)
    #running the classifier on the review
    res = classifier(
        review,
        candidate_labels=sides_min,
        multi_label=True,
    )
    # Match each score to its column by label ("_m" = multi-label).
    # One positional .iloc[row, col] write replaces the chained
    # df[col].iloc[j] = ..., which pandas does not guarantee to write through to df.
    for label, score in zip(res['labels'], res['scores']):
        df.iloc[j, df.columns.get_loc(label + '_m')] = score

In [None]:
# Save the scored frame. index=False keeps the row index out of the file, so reloading
# with pd.read_csv does not pick up an extra "Unnamed: 0" column on every save/load cycle.
# NOTE(review): this overwrites the same file the notebook loads its input from.
df.to_csv('data/updated_df.csv', index=False)

Multi-class zero-shot of General Topic

In [None]:
# Multi-label zero-shot scores for the `general` labels, stored in "<label>_m" columns.
# -1.0 marks rows that have not been scored yet; using a float placeholder keeps the
# column dtype float from the start instead of upcasting on the first score.
for label in general:
    df[label + '_m'] = -1.0

for j, review in enumerate(df['Reviews']):
    # progress counter
    if j % 50 == 0:
        print(j)
    res = classifier(
        review,
        candidate_labels=general,
        multi_label=True,
    )
    # Match each score to its column by label. One positional .iloc[row, col] write
    # replaces the chained df[col].iloc[j] = ..., which pandas does not guarantee to
    # write through to df.
    for label, score in zip(res['labels'], res['scores']):
        df.iloc[j, df.columns.get_loc(label + '_m')] = score

In [None]:
# Save the scored frame. index=False keeps the row index out of the file, so reloading
# with pd.read_csv does not pick up an extra "Unnamed: 0" column on every save/load cycle.
# NOTE(review): this overwrites the same file the notebook loads its input from.
df.to_csv('data/updated_df.csv', index=False)

Multi-class zero-shot of Different Delivery Methods

In [None]:
# Multi-label zero-shot scores for the `method` labels, stored in "<label>_m" columns.
# -1.0 marks rows that have not been scored yet; using a float placeholder keeps the
# column dtype float from the start instead of upcasting on the first score.
for label in method:
    df[label + '_m'] = -1.0

for j, review in enumerate(df['Reviews']):
    # progress counter
    if j % 50 == 0:
        print(j)
    res = classifier(
        review,
        candidate_labels=method,
        multi_label=True,
    )
    # Match each score to its column by label. One positional .iloc[row, col] write
    # replaces the chained df[col].iloc[j] = ..., which pandas does not guarantee to
    # write through to df.
    for label, score in zip(res['labels'], res['scores']):
        df.iloc[j, df.columns.get_loc(label + '_m')] = score

In [None]:
# Save the scored frame. index=False keeps the row index out of the file, so reloading
# with pd.read_csv does not pick up an extra "Unnamed: 0" column on every save/load cycle.
# NOTE(review): this overwrites the same file the notebook loads its input from.
df.to_csv('data/updated_df.csv', index=False)

Single-class zero-shot of Side Effects

In [None]:
# Single-label zero-shot scores for the `sides` labels, stored in "<label>_s" columns.
# The "_s" columns have to exist before they are written to; -1.0 marks rows that
# have not been scored yet.
for label in sides:
    df[label + '_s'] = -1.0

for j, review in enumerate(df['Reviews']):
    # progress counter
    if j % 50 == 0:
        print(j)
    # multi_label=False: this is the single-class run, so the labels compete with
    # each other instead of being scored independently.
    res = classifier(
        review,
        candidate_labels=sides,
        multi_label=False,
    )
    # Match each score to its column by label. One positional .iloc[row, col] write
    # replaces the chained df[col].iloc[j] = ..., which pandas does not guarantee to
    # write through to df.
    for label, score in zip(res['labels'], res['scores']):
        df.iloc[j, df.columns.get_loc(label + '_s')] = score

In [None]:
# Save the scored frame. index=False keeps the row index out of the file, so reloading
# with pd.read_csv does not pick up an extra "Unnamed: 0" column on every save/load cycle.
# NOTE(review): this overwrites the same file the notebook loads its input from.
df.to_csv('data/updated_df.csv', index=False)

Single-class zero-shot of General Topic

In [None]:
# Single-label zero-shot scores for the `general` labels, stored in "<label>_s" columns.
# The "_s" columns have to exist before they are written to; -1.0 marks rows that
# have not been scored yet.
for label in general:
    df[label + '_s'] = -1.0

for j, review in enumerate(df['Reviews']):
    # progress counter
    if j % 50 == 0:
        print(j)
    # multi_label=False: this is the single-class run, so the labels compete with
    # each other instead of being scored independently.
    res = classifier(
        review,
        candidate_labels=general,
        multi_label=False,
    )
    # Match each score to its column by label. One positional .iloc[row, col] write
    # replaces the chained df[col].iloc[j] = ..., which pandas does not guarantee to
    # write through to df.
    for label, score in zip(res['labels'], res['scores']):
        df.iloc[j, df.columns.get_loc(label + '_s')] = score

In [None]:
# Save the scored frame. index=False keeps the row index out of the file, so reloading
# with pd.read_csv does not pick up an extra "Unnamed: 0" column on every save/load cycle.
# NOTE(review): this overwrites the same file the notebook loads its input from.
df.to_csv('data/updated_df.csv', index=False)

Single-class zero-shot of Different Delivery Methods

In [None]:
# Single-label zero-shot scores for the `method` labels, stored in "<label>_s" columns.
# The "_s" columns have to exist before they are written to; -1.0 marks rows that
# have not been scored yet.
for label in method:
    df[label + '_s'] = -1.0

for j, review in enumerate(df['Reviews']):
    # progress counter
    if j % 50 == 0:
        print(j)
    # multi_label=False: the results go to the single-class "_s" columns, so the
    # labels compete with each other instead of being scored independently.
    res = classifier(
        review,
        candidate_labels=method,
        multi_label=False,
    )
    # Match each score to its column by label. One positional .iloc[row, col] write
    # replaces the chained df[col].iloc[j] = ..., which pandas does not guarantee to
    # write through to df.
    for label, score in zip(res['labels'], res['scores']):
        df.iloc[j, df.columns.get_loc(label + '_s')] = score

In [None]:
# Save the scored frame. index=False keeps the row index out of the file, so reloading
# with pd.read_csv does not pick up an extra "Unnamed: 0" column on every save/load cycle.
# NOTE(review): this overwrites the same file the notebook loads its input from.
df.to_csv('data/updated_df.csv', index=False)

# Updated Visuals


In [None]:
# Reload the scored reviews from the CSV file that the scoring cells above save to.
vis_df = pd.read_csv('data/updated_df.csv')

In [None]:
# seaborn is already imported as `sns` in the imports cell; this adds a second alias, `sn`.
import seaborn as sn
# Number of histogram bins; the plotting cells below derive their bin edges from it.
bin_size = 20

In [None]:

# `bin_size` equal-width bins covering the whole score range [0, 1].
# (Edges built as i / bin_size for i < bin_size stop at 0.95, which leaves every
# score above 0.95 out of the histogram.)
hbins = np.linspace(0, 1, bin_size + 1)
for topic in sides:
    ax = sns.histplot(x=topic + "_m", data=vis_df, bins=hbins)
    ax.set_title('Distribution of Zero Shot Scores for ' + topic)
    plt.show()

In [None]:

# `bin_size` equal-width bins covering the whole score range [0, 1].
# (Edges built as i / bin_size for i < bin_size stop at 0.95, which leaves every
# score above 0.95 out of the histogram.)
hbins = np.linspace(0, 1, bin_size + 1)
for topic in sides_min:
    ax = sns.histplot(x=topic + "_m", data=vis_df, bins=hbins)
    ax.set_title('Distribution of Zero Shot Scores for ' + topic)
    plt.show()

In [None]:

# `bin_size` equal-width bins covering the whole score range [0, 1].
# (Edges built as i / bin_size for i < bin_size stop at 0.95, which leaves every
# score above 0.95 out of the histogram.)
hbins = np.linspace(0, 1, bin_size + 1)
for topic in method:
    ax = sns.histplot(x=topic + "_m", data=vis_df, bins=hbins)
    ax.set_title('Distribution of Zero Shot Scores for ' + topic)
    plt.show()

In [None]:
# Names of the multi-label score columns for each label group.
sides_m = [side + "_m" for side in sides]
sides_min_m = [side + "_m" for side in sides_min]

gen_m = [gen + "_m" for gen in general]
met_m = [met + "_m" for met in method]

# One heatmap of absolute pairwise correlations per label group.
# Uses the `sns` alias from the imports cell so the cell does not depend on the
# second seaborn alias introduced further down the notebook, and shows every
# figure explicitly, including the last one.
for score_cols in (sides_m, gen_m, sides_min_m, met_m):
    sns.heatmap(vis_df[score_cols].corr().abs(), cmap="YlGnBu", annot=True)
    plt.show()


In [None]:
use_df = vis_df
# NOTE(review): assumes vis_df has a "digestive_m" column (label from an earlier
# label set) - confirm, or switch to one of the current labels.
column = "digestive"
bin_size = 20

# `bin_size` equal-width bins covering the whole score range [0, 1]
# (edges built as i / bin_size for i < bin_size would stop at 0.95).
hbins = np.linspace(0, 1, bin_size + 1)

#distribution of the column in a binned histogram
ax = sns.histplot(x=column + "_m", data=use_df, bins=hbins)
ax.set_title('Distribution of Zero Shot Scores for ' + column)
plt.show()

#distribution of the column in a binned histogram for good reviews/bad reviews

# TODO(review): the good/bad split was never written - the cell only contained the
# placeholder `vis_df[]` (a syntax error) and then used the undefined names
# `bad_df` / `good_df`. Set both values below to the real rating column and
# cut-off to enable the two plots.
rating_col = None
good_threshold = None

if rating_col is not None and good_threshold is not None:
    is_good = use_df[rating_col] >= good_threshold
    bad_df = use_df[~is_good]
    good_df = use_df[is_good]
    for subset, kind in ((bad_df, 'bad'), (good_df, 'good')):
        ax = sns.histplot(x=column + "_m", data=subset, bins=hbins)
        # The space before "for" was missing, which glued the label to the text.
        ax.set_title('Distribution of Zero Shot Scores for ' + column + " for " + kind + " reviews")
        plt.show()
else:
    print("Skipping good/bad histograms: set rating_col and good_threshold first.")


#top n-grams associated with the reviews in this category/weighted by this category

In [None]:
# Build the bin edges here instead of relying on an `hbins` left over from an
# earlier cell: `bin_size` equal-width bins covering the whole score range [0, 1].
hbins = np.linspace(0, 1, bin_size + 1)
for topic in method:
    ax = sns.histplot(x=topic + "_m", data=vis_df, bins=hbins)
    ax.set_title('Distribution of Zero Shot Scores for ' + topic)
    plt.show()

In [None]:
# Re-run the classifier over the reviews, ordered from highest to lowest "heart_m" score.
# `sides` is used for the labels: `nsides` (the same label list) is only assigned in a
# later cell, so on a fresh kernel this cell failed with a NameError.
# NOTE(review): assumes vis_df still has a "heart_m" column from an earlier label set - confirm.
for el in vis_df.sort_values(by="heart_m", ascending=False)['Reviews']:
    res = classifier(
        el,
        candidate_labels=sides,
        multi_label=True,
    )
    print(res)

    

In [None]:
# Correlation between the "heart_m" and "injection_m" score columns.
vis_df['heart_m'].corr(vis_df['injection_m'])

In [None]:
# Earlier label set, kept for reference:
# ['menstruation', 'digestive', 'joint pain', 'respiratory' , 'heart', 'skin']
nsides = [
    'menstruation',
    'gastrointestinal problems',
    'joint pain',
    'respiratory issues',
    'cardiac problems',
    'dermatology issues',
]

#nsides = copy(sides).remove('heart')
# One long review used to compare the two label lists.
txt = "I had my first reclast injection on may 3rd 2010 and several hours after coming home,I started having severe pain throughout my whole body.The pain was so bad,I couldn't tolerate to even move.The next morning,May 4th.I started feeling a little better and on may 5th,I was doing fine.Yesterday,May 8th.,The pain had returned and as I sit here typing this on Mother's day.Iam in so much pain I can hardly type.I regret ever doing this Reclast injection.I would never recommend it to anyone"

# Printed result: scores against `sides`.
sides_result = classifier(txt, candidate_labels=sides, multi_label=True)
print(sides_result)

# Cell output: scores against `nsides`.
classifier(txt, candidate_labels=nsides, multi_label=True)

In [None]:
# Short probe text scored against the `nsides` labels; the result is the cell output.
txt = "skin rash"
classifier(txt, candidate_labels=nsides, multi_label=True)

# Old Visuals

The code takes a while to run (and I have yet to get a good run with these updated categories), but I have some data from a couple of runs earlier that I can use to display some of the results. 

In [None]:
# Load previously saved, already labelled results for the plots below.
labeled_df = pd.read_csv('data/preproc_with_labels.csv')

In [None]:
# Score columns of the older label set. 'disgestive' (sic) has to stay spelled this
# way because it is used as a column name of labeled_df.
old_sides = ['respiratory', 'pain', 'disgestive', 'bone', 'menstruation']
# For every row, store the name of the column holding the largest score.
# idxmax returns the first column on ties, like list.index(max(...)), and the
# whole-column assignment avoids the chained labeled_df[col].iloc[i] = ... write,
# which pandas does not guarantee to write through to the frame.
labeled_df['category_s'] = labeled_df[old_sides].idxmax(axis=1)

In [None]:
# Number of reviews per top category. A histogram of value_counts() would plot the
# distribution of the handful of count values, not the count for each category.
ax = sns.countplot(x='category_s', data=labeled_df)
ax.set_title('Number of reviews per top category')
plt.show()

In [None]:
# 20 equal-width bins covering the whole score range [0, 1].
# (Edges built as i / 20 for i < 20 stop at 0.95, which leaves every score above
# 0.95 out of the histogram.)
n_bins = 20
hbins = np.linspace(0, 1, n_bins + 1)
for topic in old_sides:
    ax = sns.histplot(x=topic + "_m", data=labeled_df, bins=hbins)
    ax.set_title('Distribution of Zero Shot Scores for ' + topic)
    plt.show()