In [2]:
# ! pip install transformers
# ! pip install datasets

In [12]:
import numpy as np
import pandas as pd

## Fine-tune BART on WebMD (Osteoporosis specifically)


### Data Overview

In [108]:
df = pd.read_csv("osteoporosis-reviews.csv")#load labeled data
df = df.dropna()

In [109]:
df = df[:100]
df["side_effect"] = df["side_effect"].astype(int)
df["effectiveness"] = df["effectiveness"].astype(int)
df

Unnamed: 0,Review,labels,side_effect,effectiveness
0,this treatment is awsome. I love drugs.,"[0,0]",0,0
1,After taking this drug for approx. 21 days I s...,"[1,0]",1,0
2,I have taken this drug for almost 7 years with...,"[1,1]",1,1
3,since I have a hard time swallowing this hs wo...,"[0,1]",0,1
4,I have severe pain in my hand and muscle joint...,"[1,0]",1,0
...,...,...,...,...
96,I want to thank everyone here that rated this ...,"[1,1]",1,1
97,"5th injection, no side effects,now normal bone...","[1,1]",1,1
98,This is the worse medication my husband has ev...,"[1,0]",1,0
99,Call FDA to report negative side effects 1 888...,"[1,0]",1,0


In [110]:
#train/val/test split
df_train = df.iloc[:60]
df_val = df.iloc[60:80]
df_test = df.iloc[80:]

df_train.to_csv("train.csv", index=False)
df_val.to_csv("val.csv", index=False)
df_test.to_csv("test.csv", index=False)

### Create Data Loader

In [111]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files={"train": "train.csv", "val": "val.csv", "test": "test.csv"})



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-39e8e50c1fad4edb/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-39e8e50c1fad4edb/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [112]:
#an example of reviews
dataset["train"]

Dataset({
    features: ['Review', 'labels', 'side_effect', 'effectiveness'],
    num_rows: 60
})

### Load Tokenizer 

Process the reviews and include a padding and truncation strategy to handle any variable sequence length.

In [86]:
from transformers import BartTokenizer

labels = ["side_effect", "effectiveness"]
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Review"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

loading file https://huggingface.co/facebook/bart-large/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/0d6fc8b2ef1860c1f8f0baff4b021e3426cc7d11b153f98e563b799603ee2f25.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05
loading file https://huggingface.co/facebook/bart-large/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/6e75e35f0bdd15870c98387e13b93a8e100237eb33ad99c36277a0562bd6d850.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/facebook/bart-large/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/facebook/bart-large/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/facebook/bart-large/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/1abf196c889c24daca2909359ca2090e5fcbfa21a9ea36d763f70adbafb500d7.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a

In [87]:
#encode dataset
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [98]:
example = encoded_dataset['train'][3]
print(example.keys())

dict_keys(['labels', 'input_ids', 'attention_mask'])


In [99]:
tokenizer.decode(example['input_ids'])

'<s>since I have a hard time swallowing this hs worked for me</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [100]:
example["labels"]

tensor([0., 1.])

In [101]:
encoded_dataset.set_format("torch")#set the format of our data as Pytorch tensor

### Load and Train the Model

Discard the head of the BART model and replace with a randomly initialized classification head. Fine-tune this new model head on my sequence classification task, transferring the knowledge of the pre-trained model to it.

In [114]:
from transformers import BartForSequenceClassification

#define the model
model = BartForSequenceClassification.from_pretrained("facebook/bart-large", problem_type="multi_label_classification", num_labels=2)

loading configuration file https://huggingface.co/facebook/bart-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3f12fb71b844fcb7d591fdd4e55027da90d7b5dd6aa5430ad00ec6d76585f26c.bc22f15dc7ba074ee0a60bdd34c5f2fe3b6d746f89e765303376c51aff04e260
Model config BartConfig {
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false

In [115]:
#declare training hyperparameters
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="Results/output",
                                learning_rate=1e-5,
                                evaluation_strategy="epoch",
                                per_device_train_batch_size=8,
                                per_device_eval_batch_size=8,
                                num_train_epochs=50,
                                weight_decay=0.01,
                                metric_for_best_model="f1",
                                # save_strategy="epoch",
                                # load_best_model_at_end=True

)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [117]:
#define a compute_metrics function that returns a dictionary with the desired metric values
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions))
  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs >= threshold)] = 1
  y_true = labels
  f1_micro_average = f1_score(y_true, y_pred,average="micro") 
  roc_auc = roc_auc_score(y_true, y_pred, average="micro") 
  accuracy = accuracy_score(y_true, y_pred)

  metrics = {"f1": f1_micro_average,
             "roc_auc": roc_auc,
             "accuracy": accuracy}
  return metrics

def compute_metrics(p: EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  result = multi_label_metrics(predictions=preds, labels=p.label_ids)
  return result

In [118]:
#trainer
from transformers import Trainer

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = encoded_dataset["train"],
    eval_dataset = encoded_dataset["val"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [119]:
#fine-tune BART model by calling train()
trainer.train()

***** Running training *****
  Num examples = 60
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 400


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.457087,0.77551,0.782132,0.5
2,No log,0.431511,0.851852,0.805643,0.6
3,No log,0.426629,0.881356,0.766458,0.65
4,No log,0.416391,0.888889,0.868339,0.7
5,No log,0.40933,0.857143,0.777429,0.6
6,No log,0.380491,0.862069,0.749216,0.6
7,No log,0.383259,0.888889,0.868339,0.7
8,No log,0.360529,0.877193,0.794671,0.65
9,No log,0.384077,0.851852,0.805643,0.6
10,No log,0.394162,0.892857,0.840125,0.7


***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 

TrainOutput(global_step=400, training_loss=0.07535930633544922, metrics={'train_runtime': 363.4636, 'train_samples_per_second': 8.254, 'train_steps_per_second': 1.101, 'total_flos': 815087227392000.0, 'train_loss': 0.07535930633544922, 'epoch': 50.0})

### Evaluation

In [120]:
#evaluate on val set
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 20
  Batch size = 8


{'epoch': 50.0,
 'eval_accuracy': 0.6,
 'eval_f1': 0.851851851851852,
 'eval_loss': 0.9764012098312378,
 'eval_roc_auc': 0.8056426332288401,
 'eval_runtime': 0.707,
 'eval_samples_per_second': 28.287,
 'eval_steps_per_second': 4.243}

### Inference

In [148]:
#evaluate the model performance on test set
num = 0
for i in range(len(dataset["test"])):
  text = dataset["test"][i]["Review"]
  side_effect = dataset["test"][i]["side_effect"]
  effectiveness = dataset["test"][i]["effectiveness"]

  encoding = tokenizer(text, return_tensors="pt")
  encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
  outputs = trainer.model(**encoding)

  #apply sigmoid and threshold
  sigmoid = torch.nn.Sigmoid()
  logits = outputs.logits
  probs = sigmoid(logits.squeeze().cpu())
  predictions = np.zeros(probs.shape)
  predictions[np.where(probs >= 0.5)] = 1
  predictions = predictions.astype(int)
  
  if predictions[0] == side_effect and predictions[1] == effectiveness:
    num += 1
  
print("The accuracy on test set is: {:.0%}".format(num/len(dataset["test"])))

The accuracy on test set is: 80%


### Save the model

In [149]:
trainer.save_model("path/to/model")#save the model

Saving model checkpoint to path/to/model
Configuration saved in path/to/model/config.json
Model weights saved in path/to/model/pytorch_model.bin
tokenizer config file saved in path/to/model/tokenizer_config.json
Special tokens file saved in path/to/model/special_tokens_map.json


In [None]:
#reload the model
# model = BartForSequenceClassification.from_pretrained("path/to/model", problem_type="multi_label_classification", num_labels=2)


### Review Classification on Original Data

In [162]:
#load original data
df = pd.read_csv('webmd.csv')
df['Date'] = df['Date'].astype('datetime64[ns]')
df = df.drop_duplicates('Reviews')
df = df[df['Condition'].str.contains("osteoporosis", case = False)]
df

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
4147,25-34,Osteoporosis,2010-03-16,lotensin,6883,4,3,this treatment is awsome. I love drugs.,2,Male,"Dizziness , lightheadedness , drowsiness, or ...",0
6192,45-54,Post-Menopausal Osteoporosis Prevention,2017-07-31,lopreeza,167327,5,2,After taking this drug for approx. 21 days I s...,3,Male,"Stomach upset, nausea / vomiting , bloating...",0
6207,45-54,Post-Menopausal Osteoporosis Prevention,2016-12-29,lopreeza,167327,5,5,I have taken this drug for almost 7 years with...,5,Female,"Stomach upset, nausea / vomiting , bloating...",1
18993,65-74,Osteoporosis,2010-05-26,liquid calcium + vitamin d,93396,5,5,since I have a hard time swallowing this hs wo...,5,Female,Constipation or stomach upset may occur.,1
61145,45-54,Osteoporosis,2012-01-19,oyster shell + d,94390,1,1,I have severe pain in my hand and muscle joint...,1,Female,Constipation or stomach upset may occur.,2
...,...,...,...,...,...,...,...,...,...,...,...,...
360231,55-64,Osteoporosis,2010-11-15,calcium 600 + vitamin d,93224,4,5,The only thing I have to say against these tab...,4,Female,Constipation or stomach upset may occur.,11
360237,55-64,Osteoporosis,2016-03-16,calcium 500 + vitamin d,151589,4,4,Just started taking this a month ago. Definite...,4,,Constipation or stomach upset may occur.,1
360264,55-64,Osteoporosis,2012-03-08,calcitrate + vit d,19844,3,3,"This supplement has caused much gas, bloating,...",1,Female,Constipation or stomach upset may occur.,2
360298,65-74,Osteoporosis,2012-07-24,calcitrate,22129,1,1,first dose: one tablet knocked me out cold for...,1,Female,Constipation and upset stomach may occur.,1


In [166]:
df = df.reset_index().drop(columns=["index"])
df

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,25-34,Osteoporosis,2010-03-16,lotensin,6883,4,3,this treatment is awsome. I love drugs.,2,Male,"Dizziness , lightheadedness , drowsiness, or ...",0
1,45-54,Post-Menopausal Osteoporosis Prevention,2017-07-31,lopreeza,167327,5,2,After taking this drug for approx. 21 days I s...,3,Male,"Stomach upset, nausea / vomiting , bloating...",0
2,45-54,Post-Menopausal Osteoporosis Prevention,2016-12-29,lopreeza,167327,5,5,I have taken this drug for almost 7 years with...,5,Female,"Stomach upset, nausea / vomiting , bloating...",1
3,65-74,Osteoporosis,2010-05-26,liquid calcium + vitamin d,93396,5,5,since I have a hard time swallowing this hs wo...,5,Female,Constipation or stomach upset may occur.,1
4,45-54,Osteoporosis,2012-01-19,oyster shell + d,94390,1,1,I have severe pain in my hand and muscle joint...,1,Female,Constipation or stomach upset may occur.,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1352,55-64,Osteoporosis,2010-11-15,calcium 600 + vitamin d,93224,4,5,The only thing I have to say against these tab...,4,Female,Constipation or stomach upset may occur.,11
1353,55-64,Osteoporosis,2016-03-16,calcium 500 + vitamin d,151589,4,4,Just started taking this a month ago. Definite...,4,,Constipation or stomach upset may occur.,1
1354,55-64,Osteoporosis,2012-03-08,calcitrate + vit d,19844,3,3,"This supplement has caused much gas, bloating,...",1,Female,Constipation or stomach upset may occur.,2
1355,65-74,Osteoporosis,2012-07-24,calcitrate,22129,1,1,first dose: one tablet knocked me out cold for...,1,Female,Constipation and upset stomach may occur.,1


In [172]:
side_effect = []
effectiveness = []
for i in range(len(df["Reviews"])):
  text = df["Reviews"][i]
  encoding = tokenizer(text, return_tensors="pt")
  encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
  outputs = trainer.model(**encoding)

  #apply sigmoid and threshold
  sigmoid = torch.nn.Sigmoid()
  logits = outputs.logits
  probs = sigmoid(logits.squeeze().cpu())
  predictions = np.zeros(probs.shape)
  predictions[np.where(probs >= 0.5)] = 1
  predictions = predictions.astype(int)
  side_effect.append(predictions[0])
  effectiveness.append(predictions[1])

In [177]:
df["side_effect"] = side_effect
df["effectiveness"] = effectiveness

In [179]:
df.to_csv("preprocessed_osteoporosis.csv", index=False)