In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import keras
from tqdm import tqdm
import pickle
from keras.models import Model
import keras.backend as K
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
import itertools
from keras.models import load_model
from sklearn.utils import shuffle
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig


In [6]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_stopwords_shortwords(w):
    stopwords_list=stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words) 

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w

In [8]:
#import data
pharm_data = pd.read_csv('pharmacy_dataset_reduced.csv')
data = pharm_data.copy()
data.head()
#len(data)

Unnamed: 0,Website,Body,Header,Footer,Image Urls,Accredited,Complete_html,Zipcode,Valid_phone
0,https://acariahealth.envolvehealth.com,Skip to Main Content Home Contact Insights Sea...,Home Contact Insights Search Search About Over...,Locations Referral Forms Careers Disclaimer HI...,['https://acariahealth.envolvehealth.com/conte...,0,Skip to Main Content Home Contact Insights Sea...,0,1
1,https://alto.com,Shop Alto Essentials to add pharmacy products ...,Shop Alto Essentials to add pharmacy products ...,Our Story Careers Drive for Alto For Providers...,['https://images.prismic.io/alto/176341aa-258d...,0,Shop Alto Essentials to add pharmacy products ...,0,1
2,https://www.amberpharmacy.com,Loading... Skip to navigation Skip to main con...,,About Careers 340B Hospital Program Locations ...,['https://www.amberpharmacy.com/wp-content/the...,0,,0,0
3,https://www.aoncology.com,For Patients Careers Blog Practice Success Pat...,For Patients Careers Blog Practice Success Pat...,"© 2020 American Oncology Network, LLC. All Rig...",['https://dc.ads.linkedin.com/collect/?pid=944...,0,For Patients Careers Blog Practice Success Pat...,1,0
4,https://www.birdirx.com,Loading... × Close alert Fly with Birdi! Free...,,Support Hours of Operation Monday through Frid...,"[""https://www.birdirx.comdata:image/svg+xml;ch...",0,,0,0


### Data Cleaning

In [12]:
#combine columns into new column containing all text
data['text'] = data['Body'].astype(str) + data['Header'].astype(str) + data['Footer'].astype(str)
data.head()

Unnamed: 0,Website,Body,Header,Footer,Image Urls,Accredited,Complete_html,Zipcode,Valid_phone,text
0,https://acariahealth.envolvehealth.com,Skip to Main Content Home Contact Insights Sea...,Home Contact Insights Search Search About Over...,Locations Referral Forms Careers Disclaimer HI...,['https://acariahealth.envolvehealth.com/conte...,0,Skip to Main Content Home Contact Insights Sea...,0,1,Skip to Main Content Home Contact Insights Sea...
1,https://alto.com,Shop Alto Essentials to add pharmacy products ...,Shop Alto Essentials to add pharmacy products ...,Our Story Careers Drive for Alto For Providers...,['https://images.prismic.io/alto/176341aa-258d...,0,Shop Alto Essentials to add pharmacy products ...,0,1,Shop Alto Essentials to add pharmacy products ...
2,https://www.amberpharmacy.com,Loading... Skip to navigation Skip to main con...,,About Careers 340B Hospital Program Locations ...,['https://www.amberpharmacy.com/wp-content/the...,0,,0,0,Loading... Skip to navigation Skip to main con...
3,https://www.aoncology.com,For Patients Careers Blog Practice Success Pat...,For Patients Careers Blog Practice Success Pat...,"© 2020 American Oncology Network, LLC. All Rig...",['https://dc.ads.linkedin.com/collect/?pid=944...,0,For Patients Careers Blog Practice Success Pat...,1,0,For Patients Careers Blog Practice Success Pat...
4,https://www.birdirx.com,Loading... × Close alert Fly with Birdi! Free...,,Support Hours of Operation Monday through Frid...,"[""https://www.birdirx.comdata:image/svg+xml;ch...",0,,0,0,Loading... × Close alert Fly with Birdi! Free...


In [13]:
#rename Accredited as 'label'
data=data.rename(columns = {'Accredited': 'label'}, inplace = False)
data.head()

Unnamed: 0,Website,Body,Header,Footer,Image Urls,label,Complete_html,Zipcode,Valid_phone,text
0,https://acariahealth.envolvehealth.com,Skip to Main Content Home Contact Insights Sea...,Home Contact Insights Search Search About Over...,Locations Referral Forms Careers Disclaimer HI...,['https://acariahealth.envolvehealth.com/conte...,0,Skip to Main Content Home Contact Insights Sea...,0,1,Skip to Main Content Home Contact Insights Sea...
1,https://alto.com,Shop Alto Essentials to add pharmacy products ...,Shop Alto Essentials to add pharmacy products ...,Our Story Careers Drive for Alto For Providers...,['https://images.prismic.io/alto/176341aa-258d...,0,Shop Alto Essentials to add pharmacy products ...,0,1,Shop Alto Essentials to add pharmacy products ...
2,https://www.amberpharmacy.com,Loading... Skip to navigation Skip to main con...,,About Careers 340B Hospital Program Locations ...,['https://www.amberpharmacy.com/wp-content/the...,0,,0,0,Loading... Skip to navigation Skip to main con...
3,https://www.aoncology.com,For Patients Careers Blog Practice Success Pat...,For Patients Careers Blog Practice Success Pat...,"© 2020 American Oncology Network, LLC. All Rig...",['https://dc.ads.linkedin.com/collect/?pid=944...,0,For Patients Careers Blog Practice Success Pat...,1,0,For Patients Careers Blog Practice Success Pat...
4,https://www.birdirx.com,Loading... × Close alert Fly with Birdi! Free...,,Support Hours of Operation Monday through Frid...,"[""https://www.birdirx.comdata:image/svg+xml;ch...",0,,0,0,Loading... × Close alert Fly with Birdi! Free...


In [14]:
#reduce dataset to only the columns we need, 'label' and 'text'
data = data.drop(['Website', 'Body', 'Header', 'Footer', 'Image Urls', 'Complete_html', 'Zipcode', 'Valid_phone'], axis=1)

In [15]:
#Sentences contain the entire text data and labels contain all the corresponding labels
data.head()

Unnamed: 0,label,text
0,0,Skip to Main Content Home Contact Insights Sea...
1,0,Shop Alto Essentials to add pharmacy products ...
2,0,Loading... Skip to navigation Skip to main con...
3,0,For Patients Careers Blog Practice Success Pat...
4,0,Loading... × Close alert Fly with Birdi! Free...


In [16]:
len(data)

72

In [17]:
#drop any missing values
data = data.dropna()

In [18]:
len(data)

72

In [20]:
#reset index
data=data.reset_index(drop=True) 
#shuffle data
data = shuffle(data)   
data.head()

Unnamed: 0,label,text
62,1,Log In Register Checkout Languages: Currencies...
44,1,www.Drugscom 2015. All Rights Reserved.nannan
24,0,Skip to the main content. Senderra Institute P...
52,1,Skip to content Menu ทางเข้า sbobet แทงบอลสบาย...
6,0,Skip to main content Order Status Check Drug C...


In [22]:
#apply the function to clean the text - all lower case, strip spaces and characters 
data['text']=data['text'].map(preprocess_sentence)    
data.head()

Unnamed: 0,label,text
62,1,log register checkout languages currencies dol...
44,1,www drugscom rights reserved nannan
24,0,skip main content senderra institute prescribe...
52,1,skip content menu sbobet blog search search su...
6,0,skip main content order status check drug cost...


### Setting up a pre-trained BERT model for fine-tuning
#### Load BERT tokenizer and BERT model

In [23]:
#define the number of unique classes under 'label'
num_classes=len(data.label.unique())
num_classes

2

In [25]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', 
                                                             num_labels=num_classes)

loading file vocab.txt from cache at C:\Users\haley/.cache\huggingface\hub\models--bert-base-uncased\snapshots\5546055f03398095e385d7dc625e636cc8910bf2\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\haley/.cache\huggingface\hub\models--bert-base-uncased\snapshots\5546055f03398095e385d7dc625e636cc8910bf2\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\haley/.cache\huggingface\hub\models--bert-base-uncased\snapshots\5546055f03398095e385d7dc625e636cc8910bf2\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
loading weights file tf_model.h5 from cache at C:\Users\haley/.cache\huggingface\hub\models--bert-base-uncased\snapshots\5546055f03398095e385d7dc625e636cc8910bf2\tf_model.h5
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
test_sent = "this is a test for our model."
tokenized_sequence= bert_tokenizer.encode_plus(test_sent,
                                               add_special_tokens = True, 
                                               pad_to_max_length = True, 
                                               return_attention_mask = True)

In [29]:
#tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly
tokenized_sequence

{'input_ids': [101, 2023, 2003, 1037, 3231, 2005, 2256, 2944, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [30]:
#Decode sentence
bert_tokenizer.decode(tokenized_sequence['input_ids'])

'[CLS] this is a test for our model. [SEP]'

In [35]:
#turn text and label column into variabels for tokenizer 
text = data['text']
labels = data['label']
len(text), len(labels) #make sure len is the same

(72, 72)

In [55]:
labels.dtype

dtype('int64')

In [56]:
text.dtype

dtype('O')

In [63]:
text = text.astype(str)

In [64]:
text.head()

62    log register checkout languages currencies dol...
44                  www drugscom rights reserved nannan
24    skip main content senderra institute prescribe...
52    skip content menu sbobet blog search search su...
6     skip main content order status check drug cost...
Name: text, dtype: object

#### Encoding of the text data using BERT Tokenizer and obtaining the input_ids and attentions masks to feed into the model

In [91]:
#Load the sentences into the BERT Tokenizer
#BERT Tokenizer returns a dictionary - get input ds and the attention masks
#Convert encoding to NumPy arrays
input_ids = []
attention_masks = []

for sent in text:
    #print(sent)
    bert_inp = bert_tokenizer.encode_plus(sent, return_attention_mask = True)
    #print(bert_inp)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])
    #break
#input_ids.astype('object')
input_ids = np.array(input_ids, dtype=object)
attention_masks = np.array(attention_masks, dtype=object)
labels = np.array(labels)

In [93]:
input_ids.dtype

dtype('O')

In [86]:
len(input_ids), len(attention_masks), len(labels) #make sure they're all the same length

(72, 72, 72)

### Split Data - Training & Validation
- (80-20 split)

In [87]:
#Split into the random train and validation subsets
train_inp, val_inp, train_label, val_label, train_mask, val_mask = train_test_split(input_ids,
                                                                                    labels,
                                                                                    attention_masks,
                                                                                    test_size=0.2)

len(train_inp), len(val_inp), len(train_label), len(val_label), len(train_mask), len(val_mask)

(57, 15, 57, 15, 57, 15)

### Loss, Metric, Optmizer

In [88]:
#a 'callback' can perform actions at various stages of training - write logs, save model to disk
    #early stopping, view of internal state/stats during training
'''
log_dir='tensorboard_data/tb_bert'
model_save_path='./models/bert_model.h5'

callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                                save_weights_only=True,
                                                monitor='val_loss',
                                                mode='min',
                                                save_best_only=True),
             keras.callbacks.TensorBoard(log_dir=log_dir)]

print('\nBert Model', bert_model.summary())
'''
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

### Train Model


In [98]:
valid_dataset = ([val_inp, val_mask], val_label)

In [99]:
train_model = bert_model.fit([train_inp, train_mask],
                             train_label,
                             batch_size=32,
                             epochs=4,
                             validation_data=valid_dataset,
                             callbacks=callbacks)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).