In [0]:
!pip install fast-bert

Collecting fast-bert
  Downloading https://files.pythonhosted.org/packages/68/34/1ba59457a620ebd526059ca2ab46b1025b49fe78f7477efbd0920b301f9a/fast_bert-0.1.2-py3-none-any.whl
Collecting pytorch-pretrained-bert (from fast-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 19.7MB/s eta 0:00:01[K     |█████▎                          | 20kB 27.5MB/s eta 0:00:01[K     |████████                        | 30kB 35.0MB/s eta 0:00:01[K     |██████████▋                     | 40kB 38.9MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 23.3MB/s eta 0:00:01[K     |███████████████▉                | 61kB 26.3MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 28.7MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 24.2MB/s eta 0:00:01[K     |███████████████████████▉     

In [0]:
# Build NVIDIA apex from source with its C++ and CUDA extensions; the imports
# cell does `import apex` and the run config enables fp16.
!git clone https://github.com/NVIDIA/apex.git
# NOTE: %cd changes the kernel's working directory (to /content/apex per the
# output below), so relative paths in later cells resolve from there.
%cd apex
# Alternative install route, left disabled:
#!python setup.py install --cuda_ext --cpp_ext
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

Cloning into 'apex'...
remote: Enumerating objects: 184, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 4820 (delta 94), reused 124 (delta 64), pack-reused 4636[K
Receiving objects: 100% (4820/4820), 8.80 MiB | 2.35 MiB/s, done.
Resolving deltas: 100% (3106/3106), done.
/content/apex
  cmdoptions.check_install_build_global(options)
Created temporary directory: /tmp/pip-ephem-wheel-cache-kqjdsd7i
Created temporary directory: /tmp/pip-req-tracker-m5xash62
Created requirements tracker '/tmp/pip-req-tracker-m5xash62'
Created temporary directory: /tmp/pip-install-y45bke30
Processing /content/apex
  Created temporary directory: /tmp/pip-req-build-if3t1ukk
  Added file:///content/apex to build tracker '/tmp/pip-req-tracker-m5xash62'
    Running setup.py (path:/tmp/pip-req-build-if3t1ukk/setup.py) egg_info for package from file:///content/apex
    Running command python setup.py egg_info
    torch.__version__  = 

Download the configuration file (`bert_config.json`) of the uncased BERT-base model to the Colab filesystem, to be used for fine-tuning.

In [0]:
# Copy bert_config.json for the uncased L-12/H-768/A-12 model from the public
# bucket to the filesystem root; only the config file is fetched here.
!gsutil cp gs://bert_models/2018_10_18/uncased_L-12_H-768_A-12/bert_config.json /bert_config.json

In [0]:
import pandas as pd
import os
from pathlib import Path
import logging
import datetime
import sys

import torch
import apex

from pytorch_pretrained_bert.tokenization import BertTokenizer

from fast_bert.data import BertDataBunch
from fast_bert.learner import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [0]:
# --- Filesystem locations ---------------------------------------------------
# Data, labels and model artifacts all sit directly under /content/; logs go
# to /tmp/.
DATA_PATH = Path('/content/')    # train and validation files
LABEL_PATH = Path('/content/')   # labels file
MODEL_PATH = Path('/content/')   # where model artifacts are stored
LOG_PATH = Path('/tmp/')         # where log files are stored

# Location passed to the predictor as its pretrained BERT path.
BERT_PRETRAINED_PATH = '/bert_config.json'

# --- Run settings -----------------------------------------------------------
# Looked up by key in the tokenizer, databunch, learner and logging cells.
args = dict(
    run_text="multilabel toxic comments with freezable layers",
    max_seq_length=512,
    do_lower_case=True,
    train_batch_size=16,
    learning_rate=5e-6,
    num_train_epochs=4.0,
    warmup_proportion=0.1,
    local_rank=-1,
    gradient_accumulation_steps=1,
    fp16=True,
    loss_scale=128,
)

Create the tokenizer for the text

In [0]:
# Load the 'bert-base-uncased' tokenizer; lower-casing follows the run config.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=args['do_lower_case'],
)

In [0]:
# GPU setup (this cell assumes a CUDA device is being used).
torch.cuda.empty_cache()
device = torch.device('cuda')

# True only when more than one GPU is visible to torch.
multi_gpu = torch.cuda.device_count() > 1

In [0]:
# Sense check: load the training sample and show the first ten rows flagged
# as insults.
# NOTE(review): the frame is named `email` although the columns shown are
# comment-toxicity labels; the name is kept in case other cells use it.
email = pd.read_csv('/content/train_sample.csv')
email.loc[email['insult'] == 1].head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
64,6625c4ada3b563c6,"You masturbate to photos of Ronald Reagan, don...",1,0,0,0,1,0
76,44114b78f9025209,That was a mistake the stupid asses at Funimat...,1,0,1,0,1,0
85,2c6e8f2c16d37d75,YOU ARE REALLY ANNOYING!!!! gO SCREW YOUR LESB...,1,0,1,0,1,1
164,40a358e41dcc21f4,IM GOING TO KILL YOU ALL!!!!!!!!!!!!!!!!!\r\nI...,1,1,0,1,1,0
196,ff15116e1bf2637c,"I WILL FUCK YOU UP, YOU IRISH PANSY \r\n\r\nFU...",1,0,1,1,1,1
220,d6cf5e3993585170,You're gonna get it son... \r\n\r\nyou horrid ...,1,0,1,0,1,0
228,8b89a833bd444b4b,", obviously Lan3y knows more about the band th...",1,0,0,0,1,0
239,8a459a7cbb2373e7,"Russian bitches, hands off the truth, get out ...",1,0,1,0,1,1
308,2bb86acd9ffa1ebb,Apologies \r\n\r\nThis IP is from a school and...,0,0,0,0,1,0
315,ed6850821b939a73,User Phanatical is a scum bag dogshit liar. ch...,1,0,1,0,1,0


In [0]:
# The six target columns of the multi-label task.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Build the data bunch from the train/validation CSVs and the labels file.
# NOTE(review): multi_gpu is fixed to False here rather than using the
# `multi_gpu` flag computed in the device cell; confirm this is intended.
databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    tokenizer,
    train_file='train_sample.csv',
    val_file='val_sample.csv',
    label_file='labels.csv',
    text_col="comment_text",
    label_col=labels,
    bs=args['train_batch_size'],
    maxlen=args['max_seq_length'],
    multi_gpu=False,
    multi_label=True,
)

In [0]:
# Check the databunch: count the labels it exposes (6 in the recorded run,
# matching the six label columns listed above).
num_labels = len(databunch.labels)
print(num_labels)

6


In [0]:
# Logging setup: every run gets its own log file under LOG_PATH, named after
# the start timestamp and the run description.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

# Send INFO-and-above records both to the log file and to stdout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout),
    ],
)

# Root logger, passed to the learner below.
logger = logging.getLogger()

In [0]:
# Metrics reported after each evaluation pass, as name/function pairs.
metrics = [
    {'name': 'accuracy_thresh', 'function': accuracy_thresh},
    {'name': 'roc_auc', 'function': roc_auc},
    {'name': 'fbeta', 'function': fbeta},
    {'name': 'accuracy_single', 'function': accuracy_multilabel},
]

# Multi-label learner on top of 'bert-base-uncased'; no fine-tuned weights are
# loaded, and the fp16 / loss-scale settings come from the run config.
learner = BertLearner.from_pretrained_model(
    databunch,
    'bert-base-uncased',
    metrics,
    device,
    logger,
    finetuned_wgts_path=None,
    is_fp16=args['fp16'],
    loss_scale=args['loss_scale'],
    multi_gpu=False,
    multi_label=True,
)

06/30/2019 06:13:31 - INFO - pytorch_pretrained_bert.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz not found in cache, downloading to /tmp/tmpb3sq0xwe


100%|██████████| 407873900/407873900 [00:32<00:00, 12503405.18B/s]

06/30/2019 06:14:05 - INFO - pytorch_pretrained_bert.file_utils -   copying /tmp/tmpb3sq0xwe to cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba





06/30/2019 06:14:06 - INFO - pytorch_pretrained_bert.file_utils -   creating metadata file for /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
06/30/2019 06:14:06 - INFO - pytorch_pretrained_bert.file_utils -   removing temp file /tmp/tmpb3sq0xwe
06/30/2019 06:14:06 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
06/30/2019 06:14:06 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpr8pi26w3
06/30/2019 06:14:11 -

In [0]:
# Train for 8 epochs (the log below reports epochs 0-7) using the learning
# rate from the run config and the "warmup_cosine_hard_restarts" schedule.
# NOTE(review): the epoch count is hard-coded; args["num_train_epochs"] (4.0)
# is not used here — confirm which value is intended.
learner.fit(8, lr=args['learning_rate'], 
            schedule_type="warmup_cosine_hard_restarts")

06/30/2019 06:41:28 - INFO - root -   Loss after epoch 0 - 0.07366773817274305
06/30/2019 06:41:28 - INFO - root -   Running evaluation


06/30/2019 06:42:01 - INFO - root -   Eval results:
06/30/2019 06:42:01 - INFO - root -     eval_loss = 0.08563353523375496
06/30/2019 06:42:01 - INFO - root -     metrics = {'accuracy_thresh': 0.9763333201408386, 'roc_auc': 0.9625350761736768, 'fbeta': 0.049258552491664886, 'accuracy_single': 0.997}
06/30/2019 06:42:01 - INFO - root -   --------------------------------------------------------------------------------
06/30/2019 06:42:44 - INFO - root -   Loss after epoch 1 - 0.06239464169456845
06/30/2019 06:42:44 - INFO - root -   Running evaluation


06/30/2019 06:43:17 - INFO - root -   Eval results:
06/30/2019 06:43:17 - INFO - root -     eval_loss = 0.07458592974950397
06/30/2019 06:43:17 - INFO - root -     metrics = {'accuracy_thresh': 0.9764999747276306, 'roc_auc': 0.9620443957248562, 'fbeta': 0.0379931703209877, 'accuracy_single': 0.996}
06/30/2019 06:43:17 - INFO - root -   --------------------------------------------------------------------------------
06/30/2019 06:43:59 - INFO - root -   Loss after epoch 2 - 0.05599636501736111
06/30/2019 06:43:59 - INFO - root -   Running evaluation


06/30/2019 06:44:32 - INFO - root -   Eval results:
06/30/2019 06:44:32 - INFO - root -     eval_loss = 0.07405938042534722
06/30/2019 06:44:32 - INFO - root -     metrics = {'accuracy_thresh': 0.9771666526794434, 'roc_auc': 0.960621084688454, 'fbeta': 0.03656427189707756, 'accuracy_single': 0.992}
06/30/2019 06:44:32 - INFO - root -   --------------------------------------------------------------------------------
06/30/2019 06:45:15 - INFO - root -   Loss after epoch 3 - 0.05253019787016369
06/30/2019 06:45:15 - INFO - root -   Running evaluation


06/30/2019 06:45:48 - INFO - root -   Eval results:
06/30/2019 06:45:48 - INFO - root -     eval_loss = 0.07406349787636408
06/30/2019 06:45:48 - INFO - root -     metrics = {'accuracy_thresh': 0.9766666293144226, 'roc_auc': 0.9633596316181552, 'fbeta': 0.044891662895679474, 'accuracy_single': 0.996}
06/30/2019 06:45:48 - INFO - root -   --------------------------------------------------------------------------------
06/30/2019 06:46:31 - INFO - root -   Loss after epoch 4 - 0.05143519810267857
06/30/2019 06:46:31 - INFO - root -   Running evaluation


06/30/2019 06:47:04 - INFO - root -   Eval results:
06/30/2019 06:47:04 - INFO - root -     eval_loss = 0.07406349787636408
06/30/2019 06:47:04 - INFO - root -     metrics = {'accuracy_thresh': 0.9766666293144226, 'roc_auc': 0.9633596316181552, 'fbeta': 0.044891662895679474, 'accuracy_single': 0.996}
06/30/2019 06:47:04 - INFO - root -   --------------------------------------------------------------------------------
06/30/2019 06:47:47 - INFO - root -   Loss after epoch 5 - 0.05135309128534226
06/30/2019 06:47:47 - INFO - root -   Running evaluation


06/30/2019 06:48:20 - INFO - root -   Eval results:
06/30/2019 06:48:20 - INFO - root -     eval_loss = 0.07406349787636408
06/30/2019 06:48:20 - INFO - root -     metrics = {'accuracy_thresh': 0.9766666293144226, 'roc_auc': 0.9633596316181552, 'fbeta': 0.044891662895679474, 'accuracy_single': 0.996}
06/30/2019 06:48:20 - INFO - root -   --------------------------------------------------------------------------------
06/30/2019 06:49:03 - INFO - root -   Loss after epoch 6 - 0.05138772631448413
06/30/2019 06:49:03 - INFO - root -   Running evaluation


06/30/2019 06:49:36 - INFO - root -   Eval results:
06/30/2019 06:49:36 - INFO - root -     eval_loss = 0.07406349787636408
06/30/2019 06:49:36 - INFO - root -     metrics = {'accuracy_thresh': 0.9766666293144226, 'roc_auc': 0.9633596316181552, 'fbeta': 0.044891662895679474, 'accuracy_single': 0.996}
06/30/2019 06:49:36 - INFO - root -   --------------------------------------------------------------------------------
06/30/2019 06:50:18 - INFO - root -   Loss after epoch 7 - 0.05100286574590774
06/30/2019 06:50:18 - INFO - root -   Running evaluation


06/30/2019 06:50:51 - INFO - root -   Eval results:
06/30/2019 06:50:51 - INFO - root -     eval_loss = 0.07406349787636408
06/30/2019 06:50:51 - INFO - root -     metrics = {'accuracy_thresh': 0.9766666293144226, 'roc_auc': 0.9633596316181552, 'fbeta': 0.044891662895679474, 'accuracy_single': 0.996}
06/30/2019 06:50:51 - INFO - root -   --------------------------------------------------------------------------------


In [0]:
# Test prediction: run the trained learner on a few hand-written sentences,
# mixing hostile, harmless and nonsense input.
texts = [
  "this jerk beats me to death",
  "Hi sweetie, wanna meet me tonight?",
  "You better come here, otherwise I will kill your wife!",
  "go fuck yourself!",
  "Let me help you fix the problem here",
  "uisdiouioau roiauworiawu"
]

# One prediction entry per input text, in the same order.
predictions = learner.predict_batch(texts)

# Print each sentence next to its (label, score) pairs.
for sentence,classes in zip(texts,predictions):
  print(sentence+': ',classes)

this jerk beats me to death:  [('toxic', 0.69921875), ('obscene', 0.517578125), ('insult', 0.456298828125), ('severe_toxic', 0.36669921875), ('threat', 0.35302734375), ('identity_hate', 0.24609375)]
Hi sweetie, wanna meet me tonight?:  [('toxic', 0.05877685546875), ('obscene', 0.03277587890625), ('insult', 0.03192138671875), ('identity_hate', 0.022705078125), ('severe_toxic', 0.0205230712890625), ('threat', 0.017242431640625)]
You better come here, otherwise I will kill your wife!:  [('toxic', 0.63525390625), ('obscene', 0.4345703125), ('insult', 0.384033203125), ('severe_toxic', 0.280029296875), ('threat', 0.261962890625), ('identity_hate', 0.178955078125)]
go fuck yourself!:  [('toxic', 0.7021484375), ('obscene', 0.541015625), ('insult', 0.4697265625), ('severe_toxic', 0.39208984375), ('threat', 0.3759765625), ('identity_hate', 0.2666015625)]
Let me help you fix the problem here:  [('toxic', 0.05377197265625), ('insult', 0.03192138671875), ('obscene', 0.029541015625), ('identity_hate

In [0]:
from fast_bert.prediction import BertClassificationPredictor

# Reload the model for inference through the standalone predictor.
# multi_label must match training: the databunch and learner in this notebook
# were both created with multi_label=True, so the predictor is as well.
# NOTE(review): BERT_PRETRAINED_PATH is '/bert_config.json' (a single config
# file); confirm the predictor accepts that as its pretrained path.
predictor = BertClassificationPredictor(model_path=MODEL_PATH, pretrained_path=BERT_PRETRAINED_PATH,
                                        label_path=LABEL_PATH, multi_label=True)

# Single prediction: predict() takes one text.
single_prediction = predictor.predict("test this nonsense for me")

# Batch predictions: a list of texts goes through predict_batch(), mirroring
# the learner.predict_batch(texts) call used earlier in this notebook.
texts = [
  "this jerk beats me to death",
  "Hi sweetie, wanna meet me tonight?"
]

multiple_predictions = predictor.predict_batch(texts)

# **Another Test with Wrapper for TensorFlow**

In [0]:
# Install the bert-text package, which provides `run_on_dfs` used below.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install bert-text

In [0]:
from bert_text import run_on_dfs
import pickle

In [0]:
# Keyword arguments forwarded to run_on_dfs(train, test, **myparam):
# the text and label column names, plus the training hyper-parameters.
myparam = {
    "DATA_COLUMN": "text",
    "LABEL_COLUMN": "spam",
    "LEARNING_RATE": 2e-5,
    "NUM_TRAIN_EPOCHS": 3
}

# The incomplete `train = ` statement that used to end this cell was a
# SyntaxError; `train` and `test` are assigned by the data-loading cells below.

In [0]:
# Download the pickled IMDB sample into the current working directory.
!wget https://github.com/wshuyi/info-5731-public/raw/master/imdb-sample.pickle

In [0]:
# Load the (train, test) pair stored in the downloaded pickle.
# SECURITY NOTE: pickle.load can execute arbitrary code; only unpickle files
# from a source you trust.
# NOTE(review): the next cell reassigns `train` and `test` from CSV files,
# so running both cells in order discards what is loaded here.
with open("imdb-sample.pickle", 'rb') as f:
  train, test = pickle.load(f)

In [0]:
# Read the training and validation splits from CSV files at the filesystem
# root; this replaces any earlier `train` / `test` values.
train = pd.read_csv('/train.csv')
test = pd.read_csv('/vals.csv')

In [0]:
# Quick size check of the training data.
train.shape

In [0]:
# Raise TensorFlow's log verbosity to INFO.
# NOTE(review): `tf.logging` looks like the TensorFlow 1.x API — confirm the
# installed TensorFlow version still provides it.
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
# Run bert-text on the train/test frames with the settings in `myparam`;
# the call returns a result object and the estimator used for predictions below.
result, estimator = run_on_dfs(train, test, **myparam)

In [0]:
# Display the result returned by run_on_dfs.
result

In [0]:
def getPrediction(in_sentences):
  """Classify each sentence with the module-level `estimator`.

  Returns a list of (sentence, probabilities, label name) tuples, one per
  prediction the estimator yields; the label name is "Negative" or "Positive".

  NOTE(review): `run_classifier`, `label_list` and `MAX_SEQ_LENGTH` are not
  defined in the visible cells of this notebook, and the only visible
  `tokenizer` is the pytorch_pretrained_bert one created earlier. Confirm
  these names are provided (and compatible) before running.
  """
  labels = ["Negative", "Positive"]

  # guid="" and label=0 are placeholders; only text_a carries real data.
  input_examples = []
  for text in in_sentences:
    input_examples.append(
        run_classifier.InputExample(guid="", text_a=text, text_b=None, label=0))

  input_features = run_classifier.convert_examples_to_features(
      input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(
      features=input_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)

  # Pair every input sentence with its probabilities and readable label.
  results = []
  for sentence, prediction in zip(in_sentences, predictions):
    results.append(
        (sentence, prediction['probabilities'], labels[prediction['labels']]))
  return results

In [0]:
# Sample movie-review sentences, from clearly negative to clearly positive,
# to run through getPrediction.
pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

In [0]:
# Classify the sample sentences.
# NOTE: this rebinds `predictions`, which earlier held the fast-bert results.
predictions = getPrediction(pred_sentences)