# About

See what BERT is all about. Apparently lots of these blogs use pytorch

### News Topic Similarity Measure using Pretrained BERT Model

https://medium.com/the-artificial-impostor/news-topic-similarity-measure-using-pretrained-bert-model-1dbfe6a66f1d

https://github.com/ceshine/pytorch-pretrained-BERT/blob/master/notebooks/Next%20Sentence%20Prediction.ipynb

https://anaconda.org/conda-forge/pytorch-pretrained-bert

various implementations of BERT

https://pypi.org/project/pytorch-pretrained-bert/

get the pytorch

https://github.com/huggingface/transformers


### A Simple Guide On Using BERT for Binary Text Classification.

https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04


### BERT in Keras with Tensorflow hub

https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

https://github.com/strongio/keras-bert

https://github.com/strongio/keras-bert/blob/master/keras-bert.ipynb

https://tfhub.dev/google


### Understanding searches better than ever before

google's white paper

https://www.blog.google/products/search/search-language-understanding-bert


# This part is a replication of the news topic similarity blog post

In [1]:
import json
import gc

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from pytorch_pretrained_bert.modeling import BertForNextSentencePrediction
from pytorch_pretrained_bert.tokenization import BertTokenizer
# from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear, SCHEDULES
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# data prep
# we're not using NYTimes article feed
# instead of the News Title vs News Body, we have Question Title vs Question Body

p_questions = "/mnt/disks/disk-1-w210-data/data/interim/stackexchange/stackoverflow/PostQuestionsFiltered_V3_parsed.tsv"
questions = pd.read_csv(p_questions, delimiter = "\t", encoding = "utf-8")

p_answers = "/mnt/disks/disk-1-w210-data/data/interim/stackexchange/stackoverflow/PostAnswersFiltered_V3_parsed.tsv"
answers = pd.read_csv(p_answers, delimiter = "\t", encoding = "utf-8")

  interactivity=interactivity, compiler=compiler, result=result)


# sample

In [4]:
t1 = questions[questions["accepted_answer_id"] > 0]
questions_sample = t1.sample(600, random_state = 20191102)

In [5]:
questions_sample.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,...,seaborn,geospatial,stata,plyr,pie-chart,graphviz,spss,diagram,qlikview,altair
557822,557822,557822,17108288,Why is my CSS :hover @keyframes animation not ...,<p>I am a newbie. Why is this code not working...,17108610.0,1,18,,2013-06-14 12:16:20.233000+00:00,...,0,0,0,0,0,0,0,0,0,0
209952,209952,209952,13656097,Mixing line and scatterplot in ggplot,<p>I've looked around a fair bit but I am stum...,13656277.0,2,0,,2012-12-01 03:35:01.673000+00:00,...,0,0,0,0,0,0,0,0,0,0
275911,275911,275911,395599,Java graphic library for multicoloured text,<p>I would like to know the recommended librar...,395679.0,4,0,,2008-12-27 23:44:33.840000+00:00,...,0,0,0,0,0,0,0,0,0,0


# corpus

In [6]:
# constructs corpus
# with question id, title, accepted answer id, answer body
def construct_corpus(questions, answers = answers):
    t1 = questions[["id", "title", "tags", "accepted_answer_id"]].rename(columns = {"id" : "q_id", "title" : "q_title"})
    t2 = answers[["id", "body"]].rename(columns = {"id" : "a_id", "body" : "a_body"})
    t3 = t1.merge(t2, left_on = "accepted_answer_id", right_on = "a_id", how = "inner").drop(columns = "a_id")
    return(t3)

In [7]:
corpus = construct_corpus(questions_sample)
corpus.head(3)

Unnamed: 0,q_id,q_title,tags,accepted_answer_id,a_body
0,17108288,Why is my CSS :hover @keyframes animation not ...,htmlcss3cross-browsercss-animations,17108610.0,<p>here it is:</p>\r\n\r\n<p>u have to add ven...
1,13656097,Mixing line and scatterplot in ggplot,rggplot2,13656277.0,"<p>As @MattBagg has pointed out, this issue is..."
2,395599,Java graphic library for multicoloured text,javagraphics,395679.0,<p>I'm assuming you're rendering text to an ar...


In [13]:
# writes corpus to interim location
corpus.to_csv("../data/interim/yyq_sample_corpus.tsv")
corpus.to_excel("../data/interim/yyq_sample_corpus.xlsx")

# helper functions from the blog

In [8]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, target):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.target = target

In [9]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [10]:
def convert_sentence_pair(titles, descs, max_seq_length, tokenizer):
    features = []
    for (ex_index, (title, desc)) in enumerate(zip(titles, descs)):
        tokens_a = tokenizer.tokenize(title)

        tokens_b = None
        tokens_b = tokenizer.tokenize(desc)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        features.append(
                InputFeatures(
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    target=1
        ))
    return features

In [11]:
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", do_lower_case=True, 
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)

NameError: name 'PYTORCH_PRETRAINED_BERT_CACHE' is not defined