In [1]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
import pandas as pd

# Params for bert model and tokenization
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
max_seq_len = 30
bert_embedding_dim = 768
sess = tf.Session()

In [2]:
def bert_tokenizer(sess):
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file = sess.run(tokenization_info["vocab_file"])
    do_lower_case = sess.run(tokenization_info["do_lower_case"])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [3]:
tokenizer = bert_tokenizer(sess)

W0912 21:19:05.218722 4674532800 deprecation_wrapper.py:119] From /Users/zetong/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [4]:
def process_sample(tokenizer, text, max_seq_len):
    tokenized = tokenizer.tokenize(text)
    if len(tokenized) > max_seq_len-2:
        tokenized = tokenized[0:max_seq_len-2]
    tokens = []
    tokens.append("[CLS]")
    tokens.extend(tokenized)
    tokens.append("[SEP]")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(token_ids)
    seg_ids = [0] * len(token_ids)
    while len(token_ids) < max_seq_len:
        token_ids.append(0)
        input_mask.append(0)
        seg_ids.append(0)
    return np.asarray(token_ids), np.asarray(input_mask), np.asarray(seg_ids)

In [5]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [6]:
def bert_embedding(max_seq_len=max_seq_len):
    in_id = tf.keras.layers.Input(shape=(max_seq_len,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_len,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_len,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]
    bert_output = BertLayer(n_fine_tune_layers=0, pooling="first")(bert_inputs)
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=bert_output)
    model.summary()
    return model

In [7]:
bert_embedding = bert_embedding()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 30)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 30)]         0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, 768)          110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [8]:
def single_query(model, tokenizer, text, max_seq_len):
    i1, i2, i3 = process_sample(tokenizer, text, max_seq_len)
    i1 = np.reshape(i1, (1, max_seq_len))
    i2 = np.reshape(i2, (1, max_seq_len))
    i3 = np.reshape(i3, (1, max_seq_len))
    return np.reshape(model.predict([i1, i2, i3]), bert_embedding_dim)

In [9]:
def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)
initialize_vars(sess)

In [11]:
with open("/Users/zetong/Facebook_privacy_entities.csv", mode = 'r', encoding = 'ascii', errors = 'ignore') as csvfile:
    entities = pd.read_csv(csvfile, header=None)
    texts = list(entities[0])
    labels = list(entities[1])

In [12]:
entities

Unnamed: 0,0,1
0,Who can send you friend requests?,1
1,Who can see your friends list?,1
2,Who can look you up using the email address yo...,1
3,Who can look you up using the phone number you...,1
4,Do you want search engines outside of Facebook...,1
5,Who can see your future posts?,2
6,Limit The Audience for Old Posts on Your Timel...,2
7,Review all your posts and things you're tagged...,2
8,Who can post on your timeline?,3
9,Who can see what others post on your timeline?,3


In [13]:
X = []
for text in texts:
    X.append(single_query(bert_embedding, tokenizer, text, max_seq_len))

In [14]:
X = np.asarray(X)

In [15]:
X.shape

(22, 768)

In [16]:
import sklearn.metrics

In [28]:
query = single_query(bert_embedding, tokenizer, "who can befriend me", max_seq_len)
dist = []
for i in range(X.shape[0]):
    dist.append(sklearn.metrics.pairwise.cosine_similarity(np.reshape(X[i], (1, bert_embedding_dim)), np.reshape(query, (1, bert_embedding_dim))))

In [29]:
texts[np.argmin(dist)]

'Do Not Personalize Ads from Amazon for this Internet Browser'