In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVR
import pickle

from sklearn.model_selection import train_test_split
import gc
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.decomposition import PCA

In [2]:
## Constants
target_key = "score"
text_key = "txt"
batch_size = 32
bert_path = "bert-base-uncased"
train_path = "./train/civil_with_downsample.csv"
lstm_hidden_dim = 64 # from 768 (bert) to 64  # the decrease is steep; u may lose info
dropout_rate = 0.3
output_dim = 1
max_length = 512
n_epochs = 1
# checkpoint_filepath = "./bert/ckpt-loss={loss:.5f}-epoch={epoch}-batch={batch}"
# final_train_filepath = "./bert/final_model"
save_after_batches = 5000
test_size_percent =  0.05
Nystroem_n_components = 1536 #3000 # dunno # using 1%


In [3]:
tokenizer = transformers.BertTokenizer.from_pretrained(bert_path, do_lower_case=True)

In [4]:
# text = "what's up?! ...... 🤗                 \n\t" # space \n \t dont matter # emoji to [UNK] ! #punctuations are important/have indices
# text = "I have a car. I've a car." # different indices but reconstructs fine
# print(text)
# input_ids = tokenizer.batch_encode_plus([text])["input_ids"][0] # input_ids = [101, 2054, 1005, 1055, 2039, 1029, 102] 
# print(input_ids)
# decoded = tokenizer.decode(input_ids)
# print(decoded)

In [4]:
whole_df = pd.read_csv(train_path)
whole_df.shape

(1109778, 2)

In [5]:
nontoxic_df = whole_df[whole_df["score"] == 0] # 50%
toxic_df =  whole_df[whole_df["score"] != 0] # 50 %
subset_df = pd.concat([nontoxic_df.sample(frac=0.1, random_state=23), toxic_df.sample(frac=0.5, random_state=24)], ignore_index=True) # ie 5% of whole (and non toxic) and 25% of whole( and toxic)
print(subset_df.shape) # 30% of whole
subset_df.score.value_counts()

(339660, 2)


0.000000    53807
0.333333    49159
0.400000    33908
0.166667    13224
0.800000    12119
            ...  
0.204545        1
2.121622        1
3.771429        1
2.965517        1
1.528571        1
Name: score, Length: 8710, dtype: int64

In [6]:
class CivilDataGenerator(tf.keras.utils.Sequence): # could optimize more like BucketIterator for padding
    def __init__(self, texts, scores, tokenizer, batch_size=batch_size, shuffle=True, include_targets=True): # texts -> numpy array
        self.texts = texts
        self.scores = scores
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        self.tokenizer =  tokenizer # 
        self.indexes = np.arange(len(self.texts))
        self.on_epoch_end()
        
    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.texts) // self.batch_size + 1 if (len(self.texts) % self.batch_size) != 0 else 0
    
    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)
            
    def __getitem__(self, idx): # idx -> index batch
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        texts = self.texts[indexes]
        
        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            texts.tolist(), # num
            add_special_tokens=True, # not really needed in our case. 
            max_length=max_length, # bert has 512 max length # providing our own
            return_attention_mask=True, # need bcos to pad to max length
            return_token_type_ids=False, # not needed # needed when u have two sentences
            padding='max_length', #pad_to_max_length=True, # needed
            return_tensors="tf",
            truncation=True,
        )
        
        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        
        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            scores = np.array(self.scores[indexes], dtype="float32")
            return [input_ids, attention_masks], scores
        else:
            return [input_ids, attention_masks]
        
        

In [8]:
# Encoded token ids from BERT tokenizer.
input_ids = tf.keras.layers.Input(
    shape=(max_length,), dtype=tf.int32, name="input_ids"
)
# Attention masks indicates to the model which tokens should be attended to.
attention_masks = tf.keras.layers.Input(
    shape=(max_length,), dtype=tf.int32, name="attention_masks"
)

# Loading pretrained BERT model.
bert_model = transformers.TFBertModel.from_pretrained(bert_path)
# Freeze the BERT model to reuse the pretrained features without modifying them.
bert_model.trainable = False ## not training bert ##

bert_output = bert_model.bert(input_ids, attention_mask=attention_masks) # by default hidden_size = 768
sequence_output = bert_output.last_hidden_state  # (batch_size, sequence_length, hidden_size) # ie each word representation 

avg_pool = tf.keras.layers.GlobalAveragePooling1D()(sequence_output, attention_masks) # averages over sequence length # (hidden_size)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(sequence_output) # (batch_size, hidden_size)
concat = tf.keras.layers.concatenate([avg_pool, max_pool]) #(batch_size, hidden_size*2)

model = tf.keras.models.Model(
    inputs=[input_ids, attention_masks], outputs=concat
)

# doesnt matter compile
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss= 'mse',# tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.RootMeanSquaredError()],
)

model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_masks[0][0]            
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 768)          0           bert[0][0]                   

In [9]:
test_data = CivilDataGenerator(
    subset_df[text_key].values,
    None, # no target while inferring
    tokenizer,
    batch_size=batch_size,
    shuffle=False,
    include_targets=False # added for inference
)

In [17]:
for batch in test_data:
#     print(len(batch))# batch is list of [np.array(input_ids), p.array(attention_mask)] # each is (batch, ...)
    example = batch[0][0]
    mask = batch[1][0]
    example = example[np.newaxis, :]
    mask = mask[np.newaxis, :]
    # example[:,:] = 1000
    # mask[:,:] = 1
    print(example.shape, mask.shape)
#     print(example)
#     print(mask)
    embeddings = model.predict(
        [example, mask], #batch,
        use_multiprocessing=True, # can only be used when x, y are generators
        workers=-1,
        verbose=1
    )
    print(embeddings.shape)
    print(embeddings)
    break
# normal (no change) -> [[0.03166725 0.03764773 0.27967063 ... 1.0509706  1.3395917  1.1413577 ]]

(1, 512) (1, 512)
(1, 1536)
[[-0.12658632  0.13003351 -0.12224972 ...  0.05191765  0.73312306
   0.6112654 ]]


In [None]:
with tf.device('/device:GPU:0'):
    embeddings = model.predict(
        test_data,
        use_multiprocessing=True, # can only be used when x, y are generators
        workers=-1,
        verbose=1
    ) # whole takes 4:46:47 time and subset takes 1:28:00 time

embeddings.shape

In [None]:
with open('./bert-embeddings.npy', 'wb') as f:
    np.save(f, embeddings)

y = subset_df[target_key].values
with open('./bert-embeddings-score.npy', 'wb') as f:
    np.save(f, y)

subset_df.to_csv("subset_df.csv", index=False)

In [None]:
# with open('../input/jisaw-bert-svm-embedding-score-only/bert-embeddings.npy', 'rb') as f:
#     embeddings = np.load(f)

# with open('../input/jisaw-bert-svm-embedding-score-only/bert-embeddings-score.npy', 'rb') as f:
#     y = np.load(f)
    
# embeddings.shape, y.shape

In [None]:
# pca_model = PCA(n_components=700) # not so great reduction in dimen
# pca_model.fit(embeddings)
# print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

# doing PCA first and then downsampling -> better PCA with more  embeddings I hope?

In [None]:
# frac = 0.2
# n_samples = int(frac*len(embeddings))
# embeddings, y = embeddings[:n_samples], y[:n_samples]
# embeddings.shape, y.shape

In [None]:
# tr_ind, val_ind = train_test_split(list(range(len(embeddings))) ,test_size = test_size_percent, random_state = 23)
# len(tr_ind), len(val_ind)

In [None]:
# x_train = embeddings[tr_ind]
# y_train = y[tr_ind]
# x_test = embeddings[val_ind]
# y_test = y[val_ind]
# x_train.shape, y_train.shape, x_test.shape, y_test.shape

# x_train =  pca_model.transform(embeddings[tr_ind]) 
# y_train = y[tr_ind]
# x_test = pca_model.transform(embeddings[val_ind]) 
# y_test = y[val_ind]
# x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# pd.value_counts(y_train), pd.value_counts(y_test)

In [None]:
# del embeddings, y
# gc.collect()

In [None]:
# feature_map_nystroem = Nystroem(kernel = 'rbf', gamma=.2, random_state=23, n_components=Nystroem_n_components) #n_jobs=-1
# data_transformed = feature_map_nystroem.fit_transform(x_train)
# data_transformed.shape

In [None]:
# del x_train
# gc.collect()

In [None]:
# feature_map_nystroem.normalization_
# pickle.dump(feature_map_nystroem, open("./train/feature_map_nystroem", 'wb'))

In [None]:
# regr = LinearSVR(random_state=0, tol=1e-5, loss="squared_epsilon_insensitive")
# regr.fit(data_transformed, y_train)

# regr = SVR(C=1.0, epsilon=0.2)
# regr.fit(x_train, y_train)

In [None]:
# pickle.dump(regr, open("./regr", 'wb'))

In [None]:
# data_transformed_test = feature_map_nystroem.transform(x_test)
# data_transformed_test.shape

In [None]:
# print(regr.score(data_transformed_test, y_test))
# preds = regr.predict(data_transformed_test)
# mean_squared_error(y_test, preds)

# print(regr.score(x_test, y_test))
# preds = regr.predict(x_test)
# mean_squared_error(y_test, preds)