In [31]:
!pip install tensorflow-addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
import pickle
from functools import reduce
import re
from tqdm import tqdm

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer

In [33]:
path = '/content/drive/MyDrive/tr_project/'

In [34]:
all_sentences = pickle.load(open(f'{path}all_sentences_processed.pkl', 'rb'))
all_labels = pickle.load(open(f'{path}all_labels_processed.pkl', 'rb'))

In [35]:
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(all_sentences)

In [36]:
sequences = tokenizer.texts_to_sequences(all_sentences)

In [37]:
n = len(sequences)
k = len(tokenizer.index_word) + 1
t = 500
h = 256
batch_size = 128
epochs = 50

n, k

(327325, 74)

In [38]:
len(tokenizer.index_word.values())

73

In [39]:
lens = [len(s) for s in sequences]
min(lens), max(lens)

(9, 501)

In [40]:
def label_seq(seq, labels):
  new_seq = []
  label_seq = []
  for i, c in enumerate(seq[:-1]):
    if i in labels:
      continue
      
    new_seq.append(c)

    if i+1 in labels:
      next_char = tokenizer.index_word[seq[i+1]]
      if next_char == "'":
        label_seq.append(1)
      elif next_char == '"':
        label_seq.append(2)
      else:
        raise ValueError('Noooo!')
    else:
      label_seq.append(0)

  new_seq.append(seq[-1])
  label_seq.append(0)
  
  return new_seq, label_seq

  
str(sequences[15]), str(label_seq(sequences[15], all_labels[15]))

('[6, 24, 25, 2, 1, 7, 5, 4, 1, 27, 7, 5, 20, 7, 23, 1, 19, 2, 9, 10, 3, 19, 8, 22, 1, 6, 24, 25, 2, 1, 17, 5, 4, 1, 3, 1, 21, 3, 12, 1, 15, 2, 15, 5, 9, 18, 23]',
 '([6, 25, 2, 1, 7, 5, 4, 1, 27, 7, 5, 20, 7, 23, 1, 19, 2, 9, 10, 3, 19, 8, 22, 1, 6, 25, 2, 1, 17, 5, 4, 1, 3, 1, 21, 3, 12, 1, 15, 2, 15, 5, 9, 18, 23], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])')

In [41]:
new_seq, _ = label_seq(sequences[15], all_labels[15])
tokenizer.sequences_to_texts([new_seq])

['i v e   n o t   k n o w n .   p e r h a p s ,   i v e   g o t   a   b a d   m e m o r y .']

In [42]:
def create_tensors(sequences, labels):
  seq_labels = [label_seq(s, l) for s, l in zip(sequences, labels)]
  sequences = [s for s, l in seq_labels]
  labels = [l for s, l in seq_labels]
  weights = [[1] * len(l) for l in labels]

  padded_sequences = pad_sequences(sequences, padding='post', maxlen=t)
  padded_labels = pad_sequences(labels, padding='post', maxlen=t)
  weights = pad_sequences(weights, padding='post', maxlen=t)
  weights[padded_labels > 0] = 100

  return padded_sequences, padded_labels, weights

In [43]:
inputs, outputs, sample_weights = create_tensors(sequences, all_labels)

In [44]:
inputs.shape, outputs.shape, inputs.dtype

((327325, 500), (327325, 500), dtype('int32'))

In [45]:
inputs[1_000]

array([ 3,  7, 18, 20,  3, 18,  1, 20,  2, 11, 11,  1,  8,  5,  9,  4,  1,
        4, 10,  1, 18,  2,  3, 10,  1, 20,  2, 11, 11,  1,  8,  5,  9,  4,
        1,  4, 10,  3,  4,  1,  5, 14,  4, 23,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [46]:
outputs[1_000]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [47]:
sample_weights[1_000]

array([  1,   1,   1,   1,   1,   1,   1,   1, 100,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
       100,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [48]:
train_inputs, val_inputs, train_outputs, val_outputs, train_sample_weights, val_sample_weights = (
    train_test_split(
        inputs, 
        outputs, 
        sample_weights, 
        test_size=0.1, 
        stratify=outputs.max(axis=-1)
    )
)

In [49]:
def to_tf_dataset(inputs, outputs, batch_size):
  dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))
  dataset = dataset.map(lambda x, y: (x, tf.expand_dims(tf.cast(y, 'float32'), 1)))
  dataset = dataset.batch(batch_size)

  return dataset

In [50]:
dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))
dataset = dataset.map(lambda x, y: (x, tf.expand_dims(tf.cast(y, 'float32'), 1)))
dataset = dataset.batch(batch_size)

In [51]:
train_dataset = to_tf_dataset(train_inputs, train_outputs, batch_size)
val_dataset = to_tf_dataset(val_inputs, val_outputs, batch_size)

In [52]:
1 / (train_outputs.sum() / train_outputs.size)

193.50550053993837

In [53]:
train_outputs.sum(), train_outputs.size - train_outputs.sum()

(761198, 146534802)

In [54]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(t,)),
        tf.keras.layers.Embedding(k, h, name='char_emb', input_length=t),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(h, name='encoder', input_shape=(k, h), return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(h, return_sequences=True, name='decoder')),
        tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(3, activation="softmax", name='predictor')),
    ],
    name='char_binary_lstm'
)
optimizer = tf.keras.optimizers.Adam()
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
)
save_ckpt = tf.keras.callbacks.ModelCheckpoint(f'{path}char_lstm', save_best_only=True)
csv_logger = tf.keras.callbacks.CSVLogger(f'{path}training.log')

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
    optimizer=optimizer,
    sample_weight_mode='temporal'
)
model.summary()

Model: "char_binary_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 char_emb (Embedding)        (None, 500, 256)          18944     
                                                                 
 bidirectional_4 (Bidirectio  (None, 500, 512)         1050624   
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 500, 512)         1574912   
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 500, 3)           1539      
 tributed)                                                       
                                                                 
Total params: 2,646,019
Trainable params: 2,646,019
Non-trainable params: 0
________________________________________

In [55]:
model.fit(train_inputs, train_outputs, validation_data=val_dataset, epochs=epochs, sample_weight=np.expand_dims(train_sample_weights, -1), callbacks=[early_stopping, save_ckpt, csv_logger])

Epoch 1/50



Epoch 2/50
Epoch 3/50



Epoch 4/50



Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


<keras.callbacks.History at 0x7f8395bf1df0>

In [56]:
y_pred = model.predict(val_dataset)
y_pred = y_pred.argmax(-1)
y_true = val_dataset.map(lambda x, y: y)
y_true = np.concatenate([x for x in y_true], axis=0)

y_true = y_true.reshape(-1, 1).squeeze()
y_pred = y_pred.reshape(-1, 1).squeeze()

f1_score(y_true, y_pred, average='weighted')



0.9916867812449999

In [57]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99  16308597
         1.0       0.53      0.97      0.68     31855
         2.0       0.12      0.94      0.21     26048

    accuracy                           0.99  16366500
   macro avg       0.55      0.97      0.63  16366500
weighted avg       1.00      0.99      0.99  16366500

