In [None]:
!pip install tensorflow-addons

In [None]:
import pickle
from functools import reduce
import re
from tqdm import tqdm

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Project directory (presumably a mounted Google Drive folder — confirm).
# The trailing slash matters: file names are appended to this prefix directly
# inside f-strings, without any path joining.
path = '/content/drive/MyDrive/tr_project/'

In [None]:
# Load the preprocessed sentences and their label annotations.
# Context managers make sure both file handles are closed once read.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open(f'{path}all_sentences_processed.pkl', 'rb') as sentences_file:
  all_sentences = pickle.load(sentences_file)
with open(f'{path}all_labels_processed.pkl', 'rb') as labels_file:
  all_labels = pickle.load(labels_file)

In [None]:
# Character-level vocabulary with case preserved (no lower-casing).
tokenizer = Tokenizer(lower=False, char_level=True)
tokenizer.fit_on_texts(all_sentences)

In [None]:
# Encode every sentence as a list of integer character ids.
sequences = tokenizer.texts_to_sequences(all_sentences)

In [None]:
# Sizes derived from the data.
n = len(sequences)                 # number of encoded sentences
k = 1 + len(tokenizer.index_word)  # embedding input size: one slot per character id, plus id 0

# Model / training hyperparameters.
t, h = 500, 256                    # t: padded sequence length; h: embedding size and LSTM units
batch_size, epochs = 128, 50

n, k

In [None]:
# Number of distinct characters known to the tokenizer.
len(tokenizer.index_word)

In [None]:
# Shortest and longest encoded sentence, for comparison with the padding length.
lens = list(map(len, sequences))
min(lens), max(lens)

In [None]:
def label_seq(seq, labels, index_word=None):
  """Remove marker characters from `seq` and tag the characters preceding them.

  Args:
    seq: list of integer character ids.
    labels: positions in `seq` that hold a marker character (`'` or `"`).
    index_word: mapping from character id to character. Defaults to the
      notebook-level `tokenizer.index_word`.

  Returns:
    A `(new_seq, tags)` pair of equal-length lists. `new_seq` is `seq` without
    the marker positions. `tags[j]` is 1 if the kept character was directly
    followed by `'`, 2 if it was followed by `"`, and 0 otherwise.

  Raises:
    ValueError: if a position listed in `labels` holds a character other than
      `'` or `"`.
  """
  if index_word is None:
    index_word = tokenizer.index_word

  marker_classes = {"'": 1, '"': 2}
  marker_positions = set(labels)  # O(1) membership tests inside the loop
  last = len(seq) - 1

  new_seq = []
  tags = []
  for i, c in enumerate(seq):
    # Marker characters are dropped everywhere, including at the very end of
    # the sequence (previously a trailing marker leaked into the output).
    if i in marker_positions:
      continue

    new_seq.append(c)

    if i < last and i + 1 in marker_positions:
      next_char = index_word[seq[i + 1]]
      if next_char not in marker_classes:
        raise ValueError('Noooo!')
      tags.append(marker_classes[next_char])
    else:
      tags.append(0)

  return new_seq, tags

  
# Eyeball one example: the raw ids next to the (cleaned ids, per-character tags) pair.
str(sequences[15]), str(label_seq(sequences[15], all_labels[15]))

In [None]:
# Decode the cleaned ids back to text to check that the markers were removed.
# Index the result instead of unpacking into `_`, which would shadow IPython's
# last-output variable for the rest of the session.
new_seq = label_seq(sequences[15], all_labels[15])[0]
tokenizer.sequences_to_texts([new_seq])

In [None]:
def create_tensors(sequences, labels):
  """Build padded model inputs, targets and per-timestep loss weights.

  Each (sequence, marker positions) pair goes through `label_seq`. The results
  are post-padded to length `t`. Weights are 0 on padding, 1 on characters
  tagged 0 and 100 on characters with a non-zero tag.

  Returns:
    `(padded_sequences, padded_labels, weights)`.
  """
  cleaned, targets = [], []
  for raw_seq, marker_positions in zip(sequences, labels):
    chars, tags = label_seq(raw_seq, marker_positions)
    cleaned.append(chars)
    targets.append(tags)

  padded_sequences = pad_sequences(cleaned, maxlen=t, padding='post')
  padded_labels = pad_sequences(targets, maxlen=t, padding='post')

  # Start from weight 1 per real character; padding fills the rest with 0.
  unit_weights = [[1] * len(tags) for tags in targets]
  weights = pad_sequences(unit_weights, maxlen=t, padding='post')
  # Up-weight the tagged positions.
  weights[padded_labels > 0] = 100

  return padded_sequences, padded_labels, weights

In [None]:
# Padded character ids, per-character tags and per-character loss weights for the whole corpus.
inputs, outputs, sample_weights = create_tensors(sequences, all_labels)

In [None]:
# Sanity check: shapes of the padded inputs and targets, and the input dtype.
inputs.shape, outputs.shape, inputs.dtype

In [None]:
# Inspect one padded input row (character ids, post-padded).
inputs[1_000]

In [None]:
# Tags for the same row: 0 = no marker follows, 1 = followed by ', 2 = followed by ".
outputs[1_000]

In [None]:
# Loss weights for the same row: 0 on padding, 1 on untagged characters, 100 on tagged ones.
sample_weights[1_000]

In [None]:
# 90/10 train/validation split, stratified by the highest tag present in each
# sentence. A fixed random_state keeps the split identical across kernel
# restarts, so saved checkpoints and reported metrics refer to the same
# validation set.
train_inputs, val_inputs, train_outputs, val_outputs, train_sample_weights, val_sample_weights = (
    train_test_split(
        inputs,
        outputs,
        sample_weights,
        test_size=0.1,
        stratify=outputs.max(axis=-1),
        random_state=42,
    )
)

In [None]:
def to_tf_dataset(inputs, outputs, batch_size):
  """Wrap parallel arrays in a batched `tf.data.Dataset` of (input, target) pairs.

  Each target slice is cast to float32 and gets a new axis at position 1
  before batching, so a length-t tag vector becomes shape (t, 1).
  """
  def _prepare(x, y):
    return x, tf.expand_dims(tf.cast(y, 'float32'), 1)

  return (
      tf.data.Dataset
      .from_tensor_slices((inputs, outputs))
      .map(_prepare)
      .batch(batch_size)
  )

In [None]:
# Full-corpus dataset, built with the shared helper instead of a copy of its body.
dataset = to_tf_dataset(inputs, outputs, batch_size)

In [None]:
# Batched tf.data pipelines for the two splits.
train_dataset, val_dataset = (
    to_tf_dataset(features, targets, batch_size)
    for features, targets in (
        (train_inputs, train_outputs),
        (val_inputs, val_outputs),
    )
)

In [None]:
# Inverse frequency of tagged positions among all (padded) training positions.
# Tags take the values 0, 1 and 2, so summing them would count every class-2
# position twice; count the non-zero entries instead.
train_outputs.size / np.count_nonzero(train_outputs)

In [None]:
# (tagged positions, untagged positions incl. padding). Counted directly,
# because summing tag ids would weight class 2 double.
np.count_nonzero(train_outputs), np.count_nonzero(train_outputs == 0)

In [None]:
# Character-level tagger: embedding -> two stacked bidirectional LSTMs ->
# per-timestep softmax over the three tag classes (0, 1, 2).
# The Input layer already fixes the sequence length, so no layer needs its own
# input_shape / input_length argument.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(t,)),
        tf.keras.layers.Embedding(k, h, name='char_emb'),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(h, name='encoder', return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(h, return_sequences=True, name='decoder')),
        tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(3, activation="softmax", name='predictor')),
    ],
    name='char_binary_lstm'
)
optimizer = tf.keras.optimizers.Adam()
# Stop after 5 epochs without improvement of the monitored metric and roll
# back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
)
# Keep only the best checkpoint; log per-epoch metrics to CSV.
save_ckpt = tf.keras.callbacks.ModelCheckpoint(f'{path}char_lstm', save_best_only=True)
csv_logger = tf.keras.callbacks.CSVLogger(f'{path}training.log')

# Integer tag targets -> sparse categorical cross-entropy. Per-timestep sample
# weights are passed to fit(); no legacy sample_weight_mode flag is required.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=optimizer,
)
model.summary()

In [None]:
# Train on the raw arrays so per-character sample weights can be supplied.
# batch_size is passed explicitly: with array inputs Keras would otherwise
# fall back to its default of 32 and ignore the configured value.
model.fit(
    train_inputs,
    train_outputs,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=val_dataset,
    sample_weight=np.expand_dims(train_sample_weights, -1),
    callbacks=[early_stopping, save_ckpt, csv_logger],
)

In [None]:
# Predicted tag per character position, shape (num_sentences, t).
y_pred = model.predict(val_dataset).argmax(-1)

# Score real characters only. Id 0 is the padding value and never a real
# character id; padded steps carry zero loss weight in training, so predictions
# there are arbitrary and would distort the metrics.
# val_dataset is built from val_inputs / val_outputs without shuffling, so the
# rows line up with the predictions.
is_char = val_inputs != 0
y_true = val_outputs[is_char]
y_pred = y_pred[is_char]

f1_score(y_true, y_pred, average='weighted')

In [None]:
# Per-class precision / recall / F1; print() keeps the report's line breaks.
print(classification_report(y_true, y_pred))