In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install seqeval

In [None]:
import pickle
from functools import reduce
import re
from tqdm import tqdm

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import nltk
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.utils import pad_sequences

from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

from datasets import load_dataset
import evaluate

In [None]:
path = '/content/drive/MyDrive/tr_project/'

In [None]:
nltk.download('punkt')

In [None]:
# Load the pre-processed corpus. Context managers make sure both file handles
# are closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open(f'{path}all_sentences_processed.pkl', 'rb') as sentences_file:
  all_sentences = pickle.load(sentences_file)
with open(f'{path}all_labels_processed.pkl', 'rb') as labels_file:
  all_labels = pickle.load(labels_file)

In [None]:
len(all_sentences)

In [None]:
def apos_pos_analysis(texts):
  """Report how far from the end of a word an apostrophe can sit.

  Every text is split with ``nltk.word_tokenize``. For each word containing an
  apostrophe, the distance ``len(word) - index_of_first_apostrophe`` is
  computed. Each time a new maximum distance is found, the word and its
  distance are printed; the overall maximum is printed at the end.

  Args:
    texts: iterable of strings to analyse.

  Returns:
    None. Results are printed. When no word contains an apostrophe (including
    empty input), the final line printed is ``0``.
  """
  max_i = 0
  for text in texts:
    for word in nltk.word_tokenize(text):
      pos = word.find("'")
      if pos == -1:
        # No apostrophe in this word.
        continue
      i = len(word) - pos
      if i > max_i:
        max_i = i
        print(f'{word}: {i}')

  # Every distance is >= 1, so the running maximum is the overall maximum and
  # stays at 0 when nothing was found.
  print(max_i)

In [None]:
# i've not known. 91 31 110
# i've not known. 91 31 110, 3, 0, 0
# ive not known. x 31 110, 3, 0, 0

In [None]:
def find_word_from_i(sent, i):
  """Return the space-delimited word of `sent` that covers character index `i`.

  The span is grown leftwards and rightwards from `i` until a space or an end
  of the string is reached, then stripped of surrounding whitespace. If
  `sent[i]` is itself a space, the result is an empty string.
  """
  left = i
  # Move left until we sit on a space or at the very start of the string.
  while not (left <= 0 or sent[left] == ' '):
    left -= 1

  right = i
  total = len(sent)
  # Move right until we sit on a space or run off the end of the string.
  while not (right >= total or sent[right] == ' '):
    right += 1

  word = sent[left:right]
  return word.strip()

In [None]:
find_word_from_i(all_sentences[15], 1)

In [None]:
def index_safe(s, c):
  """Return the index of the first occurrence of `c` in `s`, or -1 if absent.

  Only the "not found" case (``ValueError`` from ``s.index``) is mapped to -1.
  Any other error, such as passing an argument of the wrong type, propagates
  to the caller instead of being silently reported as "not found".
  """
  try:
    return s.index(c)
  except ValueError:
    return -1

def count(s, c):
  """Return how many elements of the sequence `s` compare equal to `c`."""
  return sum(1 for item in s if item == c)

In [None]:
# A token is either a run of word characters / apostrophes / double quotes,
# or one single punctuation mark. Compiled once instead of on every call.
_LABEL_SEQ_TOKEN_RE = re.compile(r'[\w\'"]+|[.,;:?!\-\(\)]')


def label_seq(sent, labels):
  """Tokenize `sent`, strip apostrophes/quotes, and label where they were.

  Per-token label values:
    0     nothing removed (or an apostrophe further than 6 from the token end)
    1-6   ``len(token) - index`` of the first apostrophe in the token
    7     exactly one double quote, at the start of the token
    8     exactly one double quote, anywhere else in the token
    9     more than one double quote in the token
  A quote label overrides an apostrophe label when a token contains both.

  `labels` is only used for its length: a counter is incremented once for each
  token containing an apostrophe and once for each token containing a double
  quote. As soon as the counter equals ``len(labels)``, the remaining tokens
  are copied through unmodified and keep label 0.
  NOTE(review): the counter can grow by 2 on a single token and step over
  ``len(labels)``, in which case no early stop happens -- confirm this is the
  intended behaviour for the annotation format.

  Args:
    sent: the sentence to tokenize.
    labels: sized collection of annotations for `sent` (only ``len`` is used).

  Returns:
    ``(tokens, token_labels)``: two lists of equal length.

  Raises:
    TypeError: if `labels` has no length.
  """
  max_apostrophe_label = 6
  quote_at_start, quote_elsewhere, quote_multiple = 7, 8, 9

  n_labels = len(labels)
  tokens = _LABEL_SEQ_TOKEN_RE.findall(sent)
  token_labels = [0] * len(tokens)
  cleaned = []
  seen = 0

  for i, tok in enumerate(tokens):
    apos_at = tok.find("'")
    if apos_at != -1:
      distance = len(tok) - apos_at
      if distance <= max_apostrophe_label:
        token_labels[i] = distance
      seen += 1

    n_quotes = tok.count('"')
    if n_quotes > 0:
      seen += 1
    if n_quotes == 1:
      token_labels[i] = quote_at_start if tok.startswith('"') else quote_elsewhere
    elif n_quotes > 1:
      token_labels[i] = quote_multiple

    cleaned.append(tok.replace('"', '').replace("'", ''))

    if seen == n_labels:
      # All annotated marks are accounted for: keep the rest of the sentence as is.
      cleaned.extend(tokens[i + 1:])
      break

  return cleaned, token_labels


# Spot-check label_seq on a single hand-picked example from the corpus.
index = 180_000
res = label_seq(all_sentences[index], all_labels[index])

# Display: raw sentence, cleaned tokens, per-token labels (as text), label count.
all_sentences[index], res[0], str(res[1]), len(res[1])

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
tokenizer(res[0], is_split_into_words=True).word_ids(), len(tokenizer(res[0], is_split_into_words=True).word_ids())

In [None]:
# Label every sentence, then split the (tokens, labels) pairs into two
# parallel lists.
seq_labels = [
    label_seq(sentence, annotations)
    for sentence, annotations in zip(all_sentences, all_labels)
]
sentences = [pair[0] for pair in seq_labels]
labels = [pair[1] for pair in seq_labels]

In [None]:
padded_labels = pad_sequences(labels, maxlen=500)
np.unique(padded_labels, return_counts=True)

In [None]:
# Hold out 10% of the sentences, stratified by the highest label in each one.
# `default=0` keeps a sentence with no tokens (empty label list) from crashing
# max(); the fixed random_state makes the split identical on every re-run.
train_seq_labels, test_seq_labels = train_test_split(
    seq_labels,
    test_size=0.1,
    stratify=[max(l, default=0) for l in labels],
    random_state=42,
)

In [None]:
df = pd.DataFrame(train_seq_labels, columns=['text', 'label'])
df.to_json("train_data.json", orient="records", lines=True)

In [None]:
df = pd.DataFrame(test_seq_labels, columns=['text', 'label'])
df.to_json("test_data.json", orient="records", lines=True)

In [None]:
dataset = load_dataset("json", data_files={"train": "train_data.json", "test": "test_data.json"})

In [None]:
dataset

In [None]:
def tokenize_function(examples):
    """Run the module-level tokenizer over the "text" field of a batch.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``; its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per sub-token.

    For each entry of `word_ids`:
      * ``None`` maps to ``-100``;
      * the first sub-token of a word with base label ``b > 0`` maps to
        ``2*b - 1``, every following sub-token of that word to ``2*b``;
      * a base label of ``0`` stays ``0`` for all sub-tokens.

    Args:
        labels: word-level labels, indexable by word id.
        word_ids: one word id (or ``None``) per sub-token.

    Returns:
        A list with one integer label per entry of `word_ids`.
    """
    aligned = []
    previous = None
    for wid in word_ids:
        if wid is None:
            previous = None
            aligned.append(-100)
            continue

        starts_word = wid != previous
        previous = wid

        value = labels[wid]
        if value > 0:
            value = 2 * value - 1 if starts_word else 2 * value
        aligned.append(value)

    return aligned

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach sub-token labels.

    The "text" field is tokenized with ``is_split_into_words=True`` and
    truncation. Each row of the "label" field is then expanded with
    ``align_labels_with_tokens`` using that row's word ids, and the result is
    stored under the "labels" key of the tokenizer output.
    """
    encoded = tokenizer(
        examples["text"], truncation=True, is_split_into_words=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["label"])
    ]
    return encoded

In [None]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, load_from_cache_file=False)
tokenized_dataset = tokenized_dataset.remove_columns(['label', 'text'])

In [None]:
tokenized_dataset

In [None]:
dataset['train']['text'][90], dataset['train']['label'][90], tokenized_dataset['train']['input_ids'][90], tokenized_dataset['train']['labels'][90]

In [None]:
tokenizer(dataset['train']['text'][90], is_split_into_words=True).word_ids(), tokenizer.tokenize(dataset['train']['text'][90], is_split_into_words=True)

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
seqeval = evaluate.load("seqeval")

In [None]:
# Base labels 0-9. Each base label b contributes the id 2*b, and every b >= 1
# additionally contributes 2*b - 1; `classes` is the sorted list of those ids
# as plain Python ints.
base_classes = np.arange(0, 10)
classes = sorted(
    int(class_id)
    for class_id in np.concatenate([2 * base_classes, 2 * base_classes[1:] - 1])
)
classes

In [None]:
# Map class ids to tag names. Id 0 is 'O'; after that, ids come in (B, I)
# pairs. Pairs starting below index 13 get the 'AP' prefix, the rest 'QU';
# the numeric suffix is the pair's base class plus one.
id2label = {0: 'O'}
for i in range(1, len(classes), 2):
  pre = 'AP' if i < 13 else 'QU'
  id2label.update({
      classes[i]: f'B-{pre}-{base_classes[i//2] + 1}',
      classes[i+1]: f'I-{pre}-{base_classes[i//2] + 1}',
  })

# Reverse lookup: tag name -> class id.
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
id2label

In [None]:
label2id

In [None]:
model = AutoModelForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(id2label), id2label=id2label, label2id=label2id)
model

In [None]:
label_list = list(label2id.keys())

def compute_metrics(p):
    """Compute per-class and overall F1 with seqeval for a Trainer evaluation.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` is arg-maxed over
           axis 2 to get one class id per position, and positions whose label
           is ``-100`` are ignored.

    Returns:
        dict with ``"<type>_f1"`` and ``"<type>_number"`` for every class in
        ``base_classes[1:]`` (type is ``AP-<c>`` for c < 7, else ``QU-<c>``),
        plus ``"overall_f1"``. A type missing from the seqeval result is
        reported with f1 ``0.0`` and number ``0``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[label_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # The seqeval result may lack an entry for an entity type that occurs in
    # neither the predictions nor the references; fall back to zeros instead
    # of failing the whole evaluation with a KeyError.
    missing = {'f1': 0.0, 'number': 0}

    res = {}
    for c in base_classes[1:]:
      pre = 'AP' if c < 7 else 'QU'
      label_type = f"{pre}-{c}"
      scores = results.get(label_type, missing)
      res[f"{label_type}_f1"] = scores['f1']
      res[f"{label_type}_number"] = scores['number']

    res["overall_f1"] = results["overall_f1"]

    return res

In [None]:
train_path = '/content/drive/MyDrive/tr_project/bert_train/'

In [None]:
# Training configuration. Checkpoints are written to `train_path`; evaluation
# and checkpoint saving are both configured per epoch.
# With a per-device batch of 128 and 2 accumulation steps, each optimizer step
# covers 256 examples per device.
# NOTE(review): the install cell does not pin a transformers version, and
# recent releases are reported to rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir=train_path,
    learning_rate=3e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=2, 
    gradient_checkpointing=True,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Wire the model, the tokenized train/test splits, the padding collator and
# the seqeval-based metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
!nvidia-smi

In [None]:
ckpt_path = '/content/drive/MyDrive/tr_project/bert_train/checkpoint-'

In [None]:
trainer.train()