In [None]:
# Mount Google Drive under /content/gdrive; the dataset and model paths used
# later in this notebook live below that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
! pip install transformers
! pip install tokenizers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.2 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 59.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 480 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 73.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
import json
import os
import random
import numpy as np
import collections
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import transformers
from transformers import TFElectraModel, ElectraTokenizerFast

In [None]:
class DataElement:
    """One question/answer pair together with the paragraph it refers to.

    ``answer_start`` and ``answer_end`` are character offsets into ``context``
    describing the half-open span ``context[answer_start:answer_end]``.
    """

    def __init__(self, question, context, answer, answer_start, answer_end):
        self.question = question
        self.context = context
        self.answer = answer
        self.answer_start = answer_start
        self.answer_end = answer_end

    def preprocess(self):
        """Tokenize the sample and build fixed-length model inputs.

        Relies on the module-level ``tokenizer`` and ``max_len``.

        On success the instance gains ``input_ids``, ``token_type_ids``,
        ``attention_mask``, ``start_token_index``, ``end_token_index`` and
        ``context_token_to_char`` (the context's token -> character offsets).

        Returns:
            1 when the sample was encoded, 0 when it has to be skipped: the
            answer span lies outside the context, no context token overlaps
            the answer span, or the encoded pair is longer than ``max_len``.
        """
        # A span outside the context is an annotation error; skip the sample
        # instead of failing the whole preprocessing run with an IndexError.
        if not 0 <= self.answer_start <= self.answer_end <= len(self.context):
            return 0

        # tokenize context
        tokenized_context = tokenizer(self.context, return_offsets_mapping=True)
        context_offsets = tokenized_context['offset_mapping']

        # A token belongs to the answer when its character range overlaps the
        # answer range. Zero-width offsets such as (0, 0) never overlap.
        answer_token_index = [
            index
            for index, (start, end) in enumerate(context_offsets)
            if max(start, self.answer_start) < min(end, self.answer_end)
        ]

        if len(answer_token_index) == 0:
            return 0

        # start and end token index
        start_token_index = answer_token_index[0]
        end_token_index = answer_token_index[-1]

        # tokenize question; its first id is dropped because the context
        # encoding already contributes the leading token of the sequence
        question_ids = tokenizer(self.question)['input_ids'][1:]

        # create inputs: context first (segment 0), then question (segment 1)
        context_ids = tokenized_context['input_ids']
        input_ids = context_ids + question_ids
        token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # padding for equal length sequences; over-long samples are skipped
        padding_length = max_len - len(input_ids)
        if padding_length < 0:
            return 0
        input_ids = input_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)  # len(input) [1] + padding [0]
        token_type_ids = token_type_ids + ([0] * padding_length)  # context [0] + question [1] + padding [0]

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_index = start_token_index
        self.end_token_index = end_token_index
        self.context_token_to_char = context_offsets
        return 1

    def class_print(self):
        """Print the question, answer, answer offsets and context of this sample."""
        print("Question: {}\nAnswer: {}\nAnswer Start: {}\nAnswer End: {}\nContext: {}".format(
            self.question,
            self.answer,
            self.answer_start,
            self.answer_end,
            self.context))

In [None]:
def read_json(file_name):
    """Parse the UTF-8 encoded JSON file at ``file_name`` and return its content."""
    with open(file_name, mode="r", encoding="utf-8") as source:
        return json.load(source)

def json_to_list(json_dataset):
    """Flatten the paragraph/question JSON structure into DataElement objects.

    Every entry of ``json_dataset["data"]`` supplies a ``"text"`` and a list of
    ``"qas"``; one DataElement is created per question, in document order.
    The number of questions is printed before the list is returned.
    """
    dataset = [
        DataElement(
            qa["question"],
            paragraph["text"],
            qa["answer"],
            qa["answer_start"],
            qa["answer_end"],
        )
        for paragraph in json_dataset["data"]
        for qa in paragraph["qas"]
    ]
    print("Number of questions: ", len(dataset))
    return dataset

def create_input_targets(dataset):
    """Stack the per-sample attributes of ``dataset`` into model inputs and targets.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, token_type_ids, attention_mask]``
        and ``y`` is ``[start_token_index, end_token_index]``; every entry is a
        NumPy array with one row/value per sample.
    """
    feature_keys = ("input_ids", "token_type_ids", "attention_mask")
    target_keys = ("start_token_index", "end_token_index")

    # Materialise once so that one-shot iterables are only consumed a single time.
    samples = list(dataset)
    columns = {
        key: np.array([getattr(sample, key) for sample in samples])
        for key in feature_keys + target_keys
    }

    x = [columns[key] for key in feature_keys]
    y = [columns[key] for key in target_keys]
    return x, y

def find_max_length(dataset):
    """Return the largest combined token count (context + question) in ``dataset``.

    Uses the module-level ``tokenizer``. The question's first token id is not
    counted, mirroring how the two encodings are concatenated elsewhere in the
    notebook. The maximum and the index of the sample that produced it are
    printed before the maximum is returned; an empty dataset yields 0.
    """
    def _ids(encoded):
        # `encode` of a transformers tokenizer returns a plain list of ids,
        # while a `tokenizers`-library encoding exposes them as `.ids`.
        return getattr(encoded, "ids", encoded)

    max_ = 0
    index = 0
    for i, element in enumerate(dataset):
        question_ids = _ids(tokenizer.encode(element.question))
        context_ids = _ids(tokenizer.encode(element.context))
        length = len(context_ids) + len(question_ids[1:])

        if length > max_:
            max_ = length
            index = i

    print("Max length: {}, Index: {}".format(max_, index))
    return max_

def train_test_split(dataset, test_fraction=0.1, seed=None):
    """Shuffle ``dataset`` in place and split it into train and test lists.

    Args:
        dataset: list of samples; it is shuffled in place.
        test_fraction: share of the samples placed in the test split.
        seed: when given, a dedicated ``random.Random(seed)`` performs the
            shuffle so the split is reproducible; otherwise the global
            ``random`` state is used.

    Returns:
        ``(train, test)``. When the dataset is too small for the test share to
        reach one sample, ``test`` is empty and ``train`` holds every sample.
    """
    if seed is None:
        random.shuffle(dataset)
    else:
        random.Random(seed).shuffle(dataset)

    cut = int(len(dataset) * test_fraction)
    # Slice with an explicit index: `dataset[:-cut]` would be empty for cut == 0
    # and hand the whole dataset to the test split.
    split_at = len(dataset) - cut
    train, test = dataset[:split_at], dataset[split_at:]

    return train, test

def create_model():
    """Build and compile the extractive question-answering model.

    Uses the module-level ``MODEL_NAME``, ``save_path`` and ``max_len``.
    The pretrained encoder is loaded and also written to ``save_path`` as a
    side effect. Two bias-free single-unit dense heads score every token
    position as answer start / answer end; each head is followed by a softmax
    over the ``max_len`` positions.

    Returns:
        A compiled ``keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` and producing ``[start_probs, end_probs]``.
    """
    ## Electra encoder
    encoder = TFElectraModel.from_pretrained(MODEL_NAME)
    encoder.save_pretrained(save_path+"/")

    # QA model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # First element of the encoder output; presumably the per-token hidden
    # states -- TODO confirm against the transformers version in use.
    embedding = encoder.electra(input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )

    # The heads already emit probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `lr` is a deprecated alias that newer Keras releases reject;
    # `learning_rate` is the supported argument name.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [None]:
import pickle

def save_data_as_file(data, file_name):
    """Pickle ``data`` to ``<path><file_name>.dat`` (``path`` is the module-level prefix)."""
    target = path + file_name + ".dat"
    with open(target, "wb") as sink:
        pickle.dump(data, sink)

def read_saved_data(file_name):
    """Load and return the object pickled at ``<path><file_name>.dat``.

    ``path`` is the module-level prefix. Unpickling can execute arbitrary
    code, so only load files this notebook produced itself.
    """
    source = path + file_name + ".dat"
    with open(source, "rb") as handle:
        return pickle.load(handle)

In [None]:
# Project root on the mounted Drive; every other location is derived from it.
path = "/content/gdrive/MyDrive/Q&A projesi/"
models_path = f"{path}models/"
# Pretrained checkpoint identifier and the folder its local copy is stored in.
MODEL_NAME = "dbmdz/electra-base-turkish-cased-discriminator"
save_path = f"{models_path}electra-base-turkish-cased-discriminator"


In [None]:
# Load the pretrained tokenizer; the checkpoint name marks it as cased, so
# lower-casing stays disabled.
tokenizer = ElectraTokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=False)

# exist_ok=True replaces the check-then-create pair and keeps the cell re-runnable.
os.makedirs(save_path, exist_ok=True)

# Keep a local copy of the tokenizer files next to the model.
tokenizer.save_pretrained(save_path)
#tokenizer = ElectraTokenizerFast.from_pretrained(save_path + "/vocab.txt", lowercase=False)

Downloading:   0%|          | 0.00/245k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

('/content/gdrive/MyDrive/Q&A projesi/models/electra-base-turkish-cased-discriminator/tokenizer_config.json',
 '/content/gdrive/MyDrive/Q&A projesi/models/electra-base-turkish-cased-discriminator/special_tokens_map.json',
 '/content/gdrive/MyDrive/Q&A projesi/models/electra-base-turkish-cased-discriminator/vocab.txt',
 '/content/gdrive/MyDrive/Q&A projesi/models/electra-base-turkish-cased-discriminator/added_tokens.json',
 '/content/gdrive/MyDrive/Q&A projesi/models/electra-base-turkish-cased-discriminator/tokenizer.json')

In [None]:
# Read the raw question-answering dataset (JSON) from the project folder.
#file_path = path + "json_dataset/Wiki_Dataset_Final.json"
file_path = path + "json_dataset/wiki_1050.json"
json_dataset = read_json(file_path)
#json_dataset["data"][144]["qas"][8]

In [None]:
# Flatten the JSON into DataElement objects and show the first one as a sanity check.
raw_dataset = json_to_list(json_dataset)
raw_dataset[0].class_print()

Number of questions:  6766
Question: Türkiye'nin topraklarının büyük bölümü nerededir?
Answer: Anadolu
Answer Start: 69
Answer End: 76
Context: Türkiye Cumhuriyeti ya da kısaca Türkiye, topraklarının büyük bölümü Anadolu'da, küçük bir bölümü ise Balkan Yarımadası'nın güneydoğu uzantısı olan Trakya'da yer alan ülke. Kuzeybatıda Bulgaristan, batıda Yunanistan, kuzeydoğuda Gürcistan, doğuda Ermenistan, İran ve Azerbaycan'ın ekslav toprağı Nahçıvan, güneydoğuda ise Irak ve Suriye komşusudur. Güneyini Kıbrıs adası ve Akdeniz. Batısını Ege Denizi ve kuzeyini Karadeniz çevreler. Marmara Denizi ise İstanbul Boğazı ve Çanakkale Boğazı ile birlikte Anadolu'yu Trakya'dan yani Asya'yı Avrupa'dan ayırır. Türkiye, Avrupa ve Asya'nın kavşak noktasında yer alması sayesinde önemli bir jeostratejik güce sahiptir.


In [None]:
# max_len = find_max_length(raw_dataset)
# Fixed sequence length read by DataElement.preprocess().
max_len = 384

# Keep only the samples that preprocess() reports as usable (non-zero result).
dataset = []
for data in raw_dataset:
    result = data.preprocess()
    if result == 0:
        continue
    dataset.append(data)

print("Dataset len: ", len(dataset))

Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors


Dataset len:  5212


In [None]:
# Shuffle the preprocessed samples and hold out a test split.
train, test = train_test_split(dataset)

In [None]:
# Sizes of the train and test splits, one per line.
print(len(train), len(test), sep="\n")

4691
521


In [None]:
# Build the file names from max_len, exactly like the cell that reloads the
# splits, so the saved and loaded names cannot drift apart.
save_data_as_file(test, "test_" + str(max_len) + "_electra")
save_data_as_file(train, "train_" + str(max_len) + "_electra")

In [None]:
# Restore the pickled splits for the chosen sequence length.
max_len = 384
train = read_saved_data(f"train_{max_len}_electra")
test = read_saved_data(f"test_{max_len}_electra")

In [None]:
# Convert both splits into the array lists consumed by model.fit / model.predict.
x_train, y_train = create_input_targets(train)
x_test, y_test = create_input_targets(test)

# Number of samples in each split.
print(len(x_train[0]), len(x_test[0]))

4691 521


In [None]:
# NOTE(review): `configuration` is not referenced by any other visible cell;
# create_model() loads the encoder via from_pretrained(MODEL_NAME) instead.
configuration = transformers.ElectraConfig()  # default parameters and configuration for ELECTRA

In [None]:
from tensorflow.python.distribute.cluster_resolver.tpu.tpu_cluster_resolver import is_running_in_gce  # pylint: disable=unused-import
from tensorflow.python.distribute.cluster_resolver.tpu.tpu_cluster_resolver import TPUClusterResolver
from tensorflow.python.util.tf_export import tf_export

In [None]:
def check_tpu_statue():
    """Report the Colab TPU endpoint.

    Prints the detected address, or an error message when the runtime has no
    TPU attached.

    Returns:
        The ``grpc://`` address built from the ``COLAB_TPU_ADDR`` environment
        variable, or an empty string when that variable is not set.
    """
    if 'COLAB_TPU_ADDR' not in os.environ:
        print('ERROR: Not connected to a TPU runtime')
        return ""

    address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    print('TPU address is', address)
    return address


# Assigning inside the function would only bind a local name, so the result is
# returned and stored at module level where the TPU setup cell reads it.
tpu_address = check_tpu_statue()
# output: TPU address is grpc://10.70.191.234:8470

TPU address is grpc://10.16.88.34:8470


In [None]:

use_tpu = True
if use_tpu:
    # Resolve the TPU, connect to it and initialise it before creating the strategy.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_address)
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable
    # tf.distribute.TPUStrategy takes the same resolver argument.
    strategy = tf.distribute.TPUStrategy(tpu)

    # Build the model inside the strategy scope so it is created under the
    # distribution strategy.
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()

In [None]:
# Inspect the three input arrays (input_ids, token_type_ids, attention_mask).
print(x_train)

[array([[    2, 11688,  2231, ...,     0,     0,     0],
       [    2,  4542, 13816, ...,     0,     0,     0],
       [    2,  4557,  5477, ...,     0,     0,     0],
       ...,
       [    2,  3836,  1030, ...,     0,     0,     0],
       [    2, 27343,  4691, ...,     0,     0,     0],
       [    2,  9704,  1096, ...,     0,     0,     0]]), array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])]


In [None]:
# Load Weights from Drive
# model.load_weights(path + "models/albertV1_weights.h5")

# Training hyper-parameters; they are reused when the results are reported.
BATCH_VALUE = 128
EPOCH_VALUE = 10

model.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_VALUE,
    epochs=EPOCH_VALUE,  # use 3 or 5
    verbose=2,
)

Epoch 1/10


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=int64>, <tf.Tensor 'cond/Identity_32:0' shape=(None,) dtype=int64>]


Instructions for updating:
use `experimental_local_results` instead.


Instructions for updating:
use `experimental_local_results` instead.
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=int64>, <tf.Tensor 'cond/Identity_32:0' shape=(None,) dtype=int64>]


37/37 - 150s - loss: 5.7674 - activation_loss: 2.6165 - activation_1_loss: 3.1508 - 150s/epoch - 4s/step
Epoch 2/10
37/37 - 12s - loss: 1.5903 - activation_loss: 0.6117 - activation_1_loss: 0.9785 - 12s/epoch - 318ms/step
Epoch 3/10
37/37 - 12s - loss: 0.9525 - activation_loss: 0.3184 - activation_1_loss: 0.6342 - 12s/epoch - 318ms/step
Epoch 4/10
37/37 - 12s - loss: 0.6107 - activation_loss: 0.1836 - activation_1_loss: 0.4271 - 12s/epoch - 319ms/step
Epoch 5/10
37/37 - 12s - loss: 0.4065 - activation_loss: 0.1069 - activation_1_loss: 0.2996 - 12s/epoch - 318ms/step
Epoch 6/10
37/37 - 12s - loss: 0.2969 - activation_loss: 0.0925 - activation_1_loss: 0.2043 - 12s/epoch - 318ms/step
Epoch 7/10
37/37 - 12s - loss: 0.2189 - activation_loss: 0.0652 - activation_1_loss: 0.1537 - 12s/epoch - 318ms/step
Epoch 8/10
37/37 - 12s - loss: 0.1651 - activation_loss: 0.0468 - activation_1_loss: 0.1183 - 12s/epoch - 318ms/step
Epoch 9/10
37/37 - 12s - loss: 0.0979 - activation_loss: 0.0286 - activation

<keras.callbacks.History at 0x7fbd3f431850>

In [None]:
# Number of test and train samples, one per line.
print(len(x_test[0]), len(x_train[0]), sep="\n")

521
4691


In [None]:
def _token_f1(pred_ans, true_ans):
    """Whitespace-token F1 between a predicted and a reference answer string."""
    pred_tokens = pred_ans.split()
    true_tokens = true_ans.split()

    if len(true_tokens) == 0 or len(pred_tokens) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(true_tokens == pred_tokens)

    common = collections.Counter(true_tokens) & collections.Counter(pred_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(true_tokens)
    return (2 * precision * recall) / (precision + recall)


pred_start, pred_end = model.predict(x_test)
count = 0
results = []
total_f1 = 0
for idx, (start_probs, end_probs) in enumerate(zip(pred_start, pred_end)):
    element = test[idx]
    offsets = element.context_token_to_char
    start_token = np.argmax(start_probs)
    end_token = np.argmax(end_probs)

    # A start prediction beyond the context tokens yields no answer; the sample
    # adds nothing to the totals but still counts in the denominators below.
    if start_token >= len(offsets):
        continue

    # Map the predicted token span back to a character span of the context.
    pred_char_start = offsets[start_token][0]
    if end_token < len(offsets):
        pred_ans = element.context[pred_char_start:offsets[end_token][1]]
    else:
        pred_ans = element.context[pred_char_start:]

    f1 = _token_f1(pred_ans, element.answer)
    total_f1 += f1

    results.append({
        "question": element.question,
        "true answer": element.answer,
        "predicted answer": pred_ans,
        "context": element.context,
        "f1 score": f1,
    })

    if pred_ans == element.answer:
        count += 1

# Exact-match rate and mean F1 over the whole test split.
acc = count / len(y_test[0])
F1 = total_f1 / len(y_test[0])

# print(f"exact match:={acc:.2f} f1:={F1:.2f}")

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 384) dtype=int64>]


In [None]:
# Summary of the run; the epoch label no longer carries the stray "+" that had
# slipped inside the string literal.
print(f"ELECTRA MODEL\n Batch Size:{BATCH_VALUE}\n Epoch:{EPOCH_VALUE}\n exact match:={acc:.2f} f1:={F1:.2f}")

ELECTRA MODEL
 Batch Size:128
 Epoch:+10
 exact match:=0.66 f1:=0.83


In [None]:
# for item in results:
#   print(item["question"])
#   print(item["true answer"])
#   print(item["predicted answer"])
#   #print(item["context"])
#   print(item["f1 score"])

In [None]:
# Only save_path itself is created earlier in the notebook, so make sure the
# two output sub-folders exist before writing into them.
results_dir = save_path + "/results/"
weights_dir = save_path + "/weights/"
os.makedirs(results_dir, exist_ok=True)
os.makedirs(weights_dir, exist_ok=True)

# The per-question results contain Turkish text; write them as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open(results_dir + "_10epoch_result.txt", "w", encoding="utf-8") as f:
    for result in results:
        f.write('%s\n' % result)

# NOTE(review): the file name advertises seqlen512/batch64 while this notebook
# sets max_len = 384 and BATCH_VALUE = 128 -- confirm before relying on the name.
model.save_weights(weights_dir + "electra-base-turkish-cased-discriminator" + "_seqlen512_batch64_epochs10_weights.h5")

In [None]:
# Show where the tokenizer, encoder, results and weights were written.
print(save_path)

/content/gdrive/MyDrive/Q&A projesi/models/electra-base-turkish-cased-discriminator
