<a href="https://colab.research.google.com/github/victorjoseij/LLM/blob/main/2348570_LLMlab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -U "tensorflow-text==2.13.*"



In [None]:
pip install "tf-models-official==2.13.*"

Collecting tf-models-official==2.13.*
  Using cached tf_models_official-2.13.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting sacrebleu (from tf-models-official==2.13.*)
  Using cached sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
Collecting seqeval (from tf-models-official==2.13.*)
  Using cached seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorflow-model-optimization>=0.4.1 (from tf-models-official==2.13.*)
  Downloading tensorflow_model_optimization-0.8.0-py2.py3-none-any.whl.metadata (904 bytes)
Collecting portalocker (from sacrebleu->tf-models-official==2.13.*)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu->tf-models-official==2.13.*)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading tf_models_official-2.13.2-py2.py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m15.9 MB/s[0m eta [

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

import matplotlib.pyplot as plt

# Only let TensorFlow log records at ERROR level or above through.
_tf_logger = tf.get_logger()
_tf_logger.setLevel('ERROR')


In [None]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Fetch the archive into the current working directory and unpack it there.
dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
# Guarded so that re-running this cell does not raise when the folder has
# already been deleted by a previous run.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# The training and validation subsets are carved out of the same folder using
# identical split settings; only `subset` differs between the two calls.
_train_split_args = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset='training', **_train_split_args)

# Grab the class names before cache()/prefetch() wrap the dataset.
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset='validation', **_train_split_args)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# The test folder is loaded whole, with no validation split.
test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test', batch_size=batch_size)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
# Peek at the first three reviews of one training batch together with their labels.
for text_batch, label_batch in train_ds.take(1):
  reviews_np = text_batch.numpy()
  labels_np = label_batch.numpy()
  for i in range(3):
    print(f'Review: {reviews_np[i]}')
    label = labels_np[i]
    print(f'Label : {label} ({class_names[label]})')

Review: b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label : 0 (neg)
Review: b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they

In [None]:
# Load the preprocessing model from TF Hub and run it on a one-sentence batch.
bert_preprocess_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

# Pull the three outputs apart once, then show the first 12 positions of each.
_word_ids = text_preprocessed["input_word_ids"]
_input_mask = text_preprocessed["input_mask"]
_type_ids = text_preprocessed["input_type_ids"]

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {_word_ids.shape}')
print(f'Word Ids   : {_word_ids[0, :12]}')
print(f'Input Mask : {_input_mask[0, :12]}')
print(f'Type Ids   : {_type_ids[0, :12]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Load the BERT encoder from TF Hub and feed it the preprocessed sample.
_encoder_handle = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
bert_model = hub.KerasLayer(_encoder_handle)
bert_results = bert_model(text_preprocessed)

_pooled = bert_results["pooled_output"]
_sequence = bert_results["sequence_output"]

print(f'Loaded BERT: {_encoder_handle}')
print(f'Pooled Outputs Shape:{_pooled.shape}')
print(f'Pooled Outputs Values:{_pooled[0, :12]}')
print(f'Sequence Outputs Shape:{_sequence.shape}')
print(f'Sequence Outputs Values:{_sequence[0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.762629    0.99280983 -0.18611868  0.36673862  0.15233733  0.6550447
  0.9681154  -0.9486271   0.00216128 -0.9877732   0.06842692 -0.97630584]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946346  0.3432128   0.33231518 ...  0.21300825  0.7102068
  -0.05771117]
 [-0.28742072  0.31981036 -0.23018576 ...  0.58455    -0.21329743
   0.72692114]
 [-0.66157067  0.68876773 -0.8743301  ...  0.1087725  -0.26173177
   0.47855407]
 ...
 [-0.2256118  -0.2892561  -0.0706445  ...  0.47566038  0.83277136
   0.40025333]
 [-0.2982428  -0.27473134 -0.05450517 ...  0.48849747  1.0955354
   0.18163396]
 [-0.44378242  0.00930811  0.07223688 ...  0.1729009   1.1833243
   0.07898017]]


In [None]:
def build_classifier_model():
  """Assemble the end-to-end text classifier.

  The model takes raw strings, runs them through the TF Hub preprocessing
  layer and the trainable BERT encoder, then applies dropout and a single-unit
  dense layer without activation to the encoder's ``pooled_output``.

  Returns:
    A ``tf.keras.Model`` mapping a batch of strings to one raw score each.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  to_bert_inputs = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3", name='preprocessing')
  bert_inputs = to_bert_inputs(raw_text)

  # trainable=True so the encoder weights are updated during fit().
  bert = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1", trainable=True, name='BERT_encoder')
  pooled = bert(bert_inputs)['pooled_output']

  regularized = tf.keras.layers.Dropout(0.1)(pooled)
  # No activation: the head emits logits, not probabilities.
  logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularized)
  return tf.keras.Model(raw_text, logits)

# Build the model and run it once, untrained, on the sample sentence.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
# The model emits a logit; squash it with a sigmoid for display.
print(tf.sigmoid(bert_raw_result))

# from_logits=True matches the activation-free output layer of the classifier.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

# Schedule: total optimizer steps over all epochs, with the first 10% as warm-up.
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

classifier_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

history = classifier_model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=epochs,
)

tf.Tensor([[0.6469638]], shape=(1, 1), dtype=float32)
Epoch 1/5
 83/625 [==>...........................] - ETA: 1:25:34 - loss: 0.7142 - binary_accuracy: 0.5151

import pandas as pd
# Read the NER dataset into a DataFrame and look at its first rows.
_ner_csv_path = '/content/ner.csv'
data = pd.read_csv(_ner_csv_path)
data.head()

# Turn each whitespace-separated string into a list of items:
# words for the 'text' column, tags for the 'labels' column.
texts = data['text'].map(lambda sentence: sentence.split()).to_numpy()
labels = data['labels'].map(lambda tag_string: tag_string.split()).to_numpy()

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Collect the distinct tags and sort them: iterating a plain set of strings can
# yield a different order on every interpreter run, which would silently change
# the tag -> id mapping (and thus the meaning of the model's output units)
# between runs.
label_list = sorted({label for sublist in labels for label in sublist})
label_map = {label: i for i, label in enumerate(label_list)}

# Replace every tag by its integer id, keeping the per-sentence nesting.
encoded_labels = [[label_map[l] for l in label] for label in labels]

import numpy as np
def _align_labels(words, word_labels, seq_len):
    """Spread word-level labels over a tokenized sequence of length ``seq_len``.

    Only the first sub-word piece of each word receives the word's label; every
    other position (special tokens, continuation pieces, padding, and anything
    cut off by truncation) keeps the placeholder -100.
    """
    aligned = [-100] * seq_len
    position = 1            # slot 0 is taken by the leading special token
    last_slot = seq_len - 1  # never label the final slot (closing token / padding)
    for word, word_label in zip(words, word_labels):
        piece_count = len(tokenizer.tokenize(word))
        if piece_count == 0:
            # The word produced no tokens, so it occupies no position.
            continue
        if position >= last_slot:
            # Remaining words were truncated away.
            break
        aligned[position] = word_label
        position += piece_count
    return aligned


def preprocess_data(texts, labels, max_length=128):
    """Tokenize pre-split sentences and align their word-level labels.

    Args:
        texts: sequence of sentences, each a list of words.
        labels: sequence of label-id lists, one label per word of the
            matching sentence.
        max_length: length every sequence is padded/truncated to.

    Returns:
        Dict with numpy arrays ``input_ids``, ``attention_mask`` and
        ``labels``, one row per sentence. In ``labels`` every position that
        is not the first piece of a word holds -100.

    A word can be split into several sub-word tokens, so labels are placed by
    counting the pieces of each word rather than by handing one label to each
    successive token (which shifts all labels after the first split word).
    """
    input_ids = []
    attention_masks = []
    label_ids = []

    for words, word_labels in zip(texts, labels):
        # Tokenize input texts
        tokenized_input = tokenizer(words, is_split_into_words=True, truncation=True,
                                    padding='max_length', max_length=max_length)
        input_ids.append(tokenized_input['input_ids'])
        attention_masks.append(tokenized_input['attention_mask'])

        # Align labels
        label_ids.append(
            _align_labels(words, word_labels, len(tokenized_input['input_ids'])))

    return {
        'input_ids': np.array(input_ids),
        'attention_mask': np.array(attention_masks),
        'labels': np.array(label_ids)
    }

from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences for evaluation; the fixed random_state keeps
# the split the same on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Encode the training portion first, then the held-out portion.
train_data, test_data = (
    preprocess_data(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

import tensorflow as tf
def _as_feature_label_pairs(encoded):
    """Wrap an encoded split as a dataset of ``(model inputs dict, labels)`` elements."""
    model_inputs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }
    return tf.data.Dataset.from_tensor_slices((model_inputs, encoded['labels']))


# Training data is shuffled before batching; evaluation data keeps its order.
train_dataset = (
    _as_feature_label_pairs(train_data)
    .shuffle(1000)
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)

test_dataset = (
    _as_feature_label_pairs(test_data)
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)

from transformers import TFBertForTokenClassification
# One output unit per distinct tag found in the dataset.
model = TFBertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list))

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# preprocess_data marks every position without a real tag with -100. That is not
# a valid class index, so the loss must be told to skip it; without
# ignore_class the sparse cross-entropy sees an out-of-range label.
# (ignore_class needs a TF/Keras release that supports it; this notebook pins 2.13.)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, ignore_class=-100)
# NOTE(review): 'accuracy' is presumably still computed over every position,
# including the -100 ones, so the reported value likely understates the real
# tagging accuracy - confirm before relying on it.
metrics = ['accuracy']
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

model.fit(train_dataset, epochs=3, validation_data=test_dataset)

def predict_ner(text):
    """Tag a whitespace-separated sentence with the fine-tuned model.

    Args:
        text: the raw sentence; it is split on whitespace before tokenization.

    Returns:
        A list of ``(token, label)`` pairs, one per token id in the encoded
        input, where ``label`` is the ``label_list`` entry with the highest
        logit at that position.
    """
    encoded = tokenizer(
        text.split(),
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=128,
        is_split_into_words=True,
    )
    logits = model(encoded).logits
    # Single sentence in the batch: keep row 0 only.
    best_label_ids = tf.argmax(logits, axis=-1).numpy()[0]
    token_ids = encoded['input_ids'].numpy()[0]
    wordpieces = tokenizer.convert_ids_to_tokens(token_ids)
    tags = [label_list[label_id] for label_id in best_label_ids]
    return list(zip(wordpieces, tags))

# Ask for a sentence, tag it, and print one "token: label" line per token.
text = input("Enter text for NER: ")
ner_tags = predict_ner(text)
for pair in ner_tags:
    word, label = pair
    print(f"{word}: {label}")