In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

from sklearn.model_selection import train_test_split
import pandas as pd

import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---- Notebook configuration -------------------------------------------------
# NOTE(review): OUTPUT_DIR is not referenced by any cell visible here;
# presumably it is where artifacts get written — confirm.
OUTPUT_DIR = "output"

# Fraction of rows held out when the data is split (passed as `test_size`).
VAL_RATIO = 0.3

# Feature columns requested from the CSV.
DATA_COLUMNS = [
    "title",
    "lemma",
    "pos",
    "tag",
    "dep",
    "label",
    "trigger_words",
    "context_score",
]

# Column holding the classification target.
LABEL_COLUMN = "category"

# Filled in by load_dataset(); starts out empty.
LABEL_LIST = list()

# Despite the name, this is the path of the CSV file that load_dataset() reads.
DATASET_DIR = "data/test/output_with_category.csv"

## Load Dataset

In [3]:
# Expected CSV header: title,lemma,pos,tag,dep,label,trigger_words,context_score,category
def load_dataset(file_location):
    """Read the dataset CSV and register the category labels it contains.

    Only the columns named in ``DATA_COLUMNS`` plus ``LABEL_COLUMN`` are
    loaded. Every distinct, non-missing value of ``LABEL_COLUMN`` that is not
    already in the module-level ``LABEL_LIST`` is appended to it, in order of
    first appearance, so calling this function repeatedly is idempotent.

    Args:
        file_location: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame with the requested columns.

    Raises:
        ValueError: From ``pd.read_csv`` if a requested column is missing
            from the file.
    """
    # Build a fresh list instead of `DATA_COLUMNS.append(...)`: append returns
    # None (so pandas would silently load every column) and it would also
    # grow the shared DATA_COLUMNS constant on every call.
    wanted_columns = list(DATA_COLUMNS)
    if LABEL_COLUMN not in wanted_columns:
        wanted_columns.append(LABEL_COLUMN)

    df = pd.read_csv(file_location, usecols=wanted_columns)

    # Collect labels by column *name*; a positional index breaks as soon as
    # the file gains or loses a leading column (e.g. a saved index column).
    already_known = set(LABEL_LIST)
    for category in df[LABEL_COLUMN].dropna().unique():
        if category not in already_known:
            already_known.add(category)
            LABEL_LIST.append(category)

    return df

In [5]:
# Load first: load_dataset() is what fills LABEL_LIST, so computing the class
# ids before this call yields an empty list on a fresh kernel.
data_df = load_dataset(DATASET_DIR)
label_list = list(range(len(LABEL_LIST)))

# Hold out VAL_RATIO of the rows; the fixed random_state keeps the split
# reproducible across runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_df,
    data_df[LABEL_COLUMN],
    test_size=VAL_RATIO,
    random_state=42,
)

In [6]:
# Preview the first rows of the loaded frame as a quick sanity check.
data_df.head()

Unnamed: 0,title,lemma,pos,tag,dep,label,trigger_words,context_score,category
0,2006 Pangandaran earthquake and tsunami,"['2006', 'Pangandaran', 'earthquake', 'and', '...","['NUM', 'PROPN', 'NOUN', 'CCONJ', 'NOUN']","['CD', 'NNP', 'NN', 'CC', 'NN']","['nummod', 'compound', 'ROOT', 'cc', 'conj']","['DATE', 'GPE', '', '', '']","['pangandaran', 'earthquake', 'tsunami']",{'earthquake': 1.0},geological_phenomenon
1,Battle of Santa Clara (1927),"['battle', 'of', 'Santa', 'Clara', '1927']","['NOUN', 'ADP', 'PROPN', 'PROPN', 'NUM']","['NN', 'IN', 'NNP', 'NNP', 'CD']","['ROOT', 'prep', 'compound', 'pobj', 'npadvmod']","['', '', 'GPE', 'GPE', 'DATE']","['battle', 'santa', 'clara']",{'battle': 1.0},military_action
2,Siege of Pondicherry (1793),"['siege', 'of', 'Pondicherry', '1793']","['NOUN', 'ADP', 'PROPN', 'NUM']","['NN', 'IN', 'NNP', 'CD']","['ROOT', 'prep', 'pobj', 'npadvmod']","['', '', '', 'DATE']","['siege', 'pondicherry']",{'siege': 1.0},military_action
3,Battle of Leuthen,"['battle', 'of', 'Leuthen']","['NOUN', 'ADP', 'PROPN']","['NN', 'IN', 'NNP']","['ROOT', 'prep', 'pobj']","['', '', '']","['battle', 'leuthen']",{'battle': 1.0},military_action
4,Glasgow St Enoch rail accident,"['Glasgow', 'St', 'Enoch', 'rail', 'accident']","['PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN']","['NNP', 'NNP', 'NNP', 'NN', 'NN']","['compound', 'compound', 'compound', 'compound...","['PERSON', 'PERSON', 'PERSON', '', '']","['glasgow', 'st', 'enoch', 'rail', 'accident']","{'rail': 1.0, 'accident': 0.7565370481203059}",bar


In [7]:
# Show the integer class ids: one consecutive id per entry in LABEL_LIST.
label_list

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [8]:
# Tokenize data
def tokenize_data(tokenizer, sentences, labels, max_length):
    """Encode sentences and bundle them with their labels as a TensorDataset.

    Args:
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors (e.g. the
            ``BertTokenizer`` loaded in this notebook).
        sentences: Iterable of texts, one per example. Pass a column or list
            of strings, not a whole DataFrame (iterating a DataFrame yields
            its column names).
        labels: Numeric labels aligned with ``sentences``. May be a list, a
            pandas Series (read positionally, whatever its index), or a tensor.
        max_length: Every example is padded or truncated to this many tokens.

    Returns:
        ``TensorDataset(input_ids, attention_masks, labels)``.
    """
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            # `pad_to_max_length` is deprecated; spell out both padding and
            # truncation so every row has exactly `max_length` tokens and the
            # torch.cat below cannot hit a shape mismatch.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    if torch.is_tensor(labels):
        labels = labels.clone().detach()
    else:
        # list() reads the values positionally. torch.tensor() on a pandas
        # Series probes `labels[0]`, which is a label-based lookup and can
        # raise KeyError once train_test_split has shuffled the index.
        labels = torch.tensor(list(labels))

    return TensorDataset(input_ids, attention_masks, labels)

In [9]:
# Load the pretrained "bert-base-uncased" tokenizer and a BERT model with a
# sequence-classification head sized from label_list.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Tokenize and prepare data loaders
# tokenize_data() loops over individual sentences, but iterating a DataFrame
# yields its column names, so hand it the text column explicitly.
# NOTE(review): 'title' is assumed to be the text fed to BERT — confirm.
# The category strings are mapped to integer ids because torch.tensor()
# cannot hold strings; ids follow order of first appearance in data_df.
# NOTE(review): these ids must stay below the model's num_labels
# (len(label_list)) — confirm LABEL_LIST covers the same categories.
category_to_id = {
    category: idx
    for idx, category in enumerate(data_df[LABEL_COLUMN].unique())
}
train_dataset = tokenize_data(
    tokenizer,
    train_data['title'].tolist(),
    train_labels.map(category_to_id).tolist(),
    max_length=128,
)
val_dataset = tokenize_data(
    tokenizer,
    test_data['title'].tolist(),
    test_labels.map(category_to_id).tolist(),
    max_length=128,
)

In [40]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

max_seq_length = 128
epochs = 3
batch_size = 32

# Load the pre-trained BERT model from TensorFlow Hub
# NOTE(review): hub.KerasLayer is not trainable unless trainable=True is
# passed, so as written only the Dense head below is trained — confirm
# whether fine-tuning the encoder itself is intended.
bert_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

# Define the model architecture for fine-tuning
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

# Version 4 of this TF Hub encoder takes a dict of inputs and returns a dict
# of outputs; the positional list / tuple-unpacking form only applies to the
# older versions of the model.
encoder_outputs = bert_model({
    "input_word_ids": input_word_ids,
    "input_mask": input_mask,
    "input_type_ids": segment_ids,
})
pooled_output = encoder_outputs["pooled_output"]
sequence_output = encoder_outputs["sequence_output"]

# Add custom layers for classification
# For example, you can add a Dense layer with softmax activation for multi-class classification
num_classes = len(LABEL_LIST)
output = tf.keras.layers.Dense(num_classes, activation='softmax')(pooled_output)

# Create the model
# NOTE(review): this rebinds `model`, which an earlier cell assigns to the
# PyTorch BertForSequenceClassification instance.
model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=output)

# Compile the model with appropriate loss function and optimizer
# 'categorical_crossentropy' expects one-hot encoded targets.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Tokenize your input data and convert labels to one-hot encoding if necessary
# X_train, X_test, y_train, y_test = ...
# NOTE(review): X_train / y_train / X_test / y_test are not defined anywhere in
# the visible notebook; the calls below raise NameError until the placeholder
# above is filled in.

# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test)
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)

# Make predictions
predictions = model.predict(X_test)


ImportError: dlopen(/Users/kenny/anaconda3/envs/tensorflow-env/lib/python3.10/site-packages/tensorflow_text/core/pybinds/tflite_registrar.so, 0x0002): Symbol not found: __ZN4absl12lts_2023080210CHexEscapeENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE
  Referenced from: <E5449F0F-CDE4-384F-B7F9-E7BD33907F26> /Users/kenny/anaconda3/envs/tensorflow-env/lib/python3.10/site-packages/tensorflow_text/core/pybinds/tflite_registrar.so
  Expected in:     <B3F8A302-57B4-346D-A999-24F928D29E51> /Users/kenny/anaconda3/envs/tensorflow-env/lib/python3.10/site-packages/tensorflow/libtensorflow_framework.2.dylib