# BERT

## Import data

In [20]:
!! pip install --upgrade transformers
!! pip install tf-keras
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"

In [21]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, f1_score

import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
df = pd.read_csv("Features_For_Traditional_ML_Techniques.csv", index_col=0)
subset_data = df.sample(frac=0.1, random_state=42)
texts = df['tweet'].values
labels = df['majority_target'].values

In [23]:
len(df)

134198

In [24]:
def prepare_dataset(texts, labels, tokenizer, batch_size=32, max_length=64):
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        return_tensors='tf',
        max_length=max_length
    )

    # Create optimized dataset
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
        },
        labels
    ))

    # Optimize performance
    dataset = dataset.cache()
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

In [25]:
# Initialize model
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1
)

# Corrected optimizer variable
optimizer = Adam(learning_rate=2e-5)

# Compile model
model.compile(
    optimizer= optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy']
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train model

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    subset_data['tweet'].values,
    subset_data['majority_target'].values,
    test_size=0.2,
    random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
X_train, y_train, test_size = 0.2, random_state=42)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prepare datasets with optimized parameters
train_dataset = prepare_dataset(X_train, y_train, tokenizer)
val_dataset = prepare_dataset(X_val, y_val, tokenizer)
test_dataset = prepare_dataset(X_test, y_test, tokenizer)

In [27]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        min_lr=1e-6
    )

# Train model
history = model.fit(
    train_dataset,
    epochs=8,
    validation_data=val_dataset,
    callbacks=[early_stopping, reduce_lr]
) 

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8


## Test model

In [28]:
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Test Loss: 0.4675, Test Accuracy: 0.8636


In [29]:
y_pred = model.predict(test_dataset)



In [30]:
logits = y_pred.logits  # This may vary based on your model type

# Check the shape of logits
print("Shape of logits:", logits.shape)

# Determine class labels based on the output shape
if len(logits.shape) == 1:  # Binary classification
    y_pred_classes = (logits > 0.5).astype(int).flatten()
else:  # Multiclass classification
    y_pred_classes = np.argmax(logits, axis=1)

Shape of logits: (2684, 1)


In [31]:
report = classification_report(y_test, y_pred_classes, output_dict=True)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Precision: {report["weighted avg"]["precision"]:.4f}')
print(f'Recall: {report["weighted avg"]["recall"]:.4f}')
print(f'F1 Score: {report["weighted avg"]["f1-score"]:.4f}')

Test Loss: 0.4675
Test Accuracy: 0.8636
Precision: 0.2411
Recall: 0.4911
F1 Score: 0.3234


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
model.save('bert_model_whole_dataset')

INFO:tensorflow:Assets written to: bert_model_whole_dataset\assets


INFO:tensorflow:Assets written to: bert_model_whole_dataset\assets


In [33]:
model.save_pretrained('bert_model_huggingface_whole_dataset')