In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel # Import TensorFlow specific model
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
import sys
import time # To measure encoding time

# Add path to your dataloader
sys.path.append('../dataloader')
from dataloader import daigtv2_loader # Assuming this function loads your data into a pandas DataFrame
sys.path.append('../part3')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Configuration ---
path_to_folder = "../../" # Adjust if your dataloader needs a different path
TEST_SIZE = 0.2 # Percentage of data to use for testing
RANDOM_STATE = 42 # For reproducible splits
TRANSFORMER_MODEL_NAME = 'bert-base-uncased' # Using a standard BERT model
BATCH_SIZE_ENCODING = 32 # Batch size for processing text through the transformer (adjust based on GPU memory)
# TensorFlow automatically uses GPU if available and configured

# --- 1. Load your data ---
print("Loading data...")
try:
    df = daigtv2_loader(path_to_folder)
    texts = df['text'].values
    labels = df['label'].values
    print(f"Data loaded. Found {len(texts)} texts.")
except Exception as e:
    print(f"Error loading data: {e}")
    sys.exit("Exiting due to data loading error.")

Loading data...
Data loaded. Found 44868 texts.


In [4]:

# --- 2. Split data ---
print(f"Splitting data into train and test sets ({1-TEST_SIZE:.0%} train, {TEST_SIZE:.0%} test)...")
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=labels # Stratify to maintain label distribution
)
print(f"Train set size: {len(X_train_text)}")
print(f"Test set size: {len(X_test_text)}")


Splitting data into train and test sets (80% train, 20% test)...
Train set size: 35894
Test set size: 8974


In [5]:
# --- 3. Load Pre-trained Transformer Tokenizer and TensorFlow Model ---
print(f"\nLoading pre-trained TensorFlow Transformer model: {TRANSFORMER_MODEL_NAME}")
try:
    tokenizer = BertTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
    # Use TFBertModel for the TensorFlow version
    model_transformer = TFBertModel.from_pretrained(TRANSFORMER_MODEL_NAME)
    print("TensorFlow Transformer model and tokenizer loaded successfully.")
    # TensorFlow automatically manages device placement (GPU/CPU)
    print(f"TensorFlow version: {tf.__version__}")
    print(f"Num GPUs Available: {len(tf.config.list_physical_devices('GPU'))}")
except Exception as e:
    print(f"Error loading TensorFlow Transformer model or tokenizer: {e}")
    sys.exit("Exiting due to Transformer loading error.")


Loading pre-trained TensorFlow Transformer model: bert-base-uncased


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

TensorFlow Transformer model and tokenizer loaded successfully.
TensorFlow version: 2.10.1
Num GPUs Available: 1


In [6]:
# --- 4. Define Text Encoding Function (TensorFlow) ---
def encode_texts_tf(texts, tokenizer, model, max_length=512, batch_size=32):
    """
    Encodes a list of texts into fixed-size vectors using a TensorFlow Transformer model.
    Uses the [CLS] token embedding as the sentence representation.
    """
    embeddings = []
    num_texts = len(texts)

    print(f"Encoding {num_texts} texts in batches of {batch_size} using TensorFlow...")
    start_time = time.time()

    # Process texts in batches
    for i in range(0, num_texts, batch_size):
        batch_texts = texts[i : i + batch_size]

        # Tokenize the batch, returning TensorFlow tensors
        # Explicitly convert NumPy array slice to a Python list
        encoded_input = tokenizer(
            batch_texts.tolist(), # <--- FIX: Convert NumPy slice to list
            padding=True,          # Pad sequences to the longest in the batch
            truncation=True,       # Truncate sequences longer than max_length
            max_length=max_length, # Maximum length of sequences
            return_tensors='tf'    # Return TensorFlow tensors
        )

        # Get embeddings from the model
        # No need for torch.no_grad(), TensorFlow handles this in inference mode
        outputs = model(encoded_input)

        # Extract the embedding of the [CLS] token (usually the first token)
        # outputs.last_hidden_state has shape (batch_size, sequence_length, hidden_size)
        # We take the first token ([CLS]) for all items in the batch: outputs.last_hidden_state[:, 0, :]
        # Convert TensorFlow tensor to NumPy array
        cls_embeddings = outputs.last_hidden_state[:, 0, :].numpy()

        embeddings.append(cls_embeddings)

        # Print progress
        if (i + batch_size) % (batch_size * 10) == 0: # Print progress every 10 batches
             print(f"Encoded {i + batch_size}/{num_texts} texts...")


    # Concatenate embeddings from all batches
    all_embeddings = np.vstack(embeddings)

    end_time = time.time()
    print(f"Encoding complete. Total time: {end_time - start_time:.2f} seconds.")
    print(f"Shape of generated embeddings: {all_embeddings.shape}")

    return all_embeddings


In [7]:

# --- 5. Encode Training and Test Data ---
# BERT models typically have a max sequence length of 512 tokens
BERT_MAX_LEN = 512

X_train_encoded = encode_texts_tf(
    X_train_text,
    tokenizer,
    model_transformer,
    max_length=BERT_MAX_LEN,
    batch_size=BATCH_SIZE_ENCODING
)

X_test_encoded = encode_texts_tf(
    X_test_text,
    tokenizer,
    model_transformer,
    max_length=BERT_MAX_LEN,
    batch_size=BATCH_SIZE_ENCODING
)


Encoding 35894 texts in batches of 32 using TensorFlow...
Encoded 320/35894 texts...
Encoded 640/35894 texts...
Encoded 960/35894 texts...
Encoded 1280/35894 texts...
Encoded 1600/35894 texts...
Encoded 1920/35894 texts...
Encoded 2240/35894 texts...
Encoded 2560/35894 texts...
Encoded 2880/35894 texts...
Encoded 3200/35894 texts...
Encoded 3520/35894 texts...
Encoded 3840/35894 texts...
Encoded 4160/35894 texts...
Encoded 4480/35894 texts...
Encoded 4800/35894 texts...
Encoded 5120/35894 texts...
Encoded 5440/35894 texts...
Encoded 5760/35894 texts...
Encoded 6080/35894 texts...
Encoded 6400/35894 texts...
Encoded 6720/35894 texts...
Encoded 7040/35894 texts...
Encoded 7360/35894 texts...
Encoded 7680/35894 texts...
Encoded 8000/35894 texts...
Encoded 8320/35894 texts...
Encoded 8640/35894 texts...
Encoded 8960/35894 texts...
Encoded 9280/35894 texts...
Encoded 9600/35894 texts...
Encoded 9920/35894 texts...
Encoded 10240/35894 texts...
Encoded 10560/35894 texts...
Encoded 10880/35894

In [8]:
# --- 6. Train CatBoost Classifier ---
print("\nTraining CatBoost Classifier...")

# CatBoost works well with default parameters, but you can tune them
catboost_model = CatBoostClassifier(
    iterations=1000, # Number of boosting iterations (trees)
    learning_rate=0.05,
    loss_function='Logloss', # For binary classification
    eval_metric='Accuracy',
    random_seed=RANDOM_STATE,
    verbose=100, # Print progress every 100 iterations
    early_stopping_rounds=50 # Stop if validation metric doesn't improve for 50 rounds
)

# Create CatBoost Pool objects (optional but can be useful)
# Features are the encoded embeddings (numerical)
# Labels are the target labels
train_pool = Pool(data=X_train_encoded, label=y_train)
test_pool = Pool(data=X_test_encoded, label=y_test) # Use test set as validation set

# Train the model
start_time = time.time()
catboost_model.fit(train_pool, eval_set=test_pool)
end_time = time.time()

print(f"CatBoost training complete. Total time: {end_time - start_time:.2f} seconds.")


Training CatBoost Classifier...
0:	learn: 0.9227726	test: 0.9174281	best: 0.9174281 (0)	total: 379ms	remaining: 6m 18s
100:	learn: 0.9848164	test: 0.9818364	best: 0.9818364 (100)	total: 15.4s	remaining: 2m 17s
200:	learn: 0.9905277	test: 0.9857366	best: 0.9857366 (196)	total: 28.9s	remaining: 1m 54s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9865166035
bestIteration = 235

Shrink model to first 236 iterations.
CatBoost training complete. Total time: 40.94 seconds.


In [9]:
# --- 7. Evaluate CatBoost Model ---
print("\nEvaluating CatBoost Model on the test set...")

# Predict class labels (0 or 1)
y_pred = catboost_model.predict(X_test_encoded)

# Predict probabilities (if needed)
# y_pred_proba = catboost_model.predict_proba(X_test_encoded)[:, 1] # Get probability for the positive class

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Print Classification Report (includes precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nMethod 3 (TensorFlow) execution finished.")



Evaluating CatBoost Model on the test set...
Test Accuracy: 0.9865

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5474
           1       0.98      0.98      0.98      3500

    accuracy                           0.99      8974
   macro avg       0.99      0.99      0.99      8974
weighted avg       0.99      0.99      0.99      8974


Method 3 (TensorFlow) execution finished.


In [12]:
import os
import json
# --- 8. Save Model Components ---
print("\nSaving model components...")

# Define directories/paths for saving
save_dir_transformer = './saved_transformer_model_tf' # Added _tf to distinguish from potential PyTorch saves
save_dir_tokenizer = './saved_transformer_tokenizer_tf' # Added _tf
save_path_catboost = './saved_catboost_model_tf.cbm' # Added _tf
save_path_config = './model_config_tf.json' # Added _tf

# Create directories if they don't exist
os.makedirs(save_dir_transformer, exist_ok=True)
os.makedirs(save_dir_tokenizer, exist_ok=True)

# Save the Transformer model (TensorFlow version)
try:
    model_transformer.save_pretrained(save_dir_transformer)
    print(f"TensorFlow Transformer model saved to: {save_dir_transformer}")
except Exception as e:
    print(f"Error saving TensorFlow Transformer model: {e}")

# Save the Transformer tokenizer
try:
    tokenizer.save_pretrained(save_dir_tokenizer)
    print(f"Transformer tokenizer saved to: {save_dir_tokenizer}")
except Exception as e:
    print(f"Error saving Transformer tokenizer: {e}")

# Save the CatBoost model
try:
    catboost_model.save_model(save_path_catboost)
    print(f"CatBoost model saved to: {save_path_catboost}")
except Exception as e:
    print(f"Error saving CatBoost model: {e}")

# Save configuration needed for loading
config_data = {
    'transformer_model_name': TRANSFORMER_MODEL_NAME,
    'max_length': BERT_MAX_LEN, # Using BERT_MAX_LEN which is MAX_LEN for BERT
    'transformer_model_dir': save_dir_transformer, # Store the saved directory paths
    'transformer_tokenizer_dir': save_dir_tokenizer,
    'catboost_model_path': save_path_catboost
}
try:
    with open(save_path_config, 'w') as f:
        json.dump(config_data, f, indent=4)
    print(f"Model configuration saved to: {save_path_config}")
except Exception as e:
    print(f"Error saving configuration: {e}")

print("Saving complete.")


Saving model components...
TensorFlow Transformer model saved to: ./saved_transformer_model_tf
Transformer tokenizer saved to: ./saved_transformer_tokenizer_tf
CatBoost model saved to: ./saved_catboost_model_tf.cbm
Model configuration saved to: ./model_config_tf.json
Saving complete.


In [30]:

#--- How to use for new predictions ---
#Assuming you have a list of new texts called `new_texts`
if tokenizer and model_transformer and catboost_model:
    new_texts = np.array(["I personally have a different argument as to why the electoral college is a good thing. My argument is more about trying to refocus what the office of the President is supposed to be about - the representative of the many states for foreign relations and treaties and as the check on Congress.\n\nThe president continues to be regarded as the \"most powerful person in the world\" which elevates the office to almost monarch-like reverence. The problem with this is it excludes Congress as to where the power is and should be. The notion to change to popular vote gives even more power to what should be a weak executive. A strong(er) executive opens the door to more authoritarian figures to just simply appeal to a cult of personality.\n\nSo, the EC is the states voting for their representative. This is it's true purpose and why it should continue IMO.",
    "Dear Senator,\n\nI am writing to you today to express my strong support for abolishing the Electoral College and electing the President by popular vote. I believe that this is the only way to ensure that every American's vote counts and that our elections are truly representative of the will of the people.\n\nThe Electoral College is a system that was devised over 200 years ago, when the United States was a very different country. At the time, it was believed that the Electoral College would help to protect the interests of smaller states against the larger states. However, the Electoral College has become increasingly outdated and irrelevant in the 21st century.\n\nOne of the biggest problems with the Electoral College is that it gives too much power to a small number of states. In the 2016 election, for example, Donald Trump won the Electoral College despite losing the popular vote by nearly three million votes. This is because Trump won a majority of the electoral votes in a small number of swing states, such as Pennsylvania, Michigan, and Wisconsin.\n\nThis system is unfair to the voters in the states that Trump lost. Their votes were essentially ignored, and they had no say in who became President. This is not how a democracy should work.\n\nAnother problem with the Electoral College is that it encourages candidates to focus on a small number of swing states. In the 2016 election, for example, Trump spent very little time campaigning in states that he was sure to win, such as California and Texas. Instead, he focused all of his attention on the swing states, where he knew that the election would be decided.\n\nThis is not a good way to run a presidential election. Candidates should be campaigning all over the country, not just in a handful of swing states. This is the only way to ensure that all Americans have a voice in the election.\n\nI urge you to support legislation that would abolish the Electoral College and elect the President by popular vote. This is the only way to ensure that our elections are truly fair and representative of the will of the people.\n\nThank you for your time and consideration.\n\nSincerely,\n\n[Your Name]"])
    # Use the TensorFlow encoding function
    new_encoded_X = encode_texts_tf(new_texts, tokenizer, model_transformer, max_length=BERT_MAX_LEN, batch_size=BATCH_SIZE_ENCODING)
    new_predictions = catboost_model.predict(new_encoded_X)
    print("\nPredictions for new texts (0 or 1):")
    print(new_predictions)



Encoding 2 texts in batches of 32 using TensorFlow...
Encoding complete. Total time: 0.44 seconds.
Shape of generated embeddings: (2, 768)

Predictions for new texts (0 or 1):
[0 1]


In [20]:
X_train_text.shape

(35894,)