In [1]:
!! pip install --upgrade transformers
!! pip install tf-keras
! pip install optuna
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.0.0


In [25]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizer, TFRobertaForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from tqdm import tqdm
import warnings
!! pip install emoji
from TweetNormalizer import normalizeTweet
import optuna
warnings.filterwarnings('ignore')

### Reading in the Dataset

In [3]:
df = pd.read_csv("Features_For_Traditional_ML_Techniques.csv", index_col=0)
all_tweets = df['tweet']
# subset_data = df.sample(frac=0.1, random_state=42)

### Preprocessing the dataset

In [4]:
texts_normalised = list(map(normalizeTweet, all_tweets))

### Preparing the dataset for training

In [10]:
def prepare_dataset(texts, labels, tokenizer, shuffle, batch_size=32, max_length=64):
    # Convert texts to list if it's numpy array
    if isinstance(texts, np.ndarray):
        texts = texts.tolist()

    # Ensure labels are the right shape
    labels = np.array(labels)
    if len(labels.shape) == 1:
        labels = labels.reshape(-1, 1)

    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        return_tensors='tf',
        max_length=max_length,
        add_special_tokens=True
    )

    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
        },
        labels
    ))
    dataset = dataset.cache()
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=50000)
    return dataset

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    texts_normalised,
    df['majority_target'].astype(int),
    test_size=0.2,
    random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size = 0.2,
    random_state=42
)

# Initialize tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

In [12]:
from collections import Counter
Counter(y_train)

Counter({1: 44166, 0: 41720})

In [30]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Testing on a subset

In [39]:
from transformers import TFRobertaForSequenceClassification
model = TFRobertaForSequenceClassification.from_pretrained(
        "vinai/bertweet-base",
        num_labels=1
    )

# model = TFBertForSequenceClassification.from_pretrained(
#         'bert-base-uncased',
#         num_labels=1
#     )

# Compile model
optimizer = Adam(learning_rate=2e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy']
)


history = model.fit(
      train_dataset,
      epochs=3,
      validation_data=val_dataset,
      verbose=1
  )

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [40]:
y_pred = model.predict(test_dataset)
logits = y_pred.logits
prob = tf.sigmoid(logits)
prob = prob.numpy()
y_pred_classes = (prob > 0.5).astype(int).flatten()



In [47]:
logits_class = (logits > 0.5).astype(int).flatten()

In [49]:
model.evaluate(test_dataset)



[0.4837358593940735, 0.8181818127632141]

### Hyperparameter tuning on all tweets

In [23]:
def objective(trial, X_train, y_train, X_val, y_val, X_test, y_test, tokenizer):
    """Optuna objective function to minimize"""

    # Get hyperparameters for this trial
    params = {
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'max_length': trial.suggest_categorical('max_length', [128]),
        'learning_rate': trial.suggest_categorical('learning_rate', [2e-5, 5e-6]),
        # 'lr_reduction_factor': trial.suggest_float('lr_reduction_factor', 0.1, 0.5),
        'epochs': 8  # Fixed number of epochs
    }

    # Prepare datasets
    train_dataset = prepare_dataset(
        X_train,
        y_train,
        tokenizer,
        True,
        batch_size=params['batch_size'],
        max_length=params['max_length']
    )
    val_dataset = prepare_dataset(
        X_val,
        y_val,
        tokenizer,
        False,
        batch_size=params['batch_size'],
        max_length=params['max_length']
    )
    test_dataset = prepare_dataset(
        X_test,
        y_test,
        tokenizer,
        False,
        batch_size=params['batch_size'],
        max_length=params['max_length']
    )

    # Initialize model
    model = TFRobertaForSequenceClassification.from_pretrained(
        "vinai/bertweet-base",
        num_labels=1
    )

    # Compile model
    optimizer = Adam(learning_rate=params['learning_rate'])
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=4,
        restore_best_weights=True
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        min_lr=1e-6
    )

    # Train model
    history = model.fit(
        train_dataset,
        epochs=params['epochs'],
        validation_data=val_dataset,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    model.save_pretrained('bert-tweet-test-model')
    tokenizer.save_pretrained('best-tweet-test-tokeniser')

    # Evaluate model
    test_loss, test_accuracy = model.evaluate(test_dataset, verbose=1)
    y_pred = model.predict(test_dataset)
    logits = y_pred.logits
    y_pred_classes = (logits > 0.5).astype(int).flatten()

    # Get metrics
    report = classification_report(y_test, y_pred_classes, output_dict=True)
    print(report)

    # Store trial results
    trial.set_user_attr('test_loss', test_loss)
    trial.set_user_attr('test_accuracy', test_accuracy)
    trial.set_user_attr('precision', report["weighted avg"]["precision"])
    trial.set_user_attr('recall', report["weighted avg"]["recall"])
    trial.set_user_attr('f1_score', report["weighted avg"]["f1-score"])
    trial.set_user_attr('best_epoch', len(history.history['loss']))
    trial.set_user_attr('min_val_loss', min(history.history['val_loss']))

    return test_accuracy

In [19]:
def run_optimization(X_train, y_train, X_val, y_val, X_test, y_test, n_trials=20):

    # Create study object
    study = optuna.create_study(direction='maximize')

    # Run optimization
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, X_test, y_test, tokenizer),
                  n_trials=n_trials)

    # Collect results
    results = []
    for trial in study.trials:
        if trial.state == optuna.trial.TrialState.COMPLETE:
            result = {
                'batch_size': trial.params['batch_size'],
                'max_length': trial.params['max_length'],
                'learning_rate': trial.params['learning_rate'],
                # 'lr_reduction_factor': trial.params['lr_reduction_factor'],
                'test_loss': trial.user_attrs['test_loss'],
                'test_accuracy': trial.user_attrs['test_accuracy'],
                'precision': trial.user_attrs['precision'],
                'recall': trial.user_attrs['recall'],
                'f1_score': trial.user_attrs['f1_score'],
                'best_epoch': trial.user_attrs['best_epoch'],
                'min_val_loss': trial.user_attrs['min_val_loss']
            }
            results.append(result)

    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('f1_score', ascending=False)

    # Save results
    results_df.to_csv('optuna_hyperparameter_results.csv', index=False)

    # Print best trial information
    print("\nBest trial:")
    trial = study.best_trial
    print(f"Value (F1 Score): {trial.value:.4f}")
    print("\nBest hyperparameters:")
    for key, value in trial.params.items():
        print(f"{key}: {value}")

    # Create visualizations
    try:
        import plotly
        fig = optuna.visualization.plot_optimization_history(study)
        fig.write_html("optimization_history.html")

        fig_importance = optuna.visualization.plot_param_importances(study)
        fig_importance.write_html("parameter_importance.html")
    except Exception as e:
        print(f"Could not generate plots: {str(e)}")

    return results_df, study

In [26]:
# Run the optimization
results_df, study = run_optimization(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
    n_trials=5
)

# Display top 5 results
print("\nTop 5 configurations:")
print(results_df.head().to_string())

# Access best parameters
best_params = study.best_params
print("\nBest parameters:", best_params)

# Access best score
best_score = study.best_value
print("Best F1 score:", best_score)

[I 2024-11-03 15:11:58,596] A new study created in memory with name: no-name-67cad6f9-3c35-41e8-bc4f-6612d74ecdbc


tf_model.h5:   0%|          | 0.00/740M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported
   5/2684 [..............................] - ETA: 10:03:38 - loss: 1.4153 - accuracy: 0.5250

[W 2024-11-03 15:14:53,019] Trial 0 failed with parameters: {'batch_size': 32, 'max_length': 128, 'learning_rate': 5e-06} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_11080/3189567717.py", line 7, in <lambda>
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, X_test, y_test, tokenizer),
  File "/tmp/ipykernel_11080/1851001500.py", line 67, in objective
    history = model.fit(
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1229, in fit
    return super().fit(*args, **kwargs)
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
    return fn(*args, **kw

KeyboardInterrupt: 