In [1]:
import argparse
import datasets
import pandas as pd
import transformers
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import numpy as np
import nlpaug.augmenter.word as naw

# Module-level DistilBERT tokenizer (the "distilbert-base-uncased" checkpoint).
# tokenize() reads this global, so train() and predict() encode text the same way.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(examples):
    """Encode the "text" field of a batch of examples as token ids.

    Every sequence is truncated and padded to exactly 64 tokens, so all
    encoded examples share the same length.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        padding="max_length",
        max_length=64,
        truncation=True,
    )
    return encoded

# Synonym-replacement augmenter backed by WordNet. train() passes it to
# augment_text() to create extra copies of the 'pride' / 'relief' examples.
aug_synonym = naw.SynonymAug(aug_src='wordnet')

def augment_text(text, augmenter, num_augmentations=1):
    """Produce augmented variants of ``text``.

    Args:
        text: The string to augment.
        augmenter: Any object exposing ``augment(text)``. The call may return
            either a single string or a list of strings; when a list is
            returned only its first element is used.
        num_augmentations: How many times to call the augmenter.

    Returns:
        A list with exactly ``num_augmentations`` entries, one per call to
        the augmenter. If the augmenter returns an empty list (nothing could
        be generated, e.g. for empty input), the original ``text`` is used
        for that entry instead of raising ``IndexError``.
    """
    augmented_texts = []
    for _ in range(num_augmentations):
        augmented = augmenter.augment(text)
        if isinstance(augmented, list):
            # List-returning augmenters: keep the first candidate, or fall
            # back to the unmodified input when no candidate was produced.
            augmented = augmented[0] if augmented else text
        augmented_texts.append(augmented)
    return augmented_texts

class _SigmoidF1Score(tf.keras.metrics.F1Score):
    """F1 score for models whose outputs are raw logits.

    The stock metric compares ``y_pred`` directly against ``threshold``.
    Applying a sigmoid first makes ``threshold`` a probability cut-off,
    matching the ``from_logits=True`` loss used in ``train``.
    """

    def update_state(self, y_true, y_pred, sample_weight=None):
        return super().update_state(y_true, tf.sigmoid(y_pred), sample_weight)


def train(model_path="model", train_path="train.csv", dev_path="dev.csv"):
    """Fine-tune DistilBERT for multi-label classification and save it.

    The training CSV is read with pandas, rows labelled 'pride' or 'relief'
    are each duplicated twice with synonym-augmented text, and the combined
    data is tokenized and used to train for 5 epochs while F1 is monitored
    on the dev CSV.

    Args:
        model_path: Directory passed to ``save_pretrained`` for the result.
        train_path: CSV with the text in the first column and one 0/1 column
            per label after it (label names are taken from columns 1..end).
        dev_path: CSV with the same columns, used as validation data.
    """
    # Load the CSVs
    train_df = pd.read_csv(train_path)

    # Augmentation: oversample the 'pride' / 'relief' rows with two
    # synonym-replaced variants each.
    needs_augmentation = (train_df['pride'] == 1) | (train_df['relief'] == 1)
    augmented_rows = []
    for _, row in train_df[needs_augmentation].iterrows():
        for aug_text in augment_text(row['text'], aug_synonym, num_augmentations=2):
            new_row = row.copy()
            new_row['text'] = aug_text
            augmented_rows.append(new_row)

    # Concatenate the augmented rows with the original data. Skip this when
    # nothing was augmented: concatenating an empty, column-less frame is
    # deprecated in pandas and can influence the resulting dtypes.
    if augmented_rows:
        augmented_data = pd.DataFrame(augmented_rows)
        train_df = pd.concat([train_df, augmented_data], ignore_index=True)

    # Convert to Huggingface dataset
    hf_dataset = datasets.Dataset.from_pandas(train_df)
    dev_dataset = datasets.load_dataset("csv", data_files={"validation": dev_path})["validation"]

    # The labels are the names of all columns except the first
    labels = hf_dataset.column_names[1:]

    def gather_labels(example):
        """Converts the label columns into a list of 0s and 1s"""
        return {"labels": [float(example[l]) for l in labels]}

    # Convert text and labels
    hf_dataset = hf_dataset.map(gather_labels)
    hf_dataset = hf_dataset.map(tokenize, batched=True)
    dev_dataset = dev_dataset.map(gather_labels)
    dev_dataset = dev_dataset.map(tokenize, batched=True)

    # Convert Huggingface datasets to Tensorflow datasets
    train_dataset = hf_dataset.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols="labels",
        batch_size=16,
        shuffle=True)
    dev_dataset = dev_dataset.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols="labels",
        batch_size=16)

    # Load DistilBERT model
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(labels))

    # Compile the model with hyperparameters
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)
    # The model outputs raw logits, hence from_logits=True on the loss ...
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    # ... and a sigmoid inside the metric, so threshold=0.5 is a probability
    # cut-off rather than a cut-off on the logits themselves.
    metric = _SigmoidF1Score(average="micro", threshold=0.5)

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # Fit the model to the training data, monitoring F1 on the dev data
    model.fit(train_dataset, epochs=5, validation_data=dev_dataset)

    # Save the model
    model.save_pretrained(model_path)

def predict(model_path="model", input_path="dev.csv"):
    """Predict labels for a CSV and write them to ``submission.zip``.

    Args:
        model_path: Directory containing a model saved by ``save_pretrained``.
        input_path: CSV whose first column is the text; every following
            column is overwritten with the predicted 0/1 label values.

    Side effects:
        Writes ``submission.zip`` (containing ``submission.csv``) to the
        current working directory.
    """
    # Load the saved model
    model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

    # Load the data for prediction
    df = pd.read_csv(input_path)

    # Create input features in the same way as in train()
    hf_dataset = datasets.Dataset.from_pandas(df)
    hf_dataset = hf_dataset.map(tokenize, batched=True)
    tf_dataset = hf_dataset.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        batch_size=16)

    # Generate predictions from model
    logits = model.predict(tf_dataset).logits
    # The model emits raw logits (train() uses a from_logits=True loss), so a
    # probability of 0.5 corresponds to a logit of 0: sigmoid(z) > 0.5 <=> z > 0.
    predictions = np.where(logits > 0, 1, 0)

    # Assign predictions to label columns
    df.iloc[:, 1:] = predictions

    # Write the Pandas dataframe to a zipped CSV file
    df.to_csv("submission.zip", index=False, compression=dict(
        method='zip', archive_name='submission.csv'))



In [2]:
# Fine-tune on train.csv, validate on dev.csv, and save the model under "model".
train(model_path="model", train_path="train.csv", dev_path="dev.csv")

Found cached dataset csv (/Users/shalonwalter/.cache/huggingface/datasets/csv/default-e5e5576fb91a8fa9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/26814 [00:00<?, ? examples/s]

Map:   0%|          | 0/26814 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/shalonwalter/.cache/huggingface/datasets/csv/default-e5e5576fb91a8fa9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-9b682cbdafb421b9.arrow
Loading cached processed dataset at /Users/shalonwalter/.cache/huggingface/datasets/csv/default-e5e5576fb91a8fa9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-fa936951b5cdb25f.arrow
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassif

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
