In [12]:
import pandas as pd
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
from transformers import set_seed
import tensorflow as tf
from tqdm import tqdm

In [3]:
# Read the train / validation / test splits from CSV, in that order.
train_df, val_df, test_df = map(
    pd.read_csv,
    ("../data/train.csv", "../data/validation.csv", "../data/test.csv"),
)

In [4]:
# Tokenizer matching the 'distilbert-base-uncased' checkpoint; used by the
# tokenization helper defined in the following cell.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

In [5]:
def tokenize(sentences, max_length=100, padding='max_length'):
    """Encode ``sentences`` with the module-level ``tokenizer``.

    Args:
        sentences: Text input, forwarded unchanged as the tokenizer's first
            positional argument.
        max_length: Forwarded as the tokenizer's ``max_length`` option.
        padding: Forwarded as the tokenizer's ``padding`` option.

    Returns:
        Whatever the tokenizer call returns; truncation is always enabled
        and TensorFlow tensors are requested (``return_tensors="tf"``).
    """
    encode_options = {
        "truncation": True,
        "padding": padding,
        "max_length": max_length,
        "return_tensors": "tf",
    }
    return tokenizer(sentences, **encode_options)

In [6]:
# For each split, pull the text column ("Comment_Adj") and the label column
# ("Result_Bin") out of the DataFrame as Python lists.
bert_x_train, bert_y_train = (
    train_df[column].tolist() for column in ("Comment_Adj", "Result_Bin")
)
bert_x_val, bert_y_val = (
    val_df[column].tolist() for column in ("Comment_Adj", "Result_Bin")
)

In [7]:
# Encode both splits with the shared helper, using its default
# max_length / padding settings.
train_encodings, val_encodings = map(tokenize, (bert_x_train, bert_x_val))

In [8]:
# Convert the label lists to int32 tensors so they can be sliced alongside
# the encodings when the datasets are built.
train_labels, val_labels = (
    tf.convert_to_tensor(label_list, dtype=tf.int32)
    for label_list in (bert_y_train, bert_y_val)
)

In [9]:
# Batch size shared by the training and validation pipelines.
BATCH_SIZE = 50

# Explicit shuffle seed. No global TensorFlow seed has been set at this point
# in the notebook (set_seed is only called in a later cell), so without an
# op-level seed the shuffle order would differ from run to run.
SHUFFLE_SEED = 42

train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    # A buffer as large as the training set gives a uniform shuffle; a
    # smaller buffer only mixes examples locally, which matters if the CSV
    # is ordered (e.g. by label). max(..., 1) keeps buffer_size valid.
    .shuffle(
        buffer_size=max(int(train_labels.shape[0]), 1),
        seed=SHUFFLE_SEED,
    )
    .batch(BATCH_SIZE)
    # Let tf.data pick the prefetch depth instead of hard-coding one batch.
    .prefetch(tf.data.AUTOTUNE)
)

# Validation data is evaluated in file order: batched, never shuffled.
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)


In [10]:
seed_value = 42
# Seed before the model is built: the pretrained checkpoint has no
# classification head, so the pre_classifier / classifier weights are randomly
# initialized (see the load-time warning in this cell's output).
set_seed(seed_value)

model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    # Labels are integer class ids, hence the sparse loss; from_logits=True
    # makes the loss treat the model outputs as unnormalized scores.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # Report accuracy next to the loss so each epoch's train/validation
    # quality is visible in the fit log.
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
)

# The datasets are already batched, so no batch_size is passed here: Keras
# documents that batch_size must not be specified for tf.data.Dataset input.
# Labels come from the dataset tuples, so y is omitted as well.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=2,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 