In [9]:
import pandas as pd
import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from transformers import TFBertModel
import transformers

In [31]:
# Configuration: every tunable hyperparameter for the run lives in this cell.
BATCH_SIZE = 16
MAX_LEN = 256            # max_length passed to the tokenizer (sequences are padded/truncated to it)
DROPOUT = 0.1            # alternative value noted by the author: 0.2
LEARNING_RATE = 1e-5
EPOCHS = 1               # alternative value noted by the author: 8
AUTO = tf.data.experimental.AUTOTUNE
MODEL = "bert-base-cased"
SEED = 42

# SEED was declared but never applied to any random generator. Seed NumPy's and
# TensorFlow's global generators here so that a Restart & Run All starts from
# the same random state every time.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [5]:
import wandb

# Hyperparameters and metadata stored alongside the W&B run.
CONFIG = {
    "competition": "Feedback Prize Effectiveness",
    "dropout": DROPOUT,
    "learning_rate": LEARNING_RATE,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "model": MODEL,
}

# Start a W&B run for logging; the run name encodes the key hyperparameters.
run = wandb.init(
    name=f"Run_{MODEL}_{DROPOUT}_{LEARNING_RATE}_{EPOCHS}",
    project="Feedback Prize Effectiveness",
    config=CONFIG,
)

# Later cells read hyperparameters back through this handle.
config = wandb.config

wandb: Currently logged in as: vivdenx. Use `wandb login --relogin` to force relogin


In [16]:
# Load the tokenizer that belongs to the MODEL checkpoint and write its
# vocabulary/config files into the current working directory.
tokenizer = transformers.BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL,
)
tokenizer.save_pretrained(save_directory='.')

('.\\vocab.txt', '.\\special_tokens_map.json', '.\\added_tokens.json')

In [21]:
# Load the cleaned training data and encode the effectiveness class as an integer label.
train = pd.read_csv('./data/train_clean.csv')

# `.map` is used instead of `.replace` so that an unknown or missing class name
# becomes NaN and is reported below, rather than silently staying a string
# inside a mixed-type label column that only fails later during training.
label_map = {"Ineffective": 0, "Adequate": 1, "Effective": 2}
train["label"] = train["discourse_effectiveness"].map(label_map)

unmapped = train.loc[train["label"].isna(), "discourse_effectiveness"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected discourse_effectiveness values: {unmapped.tolist()}")

# Guarantee an integer dtype for the sparse categorical loss.
train["label"] = train["label"].astype("int64")
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_text_no_punct,discourse_num_words,label
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi im isaac im going to be writing about how t...,67,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,on my perspective i think that the face is a n...,41,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,i think that the face is a natural landform be...,21,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,if life was on mars we would know by now the r...,72,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought that the face was formed by ali...,18,1


In [22]:
# Separator token defined by the tokenizer.
sep = tokenizer.sep_token
print(sep)

# Model input text: the discourse type, the separator token, then the discourse text.
train['inputs'] = train['discourse_type'] + sep + train['discourse_text']
train.head()

[SEP]


Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_text_no_punct,discourse_num_words,label,inputs
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi im isaac im going to be writing about how t...,67,1,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,on my perspective i think that the face is a n...,41,1,"Position[SEP]On my perspective, I think that t..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,i think that the face is a natural landform be...,21,1,Claim[SEP]I think that the face is a natural l...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,if life was on mars we would know by now the r...,72,1,"Evidence[SEP]If life was on Mars, we would kno..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought that the face was formed by ali...,18,1,Counterclaim[SEP]People thought that the face ...


In [23]:
# Walk the first training example through the tokenizer to inspect what the
# model will receive.
print('Sample input sequence:')
sample_sequence = train['inputs'].iloc[0]
print(sample_sequence)

print('\nTokenized sequence:')
print(tokenizer.tokenize(sample_sequence))

# Same tokenizer settings that bert_encode uses for the full dataset.
encode_kwargs = dict(
    max_length=MAX_LEN,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
)
token = tokenizer(sample_sequence, **encode_kwargs)

# Show each of the three arrays that are fed to the model.
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(f'\n{field}:')
    print(token[field])

Sample input sequence:
Lead[SEP]Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. 

Tokenized sequence:
['Lead', '[SEP]', 'Hi', ',', 'i', "'", 'm', 'Isaac', ',', 'i', "'", 'm', 'going', 'to', 'be', 'writing', 'about', 'how', 'this', 'face', 'on', 'Mars', 'is', 'a', 'natural', 'land', '##form', 'or', 'if', 'there', 'is', 'life', 'on', 'Mars', 'that', 'made', 'it', '.', 'The', 'story', 'is', 'about', 'how', 'NASA', 'took', 'a', 'picture', 'of', 'Mars', 'and', 'a', 'face', 'was', 'seen', 'on', 'the', 'planet', '.', 'NASA', 'doesn', "'", 't', 'know', 'if', 'the', 'land', '##form', 'was', 'created', 'by', 'life', 'on', 'Mars', ',', 'or', 'if', 'it', 'is', 'just', 'a', 'natural', 'land', '##form', '.']

input_ids:
[101, 10440, 10

In [24]:
def bert_encode(texts, tokenizer, max_len = MAX_LEN):
    """Tokenize an iterable of texts into the three arrays the model consumes.

    Args:
        texts: iterable of strings; each one is tokenized on its own.
        tokenizer: callable that returns a mapping with the keys
            'input_ids', 'token_type_ids' and 'attention_mask'.
        max_len: value forwarded to the tokenizer as ``max_length``; truncation
            and ``padding='max_length'`` are requested as well.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask)`` of NumPy arrays,
        each with one row per input text.
    """
    fields = ('input_ids', 'token_type_ids', 'attention_mask')
    rows = {field: [] for field in fields}

    for text in texts:
        encoded = tokenizer(
            text,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
        )
        for field in fields:
            rows[field].append(encoded[field])

    return tuple(np.array(rows[field]) for field in fields)

In [27]:
# Three integer inputs of length MAX_LEN, named after the tokenizer output
# fields they receive.
input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="token_type_ids")
attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

# Pretrained BERT encoder for the MODEL checkpoint.
transformer_layer = TFBertModel.from_pretrained(MODEL)

# The first element of the encoder output is used as the per-token sequence output.
bert_outputs = transformer_layer(
    input_ids,
    token_type_ids=token_type_ids,
    attention_mask=attention_mask,
)
sequence_output = bert_outputs[0]

# Classification head: vector at sequence position 0 -> dropout -> 3-way softmax.
clf_output = Dropout(config.dropout)(sequence_output[:, 0, :])
out = Dense(3, activation='softmax')(clf_output)

model = Model(
    inputs=[input_ids, token_type_ids, attention_mask],
    outputs=out,
)
model.compile(
    optimizer=Adam(learning_rate=config.learning_rate),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [28]:
# Print the layer-by-layer architecture, output shapes and parameter counts of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 256, 768), ( 108310272   input_ids[0][0]                  
______________________________________________________________________________________________

In [None]:
from sklearn.metrics import log_loss  # was used below without ever being imported
from sklearn.model_selection import GroupKFold
from wandb.keras import WandbCallback

N_SPLITS = 5
STEPS_PER_EPOCH = 200  # alternative value noted by the author: 350

X = train['inputs']
y = train['label']
groups = train["essay_id"]  # all discourse elements of one essay stay in the same fold

# Tokenize the whole corpus once; each fold then only slices these arrays
# instead of re-running the tokenizer on its train and validation subsets.
encoded_inputs = bert_encode(X.astype(str).tolist(), tokenizer)
labels = y.values

# Snapshot of the weights the model holds when this cell starts. Every fold is
# reset to it so that no fold is validated on essays the model has already been
# trained on in a previous fold. Run this cell on a freshly built model.
initial_weights = model.get_weights()

kf = GroupKFold(n_splits = N_SPLITS)

for i, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
    # GroupKFold yields positional indices, hence .iloc / plain array indexing.
    print(f"Fold {i+1}: Train Set: {groups.iloc[train_index].nunique()}, Validation Set: {groups.iloc[val_index].nunique()}")

    # Start the fold from the initial weights and a fresh optimizer state.
    model.set_weights(initial_weights)
    model.compile(Adam(learning_rate = config.learning_rate), loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

    X_train = tuple(arr[train_index] for arr in encoded_inputs)
    X_valid = tuple(arr[val_index] for arr in encoded_inputs)

    y_train = labels[train_index]
    y_valid = labels[val_index]

    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((X_train, y_train))
        # The first argument of shuffle() is the buffer size, not the seed:
        # shuffle over the full training split, seeded, and only then repeat
        # so that examples of different passes are not mixed.
        .shuffle(buffer_size = len(y_train), seed = SEED)
        .repeat()
        .batch(config.batch_size)
        .prefetch(AUTO)
    )

    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((X_valid, y_valid))
        .batch(config.batch_size)
        .cache()
        .prefetch(AUTO)
    )

    train_history = model.fit(
        train_dataset,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_data=valid_dataset,
        epochs=config.epochs, # 20
        callbacks=[WandbCallback()], # Add WandbCallback() to the fit function
        verbose = 2,
    )

    # Validation
    y_valid_pred = model.predict(X_valid, verbose=1)
    # `labels` is passed explicitly so the score is still defined if a fold
    # happens to contain fewer than three classes.
    print(f"Validation Log Loss {log_loss(y_valid, y_valid_pred, labels=[0, 1, 2]):.2f}")

Fold 1: Train Set: 3352, Validation Set: 839




Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`


Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`


















In [None]:
# Reference: https://www.kaggle.com/code/iamleonie/feedback-prize-eda-starter-for-beginners