In [9]:
import pandas as pd
import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from transformers import TFBertModel
import transformers

In [15]:
# Configuration: every tunable value for this notebook lives in this cell.

# Name of the pretrained checkpoint (passed to the tokenizer's from_pretrained).
MODEL = "bert-base-cased"

# Data shaping: sequences are padded/truncated to MAX_LEN tokens when encoded.
MAX_LEN = 256
BATCH_SIZE = 16

# Training settings; the trailing values are the alternatives noted by the author.
LEARNING_RATE = 1e-5
DROPOUT = 0.1  # 0.2
EPOCHS = 1  # 8

# tf.data autotune constant (its use is not shown in this part of the notebook).
AUTO = tf.data.experimental.AUTOTUNE

In [5]:
import wandb

# Hyperparameters stored with the W&B run so that every logged result can be
# traced back to the settings that produced it.
CONFIG = dict(
    competition="Feedback Prize Effectiveness",
    model=MODEL,
    max_len=MAX_LEN,  # sequence length changes the model input, so record it too
    dropout=DROPOUT,
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

# Initialize a W&B run for logging; the run name encodes the main hyperparameters.
run = wandb.init(
    name=f"Run_{MODEL}_{DROPOUT}_{LEARNING_RATE}_{EPOCHS}",
    project="Feedback Prize Effectiveness",
    config=CONFIG,
)
config = wandb.config

wandb: Currently logged in as: vivdenx. Use `wandb login --relogin` to force relogin


In [16]:
# Load the BERT tokenizer for the checkpoint named by MODEL, then write its
# files into the current working directory. The tuple returned by
# save_pretrained is the cell's displayed output.
tokenizer = transformers.BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL,
)
tokenizer.save_pretrained(save_directory='.')

('.\\vocab.txt', '.\\special_tokens_map.json', '.\\added_tokens.json')

In [21]:
# Integer class id assigned to each discourse_effectiveness label.
LABEL_TO_ID = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

train = pd.read_csv('./data/train_clean.csv')

# Series.map is an explicit lookup: a value missing from LABEL_TO_ID becomes NaN
# instead of silently staying a string (as Series.replace would leave it), and
# the result does not rely on replace's deprecated implicit dtype downcasting.
label_ids = train["discourse_effectiveness"].map(LABEL_TO_ID)
unmapped = train.loc[label_ids.isna(), "discourse_effectiveness"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected discourse_effectiveness values: {unmapped.tolist()}")

# Every row is mapped at this point, so the cast to an integer dtype is safe.
train["label"] = label_ids.astype("int64")
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_text_no_punct,discourse_num_words,label
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi im isaac im going to be writing about how t...,67,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,on my perspective i think that the face is a n...,41,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,i think that the face is a natural landform be...,21,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,if life was on mars we would know by now the r...,72,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought that the face was formed by ali...,18,1


In [22]:
# Separator token defined by the tokenizer; printed so the reader can see it.
sep = tokenizer.sep_token
print(sep)

# Build "<discourse_type><sep><discourse_text>" for every row and store it
# in a new 'inputs' column.
train['inputs'] = train['discourse_type'].str.cat(train['discourse_text'], sep=sep)
train.head()

[SEP]


Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_text_no_punct,discourse_num_words,label,inputs
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi im isaac im going to be writing about how t...,67,1,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,on my perspective i think that the face is a n...,41,1,"Position[SEP]On my perspective, I think that t..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,i think that the face is a natural landform be...,21,1,Claim[SEP]I think that the face is a natural l...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,if life was on mars we would know by now the r...,72,1,"Evidence[SEP]If life was on Mars, we would kno..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought that the face was formed by ali...,18,1,Counterclaim[SEP]People thought that the face ...


In [23]:
# Show the first training input as raw text.
print('Sample input sequence:')
sample_sequence = train['inputs'].iloc[0]
print(sample_sequence)

# Show how the tokenizer splits that text into tokens.
print('\nTokenized sequence:')
print(tokenizer.tokenize(sample_sequence))

# Encode the same text, padded/truncated to MAX_LEN with special tokens added.
encode_kwargs = dict(
    max_length=MAX_LEN,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
)
token = tokenizer(sample_sequence, **encode_kwargs)

# Print each field of the encoding under its own heading.
for _field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(f'\n{_field}:')
    print(token[_field])

Sample input sequence:
Lead[SEP]Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. 

Tokenized sequence:
['Lead', '[SEP]', 'Hi', ',', 'i', "'", 'm', 'Isaac', ',', 'i', "'", 'm', 'going', 'to', 'be', 'writing', 'about', 'how', 'this', 'face', 'on', 'Mars', 'is', 'a', 'natural', 'land', '##form', 'or', 'if', 'there', 'is', 'life', 'on', 'Mars', 'that', 'made', 'it', '.', 'The', 'story', 'is', 'about', 'how', 'NASA', 'took', 'a', 'picture', 'of', 'Mars', 'and', 'a', 'face', 'was', 'seen', 'on', 'the', 'planet', '.', 'NASA', 'doesn', "'", 't', 'know', 'if', 'the', 'land', '##form', 'was', 'created', 'by', 'life', 'on', 'Mars', ',', 'or', 'if', 'it', 'is', 'just', 'a', 'natural', 'land', '##form', '.']

input_ids:
[101, 10440, 10