In [1]:
# import packages
from sklearn import tree
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk  #The Natural Language Toolkit
import tensorflow as tf
import transformers
from transformers import BertTokenizer, TFBertModel

In [2]:
# import transformers
#!pip install -q transformers

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [3]:
test = pd.read_csv("/home/vishakhpillai/w266_aes/Sharon/asap-aes/test_set.tsv",sep='\t', encoding='ISO-8859-1')
dev = pd.read_csv("/home/vishakhpillai/w266_aes/Sharon/asap-aes/valid_set.tsv",sep='\t', encoding='ISO-8859-1')
train = pd.read_csv("/home/vishakhpillai/w266_aes/Sharon/asap-aes/training_set_rel3.tsv",sep='\t', encoding='ISO-8859-1')

In [5]:
max_length = 80
num_train_examples = 2500

def training_pipeline(df):

    """
        Simplify dataframe and normalize scores based on essay set.
    """
    df = df[['essay_set','essay','rater1_domain1','rater2_domain1','domain1_score']]
    df = normalize(df)
    
   
    """
        Create features based on length, average/std of word length, unique words. 
    """
    
    df['length'] = df['essay'].str.split().str.len()
    df['avg_word_len'] = df['essay'].str.split().apply(lambda x: np.mean([len(y) for y in x]))
    df['std_word_len'] = df['essay'].str.split().apply(lambda x: np.std([len(y) for y in x]))
    df['unique_words'] = df['essay'].str.split().apply(lambda x: len(np.unique(x)))
    
    """
        Run first stage of network with BERT and predict output.
    """
    x_train = tokenize_function(df['essay'].tolist())
    y_train = df.normalized_score[:num_train_examples]
    fsm = first_stage_model()
    fsm.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                        epochs=5,
                        batch_size=8)
    X_fsm = fsm.predict(x_train)
   
    """
        Combine output of first-stage model with other features for second stage model.
    """

    X_ext = np.array([X_fsm, df['length'], df['avg_word_len'], df['std_word_len'], df['unique_words']])


    """
        Run second stage model.
    """
    
    ssm = second_stage_model()
    ssm.fit(X_ext, y_train) 
    
    
    
def normalize(df):
    df['normalized_score'] = df['domain1_score'] / df.groupby('essay_set')['domain1_score'].transform('max')
    return df
    
def tokenize_function(example):
    return tokenizer([x[2:] for x in example[:num_train_examples]], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

def first_stage_model(hidden_size = 200, optimizer=tf.keras.optimizers.Adam()):
    
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)


    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(bert_out[0])

    
    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(classification_token)

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics='accuracy')


    return classification_model
    
def second_stage_model():
    return DecisionTreeClassifier(random_state=0)

In [None]:
training_pipeline(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['normalized_score'] = df['domain1_score'] / df.groupby('essay_set')['domain1_score'].transform('max')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['length'] = df['essay'].str.split().str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_word_len'] = df['essay'].str.split().apply(

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 1/5
 12/313 [>.............................] - ETA: 30:55 - loss: 3.7976 - accuracy: 0.0132

In [None]:
dev

In [None]:
x_train