In [3]:
# import packages
from sklearn import tree
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk  #The Natural Language Toolkit
import tensorflow as tf
import transformers
from transformers import BertTokenizer, TFBertModel
import more_itertools

In [4]:
# import transformers
#!pip install -q transformers

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
test = pd.read_csv("/home/vishakhpillai/w266_aes/Sharon/asap-aes/test_set.tsv",sep='\t', encoding='ISO-8859-1')
dev = pd.read_csv("/home/vishakhpillai/w266_aes/Sharon/asap-aes/valid_set.tsv",sep='\t', encoding='ISO-8859-1')
train = pd.read_csv("/home/vishakhpillai/w266_aes/Sharon/asap-aes/training_set_rel3.tsv",sep='\t', encoding='ISO-8859-1')

In [20]:
def tokenized(df, examples):
    input_ids = []
    token_type_ids = [] 
    attention_masks = []
    window = 3
    for sentences in df['sent'][:examples]:
        if len(sentences) > window:
            i = list(more_itertools.windowed([k for k in range(len(sentences))],n=window, step=1))
        else:
            i = list(more_itertools.windowed([k for k in range(len(sentences))],n=len(sentences), step=1))
        w_sentences = []
        w = ''
        for hi in i:
            w = ''
            for j in hi:
                w = w + sentences[j] + ' '
            w_sentences.append(w)
        encoded_dict = tokenizer([x for x in w_sentences], 
              max_length=512,
              add_special_tokens = True,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
         # Add the encoded sentence to the list    
        input_ids.append(encoded_dict['input_ids'])
         # Add the encoded sentence to the list.    
        token_type_ids.append(encoded_dict['token_type_ids'])
         # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])
    return input_ids, token_type_ids, attention_masks
    

In [8]:
from nltk import tokenize
nltk.download('punkt')

train['sent'] = train['essay'].apply(lambda x: tokenize.sent_tokenize(x))


[nltk_data] Downloading package punkt to
[nltk_data]     /home/vishakhpillai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
z = tokenized(train, 1000)

In [23]:
z[0]

[<tf.Tensor: shape=(14, 512), dtype=int32, numpy=
 array([[  101, 12956,  1469, ...,     0,     0,     0],
        [  101, 11675,  1164, ...,     0,     0,     0],
        [  101,  1790,  1204, ...,     0,     0,     0],
        ...,
        [  101,  1409,  1240, ...,     0,     0,     0],
        [  101,  1192,  1547, ...,     0,     0,     0],
        [  101, 11056,  1190, ...,     0,     0,     0]], dtype=int32)>,
 <tf.Tensor: shape=(18, 512), dtype=int32, numpy=
 array([[  101, 12956,   137, ...,     0,     0,     0],
        [  101,  7993,  7565, ...,     0,     0,     0],
        [  101,  2907,  7565, ...,     0,     0,     0],
        ...,
        [  101,  2907,  1165, ...,     0,     0,     0],
        [  101,  1337,  1110, ...,     0,     0,     0],
        [  101,   146,  2810, ...,     0,     0,     0]], dtype=int32)>,
 <tf.Tensor: shape=(12, 512), dtype=int32, numpy=
 array([[  101, 12956,   117, ...,     0,     0,     0],
        [  101,  4435,  1150, ...,     0,     0,   

In [166]:
k = 1
for sentences in train['sent']:
    i = list(more_itertools.windowed([k for k in range(len(sentences))],n=3, step=1))
    if k == 1:
        break
i

[(0, 1, 2),
 (1, 2, 3),
 (2, 3, 4),
 (3, 4, 5),
 (4, 5, 6),
 (5, 6, 7),
 (6, 7, 8),
 (7, 8, 9),
 (8, 9, 10),
 (9, 10, 11),
 (10, 11, 12),
 (11, 12, 13),
 (12, 13, 14),
 (13, 14, 15)]

In [147]:
i = list(more_itertools.windowed([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],n=3, step=1))
the_train = []
new_train = ''
for hi in i:
    new_train = ''
    for j in hi:
        new_train = new_train + train['sent'][0][j] + ' '
    the_train.append(new_train)

In [165]:
len(train['sent'][4])

30

In [10]:
max_length = 80
num_train_examples = 1000

def training_pipeline(df):

    """
        Simplify dataframe and normalize scores based on essay set.
    """
    df = df[['essay_set','essay','rater1_domain1','rater2_domain1','domain1_score']]
    df = normalize(df)
    
   
    """
        Create features based on length, average/std of word length, unique words. 
    """
    
    df['length'] = df['essay'].str.split().str.len()
    df['avg_word_len'] = df['essay'].str.split().apply(lambda x: np.mean([len(y) for y in x]))
    df['std_word_len'] = df['essay'].str.split().apply(lambda x: np.std([len(y) for y in x]))
    df['unique_words'] = df['essay'].str.split().apply(lambda x: len(np.unique(x)))
    
    """
        Run first stage of network with BERT and predict output.
    """
    x_train = tokenize_function(df['essay'].tolist())
    y_train = df.normalized_score[:num_train_examples]
    fsm = first_stage_model()
    fsm.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask], y_train, epochs=5, batch_size=8)
    X_fsm = fsm.predict(x_train)
   
    """
        Combine output of first-stage model with other features for second stage model.
    """

    X_ext = np.array([X_fsm, df['length'], df['avg_word_len'], df['std_word_len'], df['unique_words']])


    """
        Run second stage model.
    """
    
    ssm = second_stage_model()
    ssm.fit(X_ext, y_train) 
    
    
    
def normalize(df):
    df['normalized_score'] = df['domain1_score'] / df.groupby('essay_set')['domain1_score'].transform('max')
    return df
    
def tokenize_function(example):
    return tokenizer([x for x in example[:num_train_examples]], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

def first_stage_model(hidden_size = 200, optimizer=tf.keras.optimizers.Adam()):
    
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)


    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(bert_out[0])

    
    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(classification_token)

    classification = tf.keras.layers.Dense(8, activation='softmax',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.MeanAbsoluteError(name='mean_absolute_error'),
                            metrics=['accuracy'])


    return classification_model
    
def second_stage_model():
    return DecisionTreeClassifier(random_state=0)

In [None]:
training_pipeline(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['normalized_score'] = df['domain1_score'] / df.groupby('essay_set')['domain1_score'].transform('max')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['length'] = df['essay'].str.split().str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_word_len'] = df['essay'].str.split().apply(

Epoch 1/5


In [None]:
dev

In [None]:
x_train = tokenizer([x[2:] for x in train['essay'].tolist()[:num_train_examples]], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

In [None]:
train.head()

In [None]:
x_train.input_ids[0]