Install Packages


In [41]:
!pip install -q spacy
!pip install -q spacy-lookups-data

!pip install benepar

!pip install -q transformers

!pip install huggingface_hub


!pip install pydot
!pip install evaluate





In [42]:
import spacy, benepar
!python -m spacy download en_core_web_md

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

from transformers import BertTokenizer, TFBertModel

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [43]:
#Defines the Models
spacy_checkpoint = "en_core_web_md"

bert_checkpoint = 'bert-base-cased'

benepar.download('benepar_en3')

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


True

In [44]:
nlp = spacy.load(spacy_checkpoint)

if spacy.__version__.startswith('2'):
  nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
  nlp.add_pipe("benepar", config={"model": "benepar_en3"})


bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = TFBertModel.from_pretrained(bert_checkpoint)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [45]:
val_df = pd.read_csv('cloze_train_winter2018.csv')
test_df = pd.read_csv('cloze_test_winter2018.csv')
train_df = pd.read_csv('ROCStories_winter2017.csv')

Modify Training Dataset (Geronimo's Shuffle)
1. create random incorrect ending
2. shuffle position of correct and incorrect ending
3. define lables based on shuffled positions

In [46]:
# function to randomly shuffle each row across the correct and incorrect ending columns
def shuffle_row(row):
    values = row.values
    np.random.shuffle(values)
    return pd.Series(values, index=row.index)

In [47]:
train_df['WrongSentence'] = train_df['sentence5'].sample(frac=1,replace=False).values + 'INCORRECT_ENDING'
train_df.iloc[:,-2:] = train_df.iloc[:,-2:].apply(shuffle_row,axis=1)

train_df['label'] = train_df.apply(lambda row: 1 if 'INCORRECT_ENDING' in row['sentence5'] else (0 if 'INCORRECT_ENDING' in row['WrongSentence'] else None), axis=1)
train_df.iloc[:,-3:-1] = train_df.iloc[:,-3:-1].applymap(lambda row: row.replace('INCORRECT_ENDING', ''))

train_df = train_df.rename(columns={'sentence5':'ending0','WrongSentence':'ending1'})

# Approach Undecided: Will pursue combining all the sentences first. Will combine endings into column of choices.

In [51]:
def preprocess(df):
    story = " ".join([df['sentence1'], df['sentence2'], df['sentence3'], df['sentence4']])
    choices = [df['ending0'], df['ending1']]
    return story, choices

In [52]:
train_df['story'], train_df['choices'] = zip(*train_df.apply(preprocess, axis=1))
train_df

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,ending0,ending1,label,story,choices
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet...",Tom made a snowman.,0,David noticed he had put on a lot of weight re...,"[After a few weeks, he started to feel much be..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Harold ate the rest of the pizza the next day.,Tom sat on his couch filled with regret about ...,1,Tom had a very short temper. One day a guest m...,[Harold ate the rest of the pizza the next day...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...,Tom's roommate wound up being expelled from th...,0,Marcus needed clothing for a business casual e...,[Marcus was happy to have the right clothes fo...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,She was so happy when he said yes.,He ended up buying the truck he wanted despite...,1,Bobby thought Bill should buy a trailer and ha...,"[She was so happy when he said yes., He ended ..."
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.,Daniel had put the fire out and learned not to...,0,John was a pastor with a very bad memory. He t...,[His congregation was delighted and so was he....
...,...,...,...,...,...,...,...,...,...,...,...
52660,134e8636-3617-43d8-ba6a-9a11b3b115b1,Flavor,The man liked the flavor.,He tried to recreate it at home.,He could not get the flavor right.,He asked the owner of the recipe for help.,She even climbed out of her crib looking for h...,The owner of the flavor sold him the recipe.,1,The man liked the flavor. He tried to recreate...,[She even climbed out of her crib looking for ...
52661,4c317f76-ca42-4024-a4c2-12ec911cf89b,After Death,"After my friend's dad's funeral, I got in trou...",The principal said I wasn't allowed to leave s...,He found out I had my friend sign me out.,He told me I was getting detention.,Tom has reconsidered his position on the benef...,I skipped detention all week.,1,"After my friend's dad's funeral, I got in trou...",[Tom has reconsidered his position on the bene...
52662,a18fd0d2-4d0c-4316-befe-e3d827fe699b,Janice breaks her wrist,Janice was out exercising for her big soccer g...,She was doing some drills with her legs.,While working out and exercising she slips on ...,She falls down and uses her wrist to break her...,Now Dan plays drums in a band who is on tour!,She breaks her wrist in the process and goes t...,1,Janice was out exercising for her big soccer g...,[Now Dan plays drums in a band who is on tour!...
52663,2c14252b-4080-4fca-8765-537772018508,Jamie marries for love,Jamie is an american girl.,Jamie wants to get married to a mexican man.,Her family assumes it's because the man wants ...,Jamie insist that she is marrying him out of l...,Jamie gets married and they spent the rest of ...,Now my face is all red from having the pepper ...,0,Jamie is an american girl. Jamie wants to get ...,[Jamie gets married and they spent the rest of...


In [92]:
# To Help Visualize Dependency Tree
import nltk
from nltk import Tree, ParentedTree

In [195]:
# Example of Parser Working on Data
doc = nlp(train_df.loc[0,'story'])
# print (doc)
sent = list(doc.sents)[0]
x = sent._.parse_string
print (x)
parse_tree = ParentedTree.fromstring('(' + sent._.parse_string + ')')
print(parse_tree.pretty_print())
print (sent._.parse_string)

(S (NP (NNP David)) (VP (VBD noticed) (SBAR (S (NP (PRP he)) (VP (VBD had) (VP (VBN put) (PRT (RP on)) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN weight)))) (ADVP (RB recently))))))) (. .))
                                                                          
                            |                                              
                            S                                             
   _________________________|___________________________________________   
  |            VP                                                       | 
  |       _____|_______                                                 |  
  |      |            SBAR                                              | 
  |      |             |                                                |  
  |      |             S                                                | 
  |      |      _______|________                                        |  
  |      |     |                VP                   



In [186]:
#Code to Create Dependency
def create_dependency(df):
    df['dependency_tree'] = None
    for index, row in df.iterrows():
        doc = nlp(row['story'])
        dependency = [sent._.parse_string for sent in doc.sents]
        df.at[index, 'dependency_tree'] = dependency
    return df


In [192]:
subset = train_df[:10]
subset = create_dependency(subset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dependency_tree'] = None


In [194]:
# train_df = train_df.drop(['dependency_tree'], axis=1)
subset.loc[0,'dependency_tree']

['(S (NP (NNP David)) (VP (VBD noticed) (SBAR (S (NP (PRP he)) (VP (VBD had) (VP (VBN put) (PRT (RP on)) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN weight)))) (ADVP (RB recently))))))) (. .))',
 '(S (NP (PRP He)) (VP (VBD examined) (NP (PRP$ his) (NNS habits)) (S (VP (TO to) (VP (VP (VB try)) (CC and) (VP (VB figure) (PRT (RP out)) (NP (DT the) (NN reason))))))) (. .))',
 "(S (NP (PRP He)) (VP (VBD realized) (SBAR (S (NP (PRP he)) (VP (VBD 'd) (VP (VBN been) (VP (VBG eating) (NP (ADJP (RB too) (JJ much)) (JJ fast) (NN food)) (ADVP (RB lately)))))))) (. .))",
 '(S (NP (PRP He)) (VP (VP (VBD stopped) (S (VP (VBG going) (PP (IN to) (NP (NN burger) (NNS places)))))) (CC and) (VP (VBD started) (NP (DT a) (JJ vegetarian) (NN diet)))) (. .))']

In [35]:
def tokenize_story_choices(row):
    story_tokens = tokenizer.encode_plus(row['story'], add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_tensors='tf')
    choice_tokens = tokenizer.encode_plus(row['choices'], add_special_tokens=False, max_length=128, truncation=True, padding='max_length', return_tensors='tf')
    return {
        'input_word_ids': story_tokens['input_ids'][0],
        'input_mask': story_tokens['attention_mask'][0],
        'segment_ids': story_tokens['token_type_ids'][0]
    }, {
        'input_word_ids': choice_tokens['input_ids'],
        'input_mask': choice_tokens['attention_mask'],
        'segment_ids': choice_tokens['token_type_ids']
    }

In [37]:
train_df['input'], train_df['labels'] = zip(*train_df.apply(tokenize_story_choices, axis=1))

NameError: name 'tokenizer' is not defined