In [1]:
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)

In [4]:
import pathlib
import os
import matplotlib.pyplot as plt
import seaborn as sns

import json
import lzma
from bs4 import BeautifulSoup
from tqdm import tqdm
from IPython.core.display import display, HTML
import re

import nltk
nltk.download('punkt')

from nltk.tokenize import RegexpTokenizer
import string
import datetime as dt

import numpy as np  
import pandas as pd 
import re           
from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)

[nltk_data] Downloading package punkt to /home/20765011/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## <a id='1'>1. Pre-process X_train</a>

In [6]:
df = pd.read_csv("df_small.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,token_ops,token_heads,op counts,head counts
0,3,"Walker , J . ##SENT## The grievance alleged by the plaintiff is that the defendant failed to place a refrigerating car for him at Wrightsboro by 2 :30 o ’clock p . m . on 16 Hay , 1918 , to receiv...","D . K . FUTCH v . ATLANTIC COAST LINE RAILROAD COMPANY . ##SENT## (Filed 15 October , 1919 .) ##SENT## 1 . ##SENT## Carriers of Goods — Placing of Cars — Understanding of Agent — Instructions — Ra...",1347,361
1,4,"Hoke , J . ##SENT## On the hearing it appeared that C . G . Bailey and others , executors of the last will and testament of W . A . Bailey , deceased , intended presently to make sale of a tract o...","MRS . SUSANNA WILLIAMS v . C . G . BAILEY , B . R . BAILEY et al ., Executors of W . R . BAILEY . ##SENT## (Filed 3 December , 1919 .) ##SENT## 1 . ##SENT## Deeds and Conveyances — Descriptions —R...",1649,361
2,5,"HoKE , J . ##SENT## The facts pertinent to the inquiry and showing the action of the Superior Court thereon are very satisfactorily stated in the appellant ’s brief filed in the cause , and are as...","J . DICKSON McLEAN , Commissioner , EUGENE BOND , VICTOR BOND , ALLEN BOND , R . S . BOND , Executor and Trustee , Etc ., and W . LENNON , Guardian , Etc ., v . S . F . CALDWELL . ##SENT## (Filed ...",1229,319
3,9,"Beown , J . ##SENT## Defendant was convicted at tbe June Term , 1919 , of Guilford County Superior Court , of bigamous cohabitation , under Rev ., 3361 , as amended by chapter 26 , Public Laws 191...","STATE v . JOHN W . MOON . ##SENT## (Filed 5 November , 1919 .) ##SENT## 1 . ##SENT## Statutes — Amendments —Effect. ##SENT## The effect of an amendment to a statute is to incorporate the old statu...",839,290
4,10,"ClaRK , C . J . ##SENT## There is no exception to evidence or the charge . ##SENT## The sole assignment of error is the refusal of the motion to sever . ##SENT## From S . v . Smith , 24 N . C ., 4...","STATE v . ASHLEY SOUTHERLAND . ##SENT## (Filed 24 September , 1919 .) ##SENT## Indictments — Severance —Motions—Murder—Different Defenses — Conspiracy . ##SENT## Upon a motion for a severance unde...",1318,303


In [40]:
# I don't need to get embeddings for headnote tokens, because our
# RNN will only extract sentences from the opinion body text;
# thus, our model never needs to "read" the headnotes
all_text = df['token_ops']

In [79]:
tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')

In [84]:
%%time
all_text = []
for doc in df['token_ops']:
    x = doc.replace("##SENT##", "")
    x_tok = nltk.tokenize.WordPunctTokenizer().tokenize(x)

    all_text.extend(x_tok)

CPU times: user 19.7 s, sys: 1.34 s, total: 21.1 s
Wall time: 21 s


In [87]:
x_arry = np.array([np.array(xi) for xi in x_train])

In [91]:
words = np.unique(all_text)
n_words = len(words)

In [93]:
print("Vocab size: {}".format(n_words))

Vocab size: 139335


In [94]:
word_index = ['_PADDING_'] + list(words)

In [None]:
word_index

In [95]:
%%time
sent_lengths = []
for doc in df['token_ops']:
    sents_in_doc = doc.split('##SENT##')
    sent_lengths.append(len(sents_in_doc))


CPU times: user 389 ms, sys: 0 ns, total: 389 ms
Wall time: 387 ms


In [160]:
# max words in a sentence
# Use this size for maxpooling layer
max_sent_len = max(sent_lengths)
print(max_sent_len)

240


In [106]:
# dictionary for word index (vocabulary)
word2idx = dict(zip(word_index, range(n_words+1)))
idx2word = dict(zip(range(n_words+1), word_index))

In [150]:
%%time
# convert to numeric using word2idx and add padding

X = []
for doc in df['token_ops']:
    sents_in_doc = doc.split('##SENT##')
    
    mod_doc = []
    for sent in sents_in_doc:
        
        mod_sent=[]
        x_tokens = nltk.tokenize.WordPunctTokenizer().tokenize(sent)
        # Convert tokens in a sentence to index numbers
        for token in x_tokens:
            mod_sent.append(word2idx[token])
        mod_doc.append(mod_sent)
    X.append(pad_sequences(mod_doc, maxlen=max_sent_len, padding='post', value=0))

CPU times: user 47.7 s, sys: 6.6 ms, total: 47.7 s
Wall time: 47.7 s


In [167]:
np.concatenate(([], X[0][1])).shape

(240,)

In [171]:
X_train = []
for doc in X:
    concat_doc = []
    for sent in doc:
        concat_doc.extend(sent)
    X_train.append(concat_doc)

In [181]:
doc_lengths = []
for doc in X_train:
    doc_lengths.append(len(doc))
max_doc_len = max(doc_lengths)
print("Max document length, by words: {}".format(max_doc_len))

Max document length, by words: 57600


In [183]:
X_tr_final = pad_sequences(X_train, maxlen=max_doc_len, padding='post', value=0)

In [185]:
X_tr_final.shape

(24603, 57600)

## <a id='2'>2. Pre-process Y_train labels</a>

In [243]:
y_unproc = []
rouge_scores = []

In [244]:
files=["oracle_batch1.txt", "oracle_batch2.txt", "oracle_batch3.txt",
      "oracle_batch4.txt", "oracle_batch6.txt",
      "oracle_batch7.txt", "oracle_batch8.txt", "oracle_batch9.txt",
      "oracle_batch10.txt", "oracle_batch11.txt"]

In [245]:
# https://stackoverflow.com/questions/6633678/finding-words-after-keyword-in-python

for file in files:
    f = open(file, "r")
    for line in f:
        y_tup, split, rouge_score = line.partition('\t')
        rouge_score = rouge_score.strip('\n')
        y_unproc.append(y_tup)
        rouge_scores.append(rouge_score)

In [246]:
len(y_unproc)

6000

In [247]:
# toy X_train, based off of indices of my generated y labels
X_tr_toy = np.concatenate((X_tr_final[:1900], X_tr_final[2900:7000]))

## <a id='3'>3. Small Model</a>

In [250]:
# Get GloVE word embeddings
# https://nlp.stanford.edu/projects/glove/
## I used wikipedia 2014+ Gigaword
# Extract word vectors
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [253]:
EMBEDDING_DIM = 100

In [254]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [256]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_doc_len,
                            trainable=False)

In [260]:
# architecture inspired by SummaRunner 
# https://github.com/hpzhao/SummaRuNNer/blob/master/models/RNN_RNN.py
n_units=100
optimizer = "adam"
loss = "binary_crossentropy"
metrics = ["accuracy"]

seq_input = Input(shape=(max_doc_len,))
embedded_seq = embedding_layer(seq_input)
# Word-level GRU
x = Bidirectional(tf.keras.layers.GRU(n_units, return_sequences=True))(embedded_seq)
x = tf.keras.layers.MaxPool1D(pool_size = max_sent_len, padding='same')(x)

# Sentence-level GRU after maxpooling all words in a sentence
x = Bidirectional(tf.keras.layers.GRU(n_units, return_sequences=True))(x)

# Classification at the sentence level
output = Dense(max_doc_len, activation='sigmoid')(x)

model = tf.keras.Model(inputs=seq_input, outputs=output) 

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

print(model.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 57600)]           0         
_________________________________________________________________
embedding (Embedding)        (None, 57600, 100)        13933700  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 57600, 200)        121200    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 240, 200)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 240, 200)          181200    
_________________________________________________________________
dense (Dense)                (None, 240, 57600)        11577600  
Total params: 25,813,700
Trainable params: 11,880,000
Non-trainable params: 13,933,700
________________________________________

In [278]:
y_unproc = np.array(y_unproc)

indices = np.arange(X_tr_toy.shape[0])
np.random.shuffle(indices)
X_tr_toy = X_tr_toy[indices]
y_unproc = y_unproc[indices]

x_train = X_tr_toy[600:]
y_train = y_unproc[600:]
x_val = X_tr_toy[:600]
y_val = y_unproc[:600]


In [279]:
%%time
# Train model
verbose = 1

callback=tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=4)
history = model.fit(x_train, y_train, batch_size=64, epochs=5,#epochs, 
                    validation_data=(x_val, y_val), verbose=verbose,
                    shuffle=True,
                    callbacks= [callback]
                   )

ValueError: A target array with shape (5400, 1) was passed for an output of shape (None, 240, 57600) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.