# Part 1) Prepare the main test set CSV from TXT files.

In [1]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Read the annotated-sentences CSV and preview its first two rows.
# NOTE: the name `df` is reassigned further down when the test-set frame is built.
df = pd.read_csv('../data/Gen_Sentences_Annotated_All_Final_Processed.csv')
df.head(2)

Unnamed: 0,section,sent.no,filename,sentences,neutral,generalization,exemplification,attribution,conditional,ID
0,e,156,nlh.47.1.626118_nonotes.txt,"To this end, one of the main merits of Merleau...",1,0,0,0,0,1
1,b,207,ahr.2016.121.2.437_nonotes.txt,In their response they chastised her for her u...,1,0,0,0,0,2


###### Get sentences from the TXT file(s):

In [3]:
def cleanUpString(text):
    """Return `text` with every non-ASCII character replaced by a single space."""
    ascii_only = []
    for ch in text:
        # Code points 0..127 are kept as-is; anything above becomes a space.
        ascii_only.append(ch if ord(ch) <= 127 else ' ')
    return ''.join(ascii_only)


def _take_until_word_limit(sentences, word_limit):
    """Collect sentences, in iteration order, until their running word count exceeds `word_limit`."""
    taken = []
    n_words = 0
    for sent in sentences:
        taken.append(sent)
        n_words += len(word_tokenize(sent))
        # The sentence that crosses the limit is kept, so the total is ~word_limit rather than exactly it.
        if n_words > word_limit:
            break
    return taken


def get_top_bottom_sentences(fname, intro_words=1500, conclusion_words=1000, path=None):
    """
    Given a filename, return the sentences covering the first ~1500 words and the last ~1000 words.

    The text is read, stripped of non-ASCII characters (each replaced by a space) and split
    into sentences. Sentences are then accumulated from the start (intro) and from the end
    (conclusion) until the respective word budget is exceeded.

    Parameters
    ----------
    fname : str
        Filename for the txt (relative to `path`).
    intro_words : int, optional
        Word budget for the intro; accumulation stops once it is exceeded. Default 1500.
    conclusion_words : int, optional
        Word budget for the conclusion; accumulation stops once it is exceeded. Default 1000.
    path : str, optional
        Directory containing `fname`. Defaults to the module-level `PATH`, looked up at call time.

    Returns
    -------
    list, list
        The first is a list of intro sentences in document order.
        The second is a list of conclusion sentences in REVERSE document order
        (index 0 is the last sentence of the text).
        For short texts the two lists can overlap; for an empty text both are empty.
    """
    if path is None:
        path = PATH

    # Explicit encoding so decoding does not depend on the platform default. Undecodable bytes
    # become U+FFFD, which cleanUpString turns into a space like any other non-ASCII character.
    with open(os.path.join(path, fname), 'r', encoding='utf-8', errors='replace') as f:
        txt = f.read()

    sents = sent_tokenize(cleanUpString(txt))

    intro_sentences = _take_until_word_limit(sents, intro_words)
    # Walk backwards from the end of the text; the resulting list stays in that reversed order.
    conclusion_sentences = _take_until_word_limit(reversed(sents), conclusion_words)

    return intro_sentences, conclusion_sentences

In [4]:
# {fname1: {'INTRO': [list-of-sents], 'CONCLUSION': [list-of-sents]},
#  fname2: {..} ..}
map_fname_sents = defaultdict(dict)
# NOTE(review): machine-specific absolute path; it must be edited to run this notebook elsewhere.
PATH = '/Users/sunyambagga/Desktop/txtLAB/Sentence-Prediction/txt_nonotes/'

c = 0
# os.listdir returns entries in arbitrary order; sorting makes the processing order
# (and therefore the row order of the resulting table) reproducible across runs.
for fname in sorted(os.listdir(PATH)):
    if '.DS_Store' in fname:
        continue  # skip the .DS_Store entry; it is not one of the txt inputs

    intro_sents, conclusion_sents = get_top_bottom_sentences(fname)

    map_fname_sents[fname]['INTRO'] = intro_sents
    map_fname_sents[fname]['CONCLUSION'] = conclusion_sents

    c += 1

print("Processed {} files.".format(c))
print(len(map_fname_sents))

Processed 230 files.
230


In [5]:
# Flatten {filename: {section: [sentences]}} into parallel column lists for the CSV.
input_dict = {'sentences': [], 'filename': [], 'section': [], 'ID': []}

for fname, sections in map_fname_sents.items():
    stem = fname[:-4]  # filename without its last four characters (the '.txt' suffix)
    for key, section_sents in sections.items():  # key is either 'INTRO' or 'CONCLUSION'
        for sent_number, sent in enumerate(section_sents):
            # Unique ID built from the file stem, the section name and the sentence position.
            ID = '{}__{}__{}'.format(stem, key, sent_number)
            input_dict['sentences'].append(sent)
            input_dict['filename'].append(fname)
            input_dict['section'].append(key)
            input_dict['ID'].append(ID)

# One entry was appended per sentence, so the column length is the sentence count.
total_sents = len(input_dict['sentences'])

print("Total of {} sentences.".format(total_sents))

Total of 16816 sentences.


In [6]:
# Build the test-set frame and append two empty-string label columns.
df = pd.DataFrame(input_dict).assign(neutral="", generalization="")

print(df.shape)
df.head(2)

(16816, 6)


Unnamed: 0,sentences,filename,section,ID,neutral,generalization
0,WHEN ABEL TASMAN SET OFF ON HIS voyage to disc...,ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__0,,
1,1 These typically confident directives encapsu...,ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__1,,


In [7]:
# The write below is left commented out. A later cell reads '../data/TEST_SET.csv', so that
# file must already exist (presumably from an earlier run of this line — confirm).
## df.to_csv('../data/TEST_SET.csv', index=None)

# Part 2) Get CNN predictions on test set:

### 2.1) Load the CNN model trained on 3.4k instances:

In [1]:
import torch
import numpy as np
import torch.nn as nn

from pathlib import Path
from typing import *
from tqdm import tqdm
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules import FeedForward
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.iterators import DataIterator, BasicIterator
from allennlp.data import Instance
from allennlp.models import Model
from allennlp.nn import util as nn_util
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer # for ELMo

import models
import data_reader
import train

In [2]:
# Load vocabulary
vocab = Vocabulary.from_files("../saved_model/vocabulary")
print(vocab)

Vocabulary with namespaces:
 	Non Padded Namespaces: {'*labels', '*tags'}



In [3]:
# Re-instantiate the CNN model so that the saved weights can be loaded into it.
# NOTE(review): these hyper-parameters presumably have to match the ones used at training
# time for the saved state dict to fit — confirm against the training script.
# label_cols order matters: downstream code reads column 0 of the predictions as 'generalization'.
label_cols = ["generalization", "neutral"] # Ordering is important
batch_size = 64
num_filters = 100
filter_sizes = (2,3,4,5)
num_classes = 2

# ELMo embeddings via the project's helper (large variant requested).
word_embeddings = train.load_elmo_embeddings(large=True)

# CNN encoder: input size is taken from the embedder's output dimension.
encoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                     num_filters=num_filters,
                     ngram_filter_sizes=filter_sizes)

# Feedforward: a single linear layer from the encoder output to the class scores.
classifier_feedforward = nn.Linear(encoder.get_output_dim(), num_classes)


# Assemble the project's Classifier from the pieces above.
model = models.Classifier(vocab=vocab,
                          word_embeddings=word_embeddings,
                          encoder=encoder,
                          classifier_feedforward=classifier_feedforward)

Loading LARGE ELMo..
Pre-trained ELMo loaded..


In [4]:
# Load model:
# Load the trained weights into the re-instantiated model.
# map_location='cpu' remaps any CUDA-saved tensors to CPU, so the checkpoint also loads on a
# machine without a GPU; load_state_dict then copies the values into the model's parameters.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
with open("../saved_model/cnn_elmo.th", 'rb') as f:
    model.load_state_dict(torch.load(f, map_location='cpu'))

### 2.2) Predict on the test set (16.8k instances):

In [5]:
class Predictor:
    """
    Runs a model over a dataset batch by batch and collects its class probabilities.

    Parameters
    ----------
    model : Model
        The model to run; its output dict must contain "class_probabilities".
    iterator : DataIterator
        Iterator used to batch the dataset.
    cuda_device : int
        Device index that batches are moved to; -1 (default) keeps everything on CPU.
    """
    def __init__(self, model: Model, iterator: DataIterator, cuda_device: int=-1) -> None:
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device
        # predict() moves every batch to `cuda_device`, so the model has to live on the
        # same device or the forward pass fails with a device mismatch.
        if cuda_device >= 0:
            self.model = self.model.cuda(cuda_device)

    def _extract_data(self, batch) -> np.ndarray:
        """Run one batch through the model and return its class probabilities as a numpy array."""
        out_dict = self.model(**batch)
        probs = out_dict["class_probabilities"]
        # Tensors (in particular CUDA tensors) cannot be fed to np.concatenate reliably;
        # convert explicitly so the declared np.ndarray return type actually holds.
        if isinstance(probs, torch.Tensor):
            probs = probs.detach().cpu().numpy()
        return probs

    def predict(self, dataset: Iterable[Instance]) -> np.ndarray:
        """
        Return the per-batch class probabilities for `dataset`, concatenated along axis 0.

        The iterator is run for a single epoch with shuffle=False, and the model is put in
        eval mode with gradients disabled.
        """
        pred_generator = self.iterator(dataset, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = tqdm(pred_generator,
                                   total=self.iterator.get_num_batches(dataset))
        preds = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return np.concatenate(preds, axis=0)
    
def make_predictions(model, vocab, test_dataset, batch_size, use_gpu=False):
    """
    Run `model` over `test_dataset` in batches of `batch_size` and return what
    Predictor.predict produces for it.

    `vocab` is used to index the batches; `use_gpu` selects CUDA device 0 instead of CPU (-1).
    """
    device = 0 if use_gpu else -1

    # A plain BasicIterator, so the dataset is walked without changing its order.
    batch_iterator = BasicIterator(batch_size)
    batch_iterator.index_with(vocab)

    return Predictor(model, batch_iterator, cuda_device=device).predict(test_dataset)

def map_id_prediction(pred_probs, test_dataset):
    """
    Build a lookup from sample ID to predicted label.

    'pred_probs' holds one row of predicted probabilities per sample, paired positionally
    with 'test_dataset'. Column 0 is the 'generalization' probability (the label order is
    ['generalization', 'neutral']); a value of 0.5 or more yields 'generalization',
    anything lower yields 'neutral'.

    Returns a dictionary with key = ID | value = prediction
    """
    return {
        sample.fields['ID'].metadata: ('generalization' if prediction[0] >= 0.5 else 'neutral')
        for prediction, sample in zip(pred_probs, test_dataset)
    }

def tokenizer(x: str):
    """Split `x` with SpacyWordSplitter ('en_core_web_sm', no POS tags) and return the tokens' text as a list."""
    splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False)
    tokens = splitter.split_words(x)
    return [token.text for token in tokens]

In [6]:
# Load test dataset:
token_indexer = ELMoTokenCharactersIndexer()

reader = data_reader.GeneralizationDatasetReader(tokenizer=tokenizer, token_indexers={"tokens": token_indexer},
                                                 label_cols=label_cols)

DATA_ROOT = Path("../data/")
test_fname = './TEST_SET.csv'
test_dataset = reader.read(file_path=DATA_ROOT / test_fname)

16816it [00:22, 747.58it/s]


In [7]:
# Preview test set (labels should be empty)
print(test_dataset[0])
print("Label: ", vars(test_dataset[0]['label']))

Instance with fields:
 	 tokens: TextField of length 43 with text: 
 		[WHEN, ABEL, TASMAN, SET, OFF, ON, HIS, voyage, to, discover, the, fabled, Zuijdlandt, ,, or, South,
		Land, ,, he, carried, with, him, instructions, to, take, possession, of, all, continents, and,
		islands, which, you, should, discover, ,, call, at, or, set, foot, on, .]
 		and TokenIndexers : {'tokens': 'ELMoTokenCharactersIndexer'} 
 	 ID: MetadataField (print field.metadata to see specific information). 
 	 label: ArrayField with shape: (2,) and dtype: <class 'numpy.float32'>. 

Label:  {'array': array([nan, nan], dtype=object), 'padding_value': 0, 'dtype': <class 'numpy.float32'>}


In [8]:
# Predict:
preds = make_predictions(model, vocab, test_dataset, batch_size) # Note that 'preds' is of the shape (number of samples, 2) - the columns represent the probabilities for the two classes ['generalization', 'neutral']

# Map it to IDs:
id_pred = map_id_prediction(preds, test_dataset)

100%|██████████| 263/263 [1:22:27<00:00, 18.81s/it]


In [11]:
import pandas as pd

# Reload the test set, attach each row's predicted label by looking up its ID in id_pred,
# and drop the two label columns.
df = (
    pd.read_csv('../data/TEST_SET.csv')
    .assign(prediction=lambda frame: frame['ID'].map(id_pred))
    .drop(['neutral', 'generalization'], axis=1)
)

print(df.shape)
df.head(2)

(16816, 5)


Unnamed: 0,sentences,filename,section,ID,prediction
0,WHEN ABEL TASMAN SET OFF ON HIS voyage to disc...,ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__0,neutral
1,1 These typically confident directives encapsu...,ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__1,generalization


In [12]:
# The write below is left commented out. The next cell reads
# '../predictions/TEST_SET_predictions.csv', so that file must already exist
# (presumably from an earlier run of this line — confirm).
## df.to_csv('../predictions/TEST_SET_predictions.csv', index=None)

In [20]:
# Check:
t = pd.read_csv('../predictions/TEST_SET_predictions.csv')
print(t.shape)
t.head()

(16816, 5)


Unnamed: 0,sentences,filename,section,ID,prediction
0,WHEN ABEL TASMAN SET OFF ON HIS voyage to disc...,ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__0,neutral
1,1 These typically confident directives encapsu...,ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__1,generalization
2,"Stepping ashore, frequently without pausing fo...",ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__2,neutral
3,"This was not done, at least in the first place...",ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__3,neutral
4,Tasman s instructions stipulated that he could...,ahr.2016.121.1.17_nonotes.txt,INTRO,ahr.2016.121.1.17_nonotes__INTRO__4,neutral


# fin.