# GAP heuristics

Taken as is from this [public kernel](https://www.kaggle.com/sattree/2-reproducing-gap-results).

In [1]:
# change these if needed
!head ../input/test_stage_2.tsv > ../input/toy_train.tsv 
PATH_TO_TRAIN = '../input/toy_train.tsv' # dummy file, just in order not to change the code below
PATH_TO_TEST = '../input/test_stage_2.tsv'
PATH_OUT_TRAIN_FEAT = '../features/toy_train_gap_heuristics.tsv'
PATH_OUT_TEST_FEAT = '../features/test_gap_heuristics.tsv'

## 1. Download necessary models and install dependencies

In [2]:
%%time
# Download and install all dependencies
# gpr_pub contains the heuristics models and supplementary code
# Every command below is commented out, so this cell does nothing as committed;
# uncomment the lines that are needed for a first-time setup.
#!git clone https://github.com/sattree/gpr_pub.git
# NOTE(review): the download target below is an absolute, user-specific path - adjust before use.
#!wget -P /home/kashn500/heavy_models/ http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
# NOTE(review): no -d option is given, so the archive presumably unpacks into the current
# directory, which is where the relative STANFORD_CORENLP_PATH used later points - confirm.
#!unzip /home/kashn500/heavy_models/stanford-corenlp-full-2018-10-05.zip
#!pip install allennlp --ignore-installed greenlet
#!pip install attrdict

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 12.2 µs


In [3]:
from sklearn.metrics import log_loss, classification_report
from attrdict import AttrDict

import spacy
import pandas as pd
from allennlp.predictors.predictor import Predictor
from allennlp.models.archival import load_archive
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

import sys
sys.path.append('../../')
from gpr_pub.utils import CoreNLPServer

# gap_scorer_ext has minor fixes for py3 and to take pandas df as input instead of filepaths
from gpr_pub.gap.gap_scorer_ext import read_annotations, calculate_scores, add_to_score_view

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


## 2. Initialize models

In [4]:
# Heuristic models implement coref resolution based on heuristics described in the paper
# Pronoun resolution is a simple wrapper to convert coref predictions into class-specific labels
# Multi pass sieve model implements backoff mechanism
from gpr_pub.models.heuristics.random_distance import RandomModel
from gpr_pub.models.heuristics.token_distance import TokenDistanceModel
from gpr_pub.models.heuristics.syntactic_distance import StanfordSyntacticDistanceModel
from gpr_pub.models.heuristics.parallelism import AllenNLPParallelismModel as ParallelismModel
from gpr_pub.models.heuristics.url_title import StanfordURLTitleModel as URLModel

from gpr_pub.models.pronoun_resolution import PronounResolutionModel

from gpr_pub.models.multi_pass_sieve import MultiPassSieveModel

In [5]:
# Instantiate the Stanford CoreNLP server from the unpacked distribution.
STANFORD_CORENLP_PATH = 'stanford-corenlp-full-2018-10-05/'
# 'preload' lists the annotators the server should load at start-up.
# The names must be valid CoreNLP annotator names: the sentence splitter is
# 'ssplit' and the dependency parser is 'depparse' (previously misspelled as
# 'spplit' / 'deparse'). 'pos' is listed because 'lemma' and 'depparse' need
# part-of-speech tags.
# NOTE(review): 'timeout' is presumably in milliseconds - confirm against the
# CoreNLP server options.
server = CoreNLPServer(classpath=STANFORD_CORENLP_PATH,
                        corenlp_options=AttrDict({'port': 9090, 
                                                  'timeout': '600000', 
                                                  'quiet': 'true',
                                                  'preload': 'tokenize,ssplit,pos,lemma,parse,depparse'}))
server.start()
# URL of the running server; the CoreNLP-backed parser connects through it.
STANFORD_SERVER_URL = server.url

In [6]:
# Disabled install of pinned cymem / spaCy versions.
# NOTE(review): presumably the versions this notebook was run with - confirm before re-enabling.
# !pip install cymem==1.31.2 spacy==2.0.12

In [7]:
# Instantiate base models

# GPU index on which the AllenNLP dependency parser is loaded. Kept at 1 to
# match the original run; a host with a single GPU needs 0 here.
# NOTE(review): AllenNLP documents -1 as "run on CPU" - confirm for the installed version.
ALLEN_CUDA_DEVICE = 1

# Constituency parser client talking to the CoreNLP server started earlier.
STANFORD_MODEL = CoreNLPParser(url=STANFORD_SERVER_URL)
# Large English spaCy pipeline shared by the spaCy-based heuristics.
SPACY_MODEL = spacy.load('en_core_web_lg')
# Pre-trained biaffine dependency parser archive, fetched by URL.
model_url = 'https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz'
archive = load_archive(model_url, cuda_device=ALLEN_CUDA_DEVICE)
ALLEN_DEP_MODEL = Predictor.from_archive(archive)

Did not use initialization regex that was passed: .*weight_ih.*
Did not use initialization regex that was passed: .*bias_hh.*
Did not use initialization regex that was passed: .*bias_ih.*
Did not use initialization regex that was passed: .*weight_hh.*


In [8]:
# Build every heuristic coreference model and wrap each one in a
# PronounResolutionModel. The wrappers are the objects the sieve consumes.

# Worker count passed to the two wrappers whose coref model uses STANFORD_MODEL.
stanford_n_jobs = 12

# -- random baseline -------------------------------------------------------
random_coref_model = RandomModel(SPACY_MODEL)
random_proref_model = PronounResolutionModel(random_coref_model)

# -- token distance --------------------------------------------------------
token_distance_coref_model = TokenDistanceModel(SPACY_MODEL)
token_distance_proref_model = PronounResolutionModel(token_distance_coref_model)

# -- syntactic distance ----------------------------------------------------
syntactic_distance_coref_model = StanfordSyntacticDistanceModel(STANFORD_MODEL)
syntactic_distance_proref_model = PronounResolutionModel(
    syntactic_distance_coref_model,
    n_jobs=stanford_n_jobs,
)

# -- parallelism -----------------------------------------------------------
parallelism_coref_model = ParallelismModel(ALLEN_DEP_MODEL, SPACY_MODEL)
parallelism_proref_model = PronounResolutionModel(parallelism_coref_model)

# -- URL / page title ------------------------------------------------------
url_title_coref_model = URLModel(STANFORD_MODEL)
url_title_proref_model = PronounResolutionModel(
    url_title_coref_model,
    n_jobs=stanford_n_jobs,
)

## 3. Featurize train data

In [9]:
# Read the tab-separated train file, then normalise the header:
# lower-case every column name and turn hyphens into underscores.
train_df = pd.read_csv(PATH_TO_TRAIN, sep='\t')
train_df.columns = [name.lower().replace('-', '_') for name in train_df.columns]

In [10]:
%%time
# Creates sieve pipeline of heuristic models, applying each new heuristic with appropriate backoff models
# Multi pass sieve - order of models provided as input is important
#    - left to right: recall increases
#    - right to left: precision increases
preds = MultiPassSieveModel(random_proref_model).predict(train_df)
score_df = add_to_score_view(preds, train_df, None, 'Random')

preds = MultiPassSieveModel(token_distance_proref_model).predict(train_df)
score_df = add_to_score_view(preds, train_df, score_df, 'Token Distance')

preds = MultiPassSieveModel(syntactic_distance_proref_model,
                           token_distance_proref_model).predict(train_df)
score_df = add_to_score_view(preds, train_df, score_df, 'Syntactic Distance')

preds = MultiPassSieveModel(parallelism_proref_model,
                            syntactic_distance_proref_model,
                           token_distance_proref_model).predict(train_df)
score_df = add_to_score_view(preds, train_df, score_df, 'Parallelism')

preds = MultiPassSieveModel(url_title_proref_model,
                            parallelism_proref_model,
                            syntactic_distance_proref_model,
                           token_distance_proref_model).predict(train_df)

100%|██████████| 9/9 [00:00<00:00, 34.26it/s]
100%|██████████| 9/9 [00:00<00:00, 43.85it/s]
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   6 out of   9 | elapsed:    0.5s remaining:    0.3s
[Parallel(n_jobs=12)]: Done   9 out of   9 | elapsed:    1.0s finished
100%|██████████| 9/9 [00:00<00:00, 56.48it/s]
  0%|          | 0/9 [00:00<?, ?it/s]Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.
100%|██████████| 9/9 [00:03<00:00,  2.74it/s]
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   6 out of   9 | elapsed:    0.5s remaining:    0.2s
[Parallel(n_jobs=12)]: Done   9 out of   9 | elapsed:    1.0s finished
100%|██████████| 9/9 [00:00<00:00, 52.09it/s]
[Parallel(n_jobs=12)]: Using b

CPU times: user 5min 26s, sys: 1.06 s, total: 5min 27s
Wall time: 13.8 s





In [11]:
# Left disabled: the predictions of the final (URL title first) sieve are
# therefore not added to score_df.
#score_df = add_to_score_view(preds, train_df, score_df, 'Parallelism+URL')

In [12]:
# Sanity check: how many predictions the last sieve run produced.
len(preds)

9

In [13]:
# Put the sieve predictions into a two-column frame (one column per candidate)
# and store the values as unsigned 8-bit integers.
y_pred_train = (
    pd.DataFrame(preds, columns=['gap_A', 'gap_B'])
    .astype('uint8')
)
# y_pred_train['gap_NEITHER'] = 1 - y_pred_train['gap_A'] - y_pred_train['gap_B']

In [14]:
# Write the train features as TSV without the row index.
# `index` is documented as a bool, so pass False explicitly instead of None.
y_pred_train.to_csv(PATH_OUT_TRAIN_FEAT, index=False, sep='\t')

## 4. Featurize test data

In [15]:
# Read the tab-separated test file, then normalise the header:
# lower-case every column name and turn hyphens into underscores.
test_df = pd.read_csv(PATH_TO_TEST, sep='\t')
test_df.columns = [name.lower().replace('-', '_') for name in test_df.columns]

In [16]:
%%time
gap_test_preds = MultiPassSieveModel(url_title_proref_model,
                            parallelism_proref_model,
                            syntactic_distance_proref_model,
                           token_distance_proref_model).predict(test_df)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:   10.2s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   22.5s


08ecf48b1227bbf0166c60d47642f349, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   39.4s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:  1.0min
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:  1.5min


262b1f9572748fadaee4adab228604fb, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 2426 tasks      | elapsed:  2.0min
[Parallel(n_jobs=12)]: Done 3176 tasks      | elapsed:  2.6min


421c24e4bbea5be2eba1ae7ea8eca67a, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 4026 tasks      | elapsed:  3.3min
[Parallel(n_jobs=12)]: Done 4976 tasks      | elapsed:  4.0min
[Parallel(n_jobs=12)]: Done 6026 tasks      | elapsed:  4.8min


8252a7df396e6a6800eaf4dc829e20e1, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 7176 tasks      | elapsed:  5.8min
[Parallel(n_jobs=12)]: Done 8426 tasks      | elapsed:  6.7min
[Parallel(n_jobs=12)]: Done 9776 tasks      | elapsed:  7.8min


d9c3a3a20d16502e4f621ef84676b5ce, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 11226 tasks      | elapsed:  9.0min
[Parallel(n_jobs=12)]: Done 12359 out of 12359 | elapsed:  9.9min finished
  1%|          | 70/12359 [00:26<1:11:04,  2.88it/s]

Dependency parse and tokenizer tokens dont match.


  3%|▎         | 364/12359 [02:17<1:17:04,  2.59it/s]

Dependency parse and tokenizer tokens dont match.


  7%|▋         | 807/12359 [05:07<1:19:58,  2.41it/s]

Dependency parse and tokenizer tokens dont match.


 10%|▉         | 1205/12359 [07:35<1:06:38,  2.79it/s]

Dependency parse and tokenizer tokens dont match.


 10%|█         | 1297/12359 [08:08<1:09:35,  2.65it/s]

Dependency parse and tokenizer tokens dont match.


 11%|█         | 1353/12359 [08:31<1:15:06,  2.44it/s]

Dependency parse and tokenizer tokens dont match.


 13%|█▎        | 1575/12359 [09:58<1:01:27,  2.92it/s]

Dependency parse and tokenizer tokens dont match.


 13%|█▎        | 1618/12359 [10:14<1:18:17,  2.29it/s]

Dependency parse and tokenizer tokens dont match.


 13%|█▎        | 1640/12359 [10:23<1:04:56,  2.75it/s]

Dependency parse and tokenizer tokens dont match.


 14%|█▎        | 1674/12359 [10:37<1:08:26,  2.60it/s]

Dependency parse and tokenizer tokens dont match.


 15%|█▍        | 1851/12359 [11:43<1:13:39,  2.38it/s]

Dependency parse and tokenizer tokens dont match.


 18%|█▊        | 2263/12359 [14:21<1:01:57,  2.72it/s]

Dependency parse and tokenizer tokens dont match.


 20%|█▉        | 2448/12359 [15:33<1:04:47,  2.55it/s]

Dependency parse and tokenizer tokens dont match.


 24%|██▍       | 3000/12359 [19:04<56:20,  2.77it/s]  

Dependency parse and tokenizer tokens dont match.


 28%|██▊       | 3407/12359 [21:37<53:24,  2.79it/s]  

Dependency parse and tokenizer tokens dont match.


 28%|██▊       | 3442/12359 [21:51<1:06:16,  2.24it/s]

Dependency parse and tokenizer tokens dont match.


 33%|███▎      | 4086/12359 [25:56<54:07,  2.55it/s]  

Dependency parse and tokenizer tokens dont match.


 41%|████      | 5088/12359 [32:19<45:47,  2.65it/s]  

Dependency parse and tokenizer tokens dont match.


 43%|████▎     | 5282/12359 [33:38<46:11,  2.55it/s]  

Dependency parse and tokenizer tokens dont match.


 48%|████▊     | 5886/12359 [37:29<38:28,  2.80it/s]

Dependency parse and tokenizer tokens dont match.


 54%|█████▍    | 6728/12359 [43:02<38:37,  2.43it/s]

Dependency parse and tokenizer tokens dont match.


 55%|█████▌    | 6805/12359 [43:32<32:39,  2.83it/s]

Dependency parse and tokenizer tokens dont match.


 56%|█████▌    | 6941/12359 [44:24<33:23,  2.70it/s]

Dependency parse and tokenizer tokens dont match.


 57%|█████▋    | 7031/12359 [44:57<35:47,  2.48it/s]

Dependency parse and tokenizer tokens dont match.


 58%|█████▊    | 7130/12359 [45:34<42:30,  2.05it/s]

Dependency parse and tokenizer tokens dont match.


 58%|█████▊    | 7210/12359 [46:10<41:27,  2.07it/s]

Dependency parse and tokenizer tokens dont match.


 59%|█████▉    | 7286/12359 [46:42<33:05,  2.56it/s]

Dependency parse and tokenizer tokens dont match.


 59%|█████▉    | 7344/12359 [47:05<36:17,  2.30it/s]

Dependency parse and tokenizer tokens dont match.


 62%|██████▏   | 7625/12359 [48:49<24:50,  3.18it/s]

Dependency parse and tokenizer tokens dont match.


 66%|██████▌   | 8174/12359 [52:18<33:06,  2.11it/s]

Dependency parse and tokenizer tokens dont match.


 77%|███████▋  | 9557/12359 [1:01:38<24:08,  1.93it/s]

Dependency parse and tokenizer tokens dont match.


 78%|███████▊  | 9596/12359 [1:01:53<16:46,  2.75it/s]

Dependency parse and tokenizer tokens dont match.


 84%|████████▍ | 10359/12359 [1:06:53<14:05,  2.37it/s]

Dependency parse and tokenizer tokens dont match.


 86%|████████▌ | 10618/12359 [1:08:37<11:39,  2.49it/s]

Dependency parse and tokenizer tokens dont match.


 86%|████████▌ | 10654/12359 [1:08:50<14:41,  1.93it/s]

Dependency parse and tokenizer tokens dont match.


 88%|████████▊ | 10904/12359 [1:10:30<10:12,  2.38it/s]

Dependency parse and tokenizer tokens dont match.


 89%|████████▉ | 11000/12359 [1:11:10<10:36,  2.14it/s]

Dependency parse and tokenizer tokens dont match.


 89%|████████▉ | 11015/12359 [1:11:16<08:47,  2.55it/s]

Dependency parse and tokenizer tokens dont match.


 91%|█████████ | 11264/12359 [1:12:53<06:10,  2.95it/s]

Dependency parse and tokenizer tokens dont match.


 91%|█████████▏| 11296/12359 [1:13:04<06:02,  2.93it/s]

Dependency parse and tokenizer tokens dont match.


 92%|█████████▏| 11376/12359 [1:13:35<05:55,  2.77it/s]

Dependency parse and tokenizer tokens dont match.


100%|██████████| 12359/12359 [1:20:15<00:00,  2.49it/s]
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    8.5s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   18.7s


08ecf48b1227bbf0166c60d47642f349, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   35.2s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:   57.2s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:  1.4min


262b1f9572748fadaee4adab228604fb, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 2426 tasks      | elapsed:  1.9min
[Parallel(n_jobs=12)]: Done 3176 tasks      | elapsed:  2.5min


421c24e4bbea5be2eba1ae7ea8eca67a, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 4026 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done 4976 tasks      | elapsed:  3.9min
[Parallel(n_jobs=12)]: Done 6026 tasks      | elapsed:  4.7min


8252a7df396e6a6800eaf4dc829e20e1, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 7176 tasks      | elapsed:  5.6min
[Parallel(n_jobs=12)]: Done 8426 tasks      | elapsed:  6.6min
[Parallel(n_jobs=12)]: Done 9776 tasks      | elapsed:  7.7min


d9c3a3a20d16502e4f621ef84676b5ce, Tokens in parse tree and input sentence don't match.


[Parallel(n_jobs=12)]: Done 11226 tasks      | elapsed:  8.8min
[Parallel(n_jobs=12)]: Done 12359 out of 12359 | elapsed:  9.7min finished
100%|██████████| 12359/12359 [03:33<00:00, 57.81it/s]

CPU times: user 2d 1h 53min 2s, sys: 2min 16s, total: 2d 1h 55min 18s
Wall time: 1h 43min 23s





In [17]:
# Put the test-set sieve predictions into a two-column frame (one column per
# candidate) and store the values as unsigned 8-bit integers.
y_pred_test = (
    pd.DataFrame(gap_test_preds, columns=['gap_A', 'gap_B'])
    .astype('uint8')
)
# y_pred_test['gap_NEITHER'] = 1 - y_pred_test['gap_A'] - y_pred_test['gap_B']

In [18]:
# Write the test features as TSV without the row index.
# `index` is documented as a bool, so pass False explicitly instead of None.
y_pred_test.to_csv(PATH_OUT_TEST_FEAT, index=False, sep='\t')