# extract ScienceIE data
- Training set: https://drive.google.com/open?id=0B2Z1kbILu3YtYjkwMHd3TmNPWDQ
- Development set: https://drive.google.com/open?id=0B2Z1kbILu3YtNDE1R0h5c2tQclU
- Test set: https://drive.google.com/open?id=0B2Z1kbILu3YtMUlfaWZDN0FSUms

Place scienceie2017_dev.zip, scienceie2017_train.zip, and semeval_articles_test.zip in repo root.

In [1]:
# !mkdir data/temp
# !unzip -o ./scienceie2017_dev.zip -d data/temp
# !unzip -o ./scienceie2017_train.zip -d data/temp
# !unzip -o ./semeval_articles_test.zip -d data/temp
# !mv data/temp/semeval_articles_test data/test
# !mv data/temp/train2 data/train
# !mv data/temp/dev data/dev
# !rm -rf data/temp

## convert annotations to BIO (tag unannotated words as O)

In [1]:
import os
import codecs

def convert_anns(path):
    annfiles = [f for f in os.listdir(path) if f.endswith('.ann')]
    
    tag_out = []
    
    for af in annfiles:
        
        annfile = os.path.join(path, af)
        txtfile = annfile.replace(".ann", ".txt")
        paragraph_ = codecs.open(txtfile, "r", "utf-8")
        paragraph = paragraph_.read()
        paragraph = paragraph.strip()

        lastind = 0
        last_offset = 0
        
        with open(annfile) as f:
            for line in f:
                if line.startswith('T'):
                    
                    lsplit = line.split('\t')
                    tid, t_cinds, t_words = lsplit
                    t_words=t_words.strip()
                    try:
                        tsplit = t_cinds.split(' ')
                        tag = tsplit[0]
                        start = tsplit[1]
                        end = tsplit[-1]
                    except:
                        print t_cinds
                        print annfile
                    start=int(start)
                    end=int(end)
                    if start!=lastind:
                        owords=paragraph[lastind:start+last_offset].split(' ')
                        for oword in owords:
                            if oword.strip()!='':
                                tag_out+=[oword +' O']
                    lastind=end+last_offset
                    offset = 0
                    while(not paragraph[(start+offset):(end+offset)]==t_words):
                        if offset>50:
                            break
                        offset+=1
                    last_offset = offset
                    words=paragraph[(start+offset):(end+offset)].split(' ')
                    for wind in range(len(words)):
                        if words[wind].strip() == '':
                                next
                        else:
                            if wind==0:
                                tag_out+=[words[wind] +' B-'+tag]
                            else:
                                tag_out+=[words[wind] +' I-'+tag]

        owords=paragraph[lastind+1+offset:len(paragraph)].split(' ')
        for oword in owords:
                if oword.strip()!='':
                    tag_out+=[oword +' O']
        tag_out+=['']
                    
#     with open(path+'.txt', 'w') as out:
    with codecs.open(path+'.txt', 'w', 'utf-8') as out:
        out.write('\n'.join(tag_out))
   

In [2]:
for p in ['data/dev', 'data/train', 'data/test']:
# for p in ['data/test']:
    convert_anns(p)

OSError: [Errno 2] No such file or directory: 'data/dev'

In [3]:
!wc -l data/dev.txt
!wc -l data/train.txt
!wc -l data/test.txt

20131 data/dev.txt
106918 data/train.txt
37755 data/test.txt


In [8]:
#### process full texts from training set for subword embeddings
import nltk
import codecs

import sys
stdout=sys.stdout
reload(sys)
sys.setdefaultencoding('utf8')
sys.stdout=stdout

from model.xml_utils import parseXML


def convert_xmls(path):
    xmlfiles = [f for f in os.listdir(path) if f.endswith('.xml')]
    doclist = []
    for f in xmlfiles:
        _, _, sents = parseXML(os.path.join(path, f))
        doclist+=nltk.sent_tokenize(sents)
        
    with codecs.open('data/'+os.path.basename(path)+'_full.txt', 'w', 'utf-8') as out:
        out.write('\n'.join(doclist))


In [5]:
convert_xmls('data/train')

In [6]:
convert_xmls('data/dev')

# HMM assignment 5 formatting

In [4]:
import codecs

corpus=[]
current_sent=[]
last_token=''
err='none'

with codecs.open("data/test.txt", 'r', 'utf-8') as f:
    for line in f.readlines():
        if not line in [u'\n', u'\r\n']:
            token, tag = line.split(' ')
            if last_token.endswith('.') and not last_token in ['e.g.', 'i.e.']:
                if token[0].isupper():
                    corpus.append(current_sent)
                    current_sent=[]
            current_sent.append((token, tag.strip()))
        else:
            corpus.append(current_sent)
            current_sent=[]
    corpus.append(current_sent)


In [9]:
import os
# os.chdir('../2017-fall-main/assignment/a5/part1')

import pos
import nltk

hmm = pos.HMM()
for sentence in corpus:
    hmm.update_counts(sentence)
hmm.compute_logprobs()

def pretty_print_fb(sentence):
    print sentence
    print hmm.forward_backward(sentence.split())
    
pretty_print_fb(' '.join([i for i,j in corpus[0]]))

This paper proposes a sentence stress feedback system in which sentence stress prediction, detection, and feedback provision models are combined. This system provides non-native learners with feedback on sentence stress errors so that they can improve their English rhythm and fluency in a self-study setting. The sentence stress feedback system was devised to predict and detect the sentence stress of any practice sentence. The accuracy of the prediction and detection models was 96.6% and 84.1%, respectively. The stress feedback provision model offers positive or negative stress feedback for each spoken word by comparing the probability of the predicted stress pattern with that of the detected stress pattern. In an experiment that evaluated the educational effect of the proposed system incorporated in our CALL system sentence stress feedback system in which sentence stress prediction, detection, and feedback provision models are combined. This system provides non-native learners with fee

# acquire GloVe vectors

In [5]:
# !wget -P ./data/ "http://nlp.stanford.edu/data/glove.6B.zip"
# !unzip ./data/glove.6B.zip -d data/glove.6B/
# !rm ./data/glove.6B.zip

--2017-12-15 02:38:37--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2017-12-15 02:38:37--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘./data/glove.6B.zip’


2017-12-15 02:38:58 (39.3 MB/s) - ‘./data/glove.6B.zip’ saved [862182613/862182613]

Archive:  ./data/glove.6B.zip
  inflating: data/glove.6B/glove.6B.50d.txt  
  inflating: data/glove.6B/glove.6B.100d.txt  
  inflating: data/glove.6B/glove.6B.200d.txt  
  inflating: data/glove.6B/glove.6B.300d.txt  


In [5]:
# !wget -P ./data/ "http://nlp.stanford.edu/data/glove.840B.300d.zip"
!unzip ./data/glove.840B.300d.zip -d data/glove.840B/
!rm ./data/glove.840B.zip

Archive:  ./data/glove.840B.300d.zip
  inflating: data/glove.840B/glove.840B.300d.txt  
rm: cannot remove './data/glove.840B.zip': No such file or directory


## construct subword vectors from fastText

In [7]:
# !git clone https://github.com/facebookresearch/fastText.git fasttext
!conda install cmake
!conda install -c conda-forge pybind11

Fetching package metadata ...........
Solving package specifications: .

Package plan for installation in environment /home/beatspace9/anaconda2:

The following NEW packages will be INSTALLED:

    bzip2:        1.0.6-h6d464ef_2             
    cmake:        3.9.4-h142f0e9_0             
    libgcc-ng:    7.2.0-h7cc24e2_2             
    libprotobuf:  3.4.1-h5b8497f_0             
    libstdcxx-ng: 7.2.0-h7a57d05_2             
    libuv:        1.14.0-h56b52c2_0            
    ncurses:      6.0-h9df7e31_2               
    rhash:        1.3.5-hbf7ad62_1             

The following packages will be UPDATED:

    conda:        4.3.29-py27_0     conda-forge --> 4.3.30-py27h6ae6dc7_0
    conda-env:    2.6.0-0           conda-forge --> 2.6.0-h36134e3_1     
    protobuf:     3.4.0-py27_0      conda-forge --> 3.4.1-py27h2ba6a9c_0 
    xz:           5.2.2-1                       --> 5.2.3-h55aa19d_2     
    zlib:         1.2.8-3                       --> 1.2.11-ha838bed_2    

Proceed (

In [28]:
!cd fasttext
!mkdir build && cd build && cmake ..
!make && make install

In [70]:
# !cd ../data
# !mkdir subwords
# !./fasttext skipgram -input fulltxts.txt -output subwords/fasttext.100d.txt -dim 100
# !./fasttext skipgram -input fulltxts.txt -output subwords/fasttext.300d.txt -dim 300
# !sed 1,1d subwords/fasttext.100d.txt > subwords/fasttext.100d.txt
# !sed 1,1d subwords/fasttext.300d.txt > subwords/fasttext.300d.txt

terminate called after throwing an instance of 'std::invalid_argument'
  what():  fulltxts.txt cannot be opened for training!
Aborted (core dumped)


## extract vocab from data and matching GloVe vectors
Bi-LSTM code adapted from https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html

In [1]:
from model.config import Config
from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
    get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
    export_trimmed_glove_vectors, get_processing_word
    
config = Config(load=False)

### setup variables

In [2]:
config.dim_word = 300
config.dim_char = 100
config.dim_subword = 300

# config.filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(config.dim_word)
# config.filename_trimmed = "data/glove.6B.{}d.trimmed.npz".format(config.dim_word)
# config.use_pretrained = True

config.filename_glove = "data/glove.840B/glove.840B.{}d.txt".format(config.dim_word)
config.filename_trimmed = "data/glove.840B.{}d.trimmed.npz".format(config.dim_word)
config.use_pretrained = True

config.filename_subwords = "data/subwords/fasttext.{}d.txt".format(config.dim_subword)
config.filename_swtrimmed = "data/fasttext.{}d.trimmed.npz".format(config.dim_subword)
config.use_subwords = True

# dataset
config.filename_dev = "data/dev.txt"
config.filename_test = "data/test.txt"
config.filename_train = "data/train.txt"
# config.filename_dev = config.filename_test = config.filename_train = "data/test.txt" # test

max_iter = None # if not None, max number of examples in Dataset

config.filename_words = "data/words.txt"
config.filename_tags = "data/tags.txt"
config.filename_chars = "data/chars.txt"

### model parameters

In [3]:
# training
config.train_embeddings = False
config.nepochs          = 15
config.dropout          = 0.5
config.batch_size       = 20
config.lr_method        = "adam"
config.lr               = 0.001
config.lr_decay         = 0.9
config.clip             = -1 # if negative, no clipping
config.nepoch_no_imprv  = 3

# model hyperparameters
config.hidden_size_char = 100 # lstm on chars
config.hidden_size_lstm = 300 # lstm on word embeddings

config.use_crf = False
config.use_chars = True
config.use_glove = True

# character CNN parameters
# config.feature_maps=[50, 100, 150, 200, 200, 200, 200]
# config.kernels=[1,2,3,4,5,6,7]
# config.num_filters=128
# config.filter_sizes=[3,5,8]


In [4]:
processing_word = get_processing_word(lowercase=True)

# Generators
dev   = CoNLLDataset(config.filename_dev, processing_word)
test  = CoNLLDataset(config.filename_test, processing_word)
train = CoNLLDataset(config.filename_train, processing_word)

# Build Word and Tag vocab
vocab_words, vocab_tags = get_vocabs([train, dev, test])
vocab_glove = get_glove_vocab(config.filename_glove)
vocab_subwords = get_glove_vocab(config.filename_subwords)

vocab = vocab_words & vocab_glove & vocab_subwords
vocab.add(UNK)
vocab.add(NUM)

# Save vocab
write_vocab(vocab, config.filename_words)
write_vocab(vocab_tags, config.filename_tags)

# Trim GloVe Vectors
vocab = load_vocab(config.filename_words)
export_trimmed_glove_vectors(vocab, config.filename_glove,
                            config.filename_trimmed, config.dim_word)
export_trimmed_glove_vectors(vocab, config.filename_subwords,
                            config.filename_swtrimmed, config.dim_subword)



Building vocab...
- done. 15346 tokens
Building vocab...
- done. 2196016 tokens
Building vocab...
- done. 18614 tokens
Writing vocab...
- done. 6205 tokens
Writing vocab...
- done. 7 tokens


In [5]:
# Build and save char vocab
train = CoNLLDataset(config.filename_train)
vocab_chars = get_char_vocab(train)
write_vocab(vocab_chars, config.filename_chars)

Writing vocab...
- done. 154 tokens


### build model

In [6]:
from model.ner_model import NERModel

config.load()

model = NERModel(config)
model.build()
# model.restore_session("results/crf/model.weights/") # optional, restore weights
# model.reinitialize_weights("proj")

# create datasets
dev   = CoNLLDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)
train = CoNLLDataset(config.filename_train, config.processing_word,
                     config.processing_tag, config.max_iter)

# train model
model.train(train, dev)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Initializing tf session
Epoch 1 out of 15




acc 85.41 - f1 0.00 - precision 0.00
- new best score!
Epoch 2 out of 15




acc 85.44 - f1 0.00 - precision 0.00
- new best score!
Epoch 3 out of 15




acc 85.58 - f1 3.15 - precision 51.69
- new best score!
Epoch 4 out of 15




acc 86.08 - f1 12.75 - precision 50.00
- new best score!
Epoch 5 out of 15




acc 86.51 - f1 18.02 - precision 51.07
- new best score!
Epoch 6 out of 15




acc 86.65 - f1 22.83 - precision 48.07
- new best score!
Epoch 7 out of 15




acc 86.86 - f1 24.49 - precision 50.11
- new best score!
Epoch 8 out of 15




acc 87.02 - f1 26.59 - precision 51.42
- new best score!
Epoch 9 out of 15




acc 87.17 - f1 27.51 - precision 52.75
- new best score!
Epoch 10 out of 15




acc 87.27 - f1 29.62 - precision 53.05
- new best score!
Epoch 11 out of 15




acc 87.27 - f1 30.59 - precision 51.20
- new best score!
Epoch 12 out of 15




acc 87.31 - f1 31.39 - precision 51.57
- new best score!
Epoch 13 out of 15




acc 87.33 - f1 31.62 - precision 51.72
- new best score!
Epoch 14 out of 15




acc 87.33 - f1 31.74 - precision 51.63
- new best score!
Epoch 15 out of 15




acc 87.31 - f1 33.22 - precision 50.39
- new best score!


# evaluation

In [7]:
test  = CoNLLDataset(config.filename_test, config.processing_word,
                         config.processing_tag, config.max_iter)

model.evaluate(test)

Testing model over test set
acc 87.49 - f1 23.16 - precision 37.23


In [8]:
from model.data_utils import CoNLLDataset
from model.ner_model import NERModel
from model.config import Config


def align_data(data):
    """Given dict with lists, creates aligned strings
    Adapted from Assignment 3 of CS224N
    Args:
        data: (dict) data["x"] = ["I", "love", "you"]
              (dict) data["y"] = ["O", "O", "O"]
    Returns:
        data_aligned: (dict) data_align["x"] = "I love you"
                           data_align["y"] = "O O    O  "
    """
    spacings = [max([len(seq[i]) for seq in data.values()])
                for i in range(len(data[list(data.keys())[0]]))]
    data_aligned = dict()

    # for each entry, create aligned string
    for key, seq in data.items():
        str_aligned = ""
        for token, spacing in zip(seq, spacings):
            str_aligned += token + " " * (spacing - len(token) + 1)

        data_aligned[key] = str_aligned

    return data_aligned



def tag_text(model, text):
    words_raw = text.strip().split(" ")

    preds = model.predict(words_raw)
    to_print = align_data({"input": words_raw, "output": preds})

#     for key, seq in to_print.items():
#         model.logger.info(seq)
    return words_raw, preds
        
# test_sent="First results from RHIC on charged multiplicities, evolution of multiplicities with centrality, particle ratios and transverse momentum distributions in central and minimum bias collisions, are analyzed in a string model which includes hard collisions, collectivity in the initial state considered as string fusion, and rescattering of the produced secondaries. Multiplicities and their evolution with centrality are successfully reproduced. Transverse momentum distributions in the model show a larger pT-tail than experimental data, disagreement which grows with increasing centrality. Discrepancies with particle ratios appear and are examined comparing with previous features of the model at SPS."

In [9]:
# !wget https://gist.githubusercontent.com/onepunchdan/bf83950297e9b8ec8a50d33d3a1ecf59/raw/0bea2b99f76e751606bbf08972b91ce75e1dab3a/gistfile1.txt

In [10]:
import codecs
ps=[]

with codecs.open("gistfile1.txt", 'r', 'utf-8') as f:
    for l in f.readlines():
        if l.strip()!='':
            ps.append(l.strip())

In [11]:
best_sent=u'Recent astronomical observations of high redshift type Ia supernovae performed by two groups [1\u20133] as well as the power spectrum of the cosmic microwave background radiation obtained by the BOOMERANG [4] and MAXIMA-1 [5] experiments seem to indicate that at present the Universe is in a state of accelerated expansion. If one analyzes these data within the Friedmann\u2013Robertson\u2013Walker (FRW) standard model of cosmology their most natural interpretation is that the Universe is spatially flat and that the (baryonic plus dark) matter density \u03c1 is about one third of the critical density \u03c1crit. Most interestingly, the dominant contribution to the energy density is provided by the cosmological constant \u039b. The vacuum energy density (1.1)\u03c1\u039b\u2261\u039b/(8\u03c0G) is about twice as large as \u03c1, i.e., about two thirds of the critical density. With \u03a9M\u2261\u03c1/\u03c1crit, \u03a9\u039b\u2261\u03c1\u039b/\u03c1crit and \u03a9tot\u2261\u03a9M+\u03a9\u039b: (1.2)\u03a9M\u22481/3,\u03a9\u039b\u22482/3,\u03a9tot\u22481. This implies that the deceleration parameter q is approximately \u22121/2. While originally the cosmological constant problem [6] was related to the question why \u039b is so unnaturally small, the discovery of the important role played by \u03c1\u039b has shifted the emphasis toward the \u201ccoincidence problem\u201d, the question why \u03c1 and \u03c1\u039b happen to be of the same order of magnitude precisely at this very moment [7].'

In [12]:
tag_text(model, best_sent)

([u'Recent',
  u'astronomical',
  u'observations',
  u'of',
  u'high',
  u'redshift',
  u'type',
  u'Ia',
  u'supernovae',
  u'performed',
  u'by',
  u'two',
  u'groups',
  u'[1\u20133]',
  u'as',
  u'well',
  u'as',
  u'the',
  u'power',
  u'spectrum',
  u'of',
  u'the',
  u'cosmic',
  u'microwave',
  u'background',
  u'radiation',
  u'obtained',
  u'by',
  u'the',
  u'BOOMERANG',
  u'[4]',
  u'and',
  u'MAXIMA-1',
  u'[5]',
  u'experiments',
  u'seem',
  u'to',
  u'indicate',
  u'that',
  u'at',
  u'present',
  u'the',
  u'Universe',
  u'is',
  u'in',
  u'a',
  u'state',
  u'of',
  u'accelerated',
  u'expansion.',
  u'If',
  u'one',
  u'analyzes',
  u'these',
  u'data',
  u'within',
  u'the',
  u'Friedmann\u2013Robertson\u2013Walker',
  u'(FRW)',
  u'standard',
  u'model',
  u'of',
  u'cosmology',
  u'their',
  u'most',
  u'natural',
  u'interpretation',
  u'is',
  u'that',
  u'the',
  u'Universe',
  u'is',
  u'spatially',
  u'flat',
  u'and',
  u'that',
  u'the',
  u'(baryonic',
  u

In [13]:
plist=[]
for p in ps:
    words, preds = tag_text(model, p)
    tagnum=sum([1 for i in preds if i!='O'])
    plist.append([tagnum, words])


  if char in vocab_chars:


In [14]:
tag_text(model, ' '.join(max(plist)[1]))

([u'A',
  u'hydroxyl-functionalized',
  u'poly(butylene',
  u'succinate)',
  u'based',
  u'polyester',
  u'was',
  u'prepared',
  u'by',
  u'conventional',
  u'polycondensation',
  u'of',
  u'benzyl-protected',
  u'dimethyl',
  u'malonate',
  u'and',
  u'1,4-butanediol',
  u'(Scheme',
  u'2(a))',
  u'[24a].',
  u'Yao',
  u'et\xa0al.',
  u'reported',
  u'on',
  u'the',
  u'direct',
  u'polycondensation',
  u'of',
  u'l-lactic',
  u'acid',
  u'and',
  u'citric',
  u'acid',
  u'with',
  u'the',
  u'formation',
  u'of',
  u'poly[(l-lactic',
  u'acid)-co-(citric',
  u'acid)],',
  u'obtaining',
  u'a',
  u'polyester',
  u'oligomer',
  u'with',
  u'both',
  u'pendant',
  u'carboxylic',
  u'and',
  u'hydroxyl',
  u'groups',
  u'[24b].',
  u'This',
  u'PLCA',
  u'oligomer',
  u'was',
  u'reacted',
  u'with',
  u'dihydroxylated',
  u'PLLA',
  u'as',
  u'a',
  u'macromonomer,',
  u'yielding',
  u'a',
  u'PLCA\u2013PLLA',
  u'multiblock',
  u'copolymer',
  u'as',
  u'shown',
  u'in',
  u'Scheme',
