## Discourse segmentation

(Customizes the multilingual RST segmentation system introduced in https://www.aclweb.org/anthology/W19-2715/ )

Create new train, dev and test sets for Russian; write new model configs and adapt the training/evaluation scripts.
 
Output:

 - models/segmenter/rus.rst.rrt_train{.conll, .conll2003}
 - models/segmenter/rus.rst.rrt_dev{.conll, .conll2003}
 - models/segmenter/rus.rst.rrt_test{.conll, .conll2003}
 - configs & scripts for segmentation models
 - models/segmenter/.../model.tar.gz

In [None]:
import glob
import pandas as pd

### 1. Prepare dataset for model training and evaluation 

In [None]:
from utils.file_reading import SYMBOL_MAP

def prepare_token(token):
    """Return ``token`` with every ``SYMBOL_MAP`` key replaced by its value.

    The replacements are applied one after another in the iteration order of
    ``SYMBOL_MAP``, so a later entry also sees the output of earlier ones.
    """
    cleaned = token
    for needle, substitute in SYMBOL_MAP.items():
        cleaned = cleaned.replace(needle, substitute)

    return cleaned

In [None]:
from isanlp.utils.annotation_conll_converter import AnnotationCONLLConverter

converter = AnnotationCONLLConverter()

In [None]:
from isanlp.annotation import Token, Sentence


def split_by_paragraphs(annot_text, annot_tokens, annot_lemma, annot_morph, annot_postag,
                        annot_syntax_dep_tree):
    """Split one annotated document into per-paragraph chunks.

    A paragraph boundary is placed after every token whose gap to the next
    token (``annot_text[token.end:next_token.begin]``) contains a newline.

    Args:
        annot_text: Raw document text.
        annot_tokens: Document tokens; each must expose ``begin`` and ``end``
            character offsets into ``annot_text``.
        annot_lemma, annot_morph, annot_postag, annot_syntax_dep_tree:
            Per-sentence lists (one inner list per sentence, one item per
            token) that are cut at the same token positions as the tokens.

    Returns:
        A list of dicts, one per paragraph, with the keys ``text``,
        ``tokens``, ``sentences``, ``lemma``, ``morph``, ``postag``,
        ``ud_postag`` and ``syntax_dep_tree``. Sentence spans are re-counted
        so that they index into the chunk's own ``tokens``.
    """

    def split_on_two(sents, boundary):
        """Cut per-sentence lists after the first ``boundary`` items.

        Returns ``(head, tail)``; a sentence straddling the cut is divided
        between the last element of ``head`` and the first element of ``tail``.
        """

        def flat_len(groups):
            return sum(len(group) for group in groups)

        # Find the first sentence index at which the running token count
        # reaches the boundary.
        i = 1
        while flat_len(sents[:i]) < boundary and i < len(sents):
            i += 1

        intersentence_boundary = min(len(sents[i - 1]), boundary - flat_len(sents[:i - 1]))
        return (sents[:i - 1] + [sents[i - 1][:intersentence_boundary]],
                [sents[i - 1][intersentence_boundary:]] + sents[i:])

    def recount_sentences(chunk):
        """Drop empty sentences and rebuild sentence spans relative to the chunk."""
        sentences = []
        lemma = []
        morph = []
        postag = []
        syntax_dep_tree = []
        tokens_cursor = 0

        for i, sent in enumerate(chunk['syntax_dep_tree']):
            if len(sent) > 0:
                sentences.append(Sentence(tokens_cursor, tokens_cursor + len(sent)))
                lemma.append(chunk['lemma'][i])
                morph.append(chunk['morph'][i])
                postag.append(chunk['postag'][i])
                syntax_dep_tree.append(chunk['syntax_dep_tree'][i])
                tokens_cursor += len(sent)

        chunk['sentences'] = sentences
        chunk['lemma'] = lemma
        chunk['morph'] = morph
        chunk['postag'] = postag
        chunk['ud_postag'] = postag
        chunk['syntax_dep_tree'] = syntax_dep_tree

        return chunk

    chunks = []
    prev_right_boundary = -1  # index of the last token of the previous chunk

    for i, token in enumerate(annot_tokens):

        if i < len(annot_tokens) - 1 and '\n' in annot_text[token.end:annot_tokens[i + 1].begin]:
            if prev_right_boundary > -1:
                chunk = {
                    'text': annot_text[annot_tokens[prev_right_boundary].end:token.end + 1].strip(),
                    'tokens': annot_tokens[prev_right_boundary + 1:i + 1]
                }
            else:
                chunk = {
                    'text': annot_text[:token.end + 1].strip(),
                    'tokens': annot_tokens[:i + 1]
                }

            # Peel the tokens of this paragraph off the front of every
            # per-sentence annotation; the remainders feed the next paragraph.
            lemma, annot_lemma = split_on_two(annot_lemma, i - prev_right_boundary)
            morph, annot_morph = split_on_two(annot_morph, i - prev_right_boundary)
            postag, annot_postag = split_on_two(annot_postag, i - prev_right_boundary)
            syntax_dep_tree, annot_syntax_dep_tree = split_on_two(annot_syntax_dep_tree, i - prev_right_boundary)

            chunk.update({
                'lemma': lemma,
                'morph': morph,
                'postag': postag,
                'ud_postag': postag,
                'syntax_dep_tree': syntax_dep_tree,
            })
            chunks.append(recount_sentences(chunk))

            prev_right_boundary = i  # number of last token in the last chunk

    if prev_right_boundary > -1:
        tail_text = annot_text[annot_tokens[prev_right_boundary].end:]
    else:
        # No paragraph break was found (or the document has no tokens), so the
        # single chunk covers the whole text. Indexing annot_tokens[-1] here
        # would start the slice after the LAST token, or fail on an empty list.
        tail_text = annot_text

    chunk = {
        'text': tail_text.strip(),
        'tokens': annot_tokens[prev_right_boundary + 1:],
        'lemma': annot_lemma,
        'morph': annot_morph,
        'postag': annot_postag,
        'ud_postag': annot_postag,
        'syntax_dep_tree': annot_syntax_dep_tree,
    }

    chunks.append(recount_sentences(chunk))
    return chunks
    

def annot2tags(annot, edus):
    """Tag every token of ``annot`` as the start of an EDU or not.

    Tokens are walked sentence by sentence while ``cursor`` walks ``edus``.
    A token opens the EDU under the cursor when that EDU's text starts with
    the token text and either no EDU has been opened in the current sentence
    yet, or the text accumulated since the previous EDU start has the same
    whitespace-free length as that previous EDU.

    Args:
        annot: Mapping with ``sentences`` (items with ``begin``/``end`` token
            indices) and ``tokens`` (items with a ``text`` attribute).
        edus: Sequence of EDU strings, in document order.

    Returns:
        A list with one list per sentence holding ``'BeginSeg=Yes'`` or ``'_'``
        for each token.
    """

    def squeezed_len(text):
        # Length with all whitespace removed.
        return len(''.join(text.strip().split()))

    tags = []
    cursor = 0
    accumulated = ''  # token text gathered since the last EDU start

    for span in annot['sentences']:
        sentence_tags = []
        previous_edu = ''  # reset per sentence: the first match always opens an EDU

        for token_idx in range(span.begin, span.end):
            token_text = prepare_token(annot['tokens'][token_idx].text).strip()
            starts_segment = False

            # Once every EDU has been consumed, the remaining tokens are left untagged.
            if cursor != len(edus):
                current_edu = prepare_token(edus[cursor]).strip()

                if (current_edu.startswith(token_text)
                        and (not previous_edu
                             or squeezed_len(accumulated) == squeezed_len(previous_edu))):
                    starts_segment = True
                    accumulated = token_text
                    previous_edu = current_edu
                    cursor += 1
                else:
                    accumulated += token_text

            sentence_tags.append('BeginSeg=Yes' if starts_segment else '_')

        tags.append(sentence_tags)

    return tags

In [None]:
from utils.train_test_split import split_rstreebank, split_essays

def _report_split(train_part, dev_part, test_part, train_label='Train length:'):
    """Print the number of files in each split on one line."""
    print(train_label, len(train_part), 'Dev length:', len(dev_part),
          'Test length:', len(test_part), '(files)')


print('Loading RSTreebank:')
train, dev, test = split_rstreebank('./data_ru')
_report_split(train, dev, test)

print('\nLoading Essays:')
train_dep, dev_dep, test_dep = split_essays('./dep_data')
_report_split(train_dep, dev_dep, test_dep)

# Extend the treebank splits in place with the essay splits.
train += train_dep
dev += dev_dep
test += test_dep
_report_split(train, dev, test, train_label='\nOverall\nTrain length:')

In [None]:
import os
from glob import glob
from tqdm import tqdm
from utils.file_reading import read_annotation, read_edus
import re

import shutil

output_dir = 'models/segmenter'

# Start from a clean output directory.
# WARNING: this removes everything under ``output_dir``, including any model
# directories stored there.
# ``ignore_errors`` keeps the first run (nothing to delete yet) quiet, and
# ``makedirs`` also creates the parent ``models/`` directory when it is missing.
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(output_dir, exist_ok=True)

TRAIN_FILE = os.path.join(output_dir, 'rus.rst.rrt_train.conll')
DEV_FILE = os.path.join(output_dir, 'rus.rst.rrt_dev.conll')
TEST_FILE = os.path.join(output_dir, 'rus.rst.rrt_test.conll')
MAX_LEN = 500  # maximum number of tokens written for one sentence

def preprocess(files, subset='train'):
    """Write one data split as tab-separated token lines with a segment tag.

    For every file the annotation and its EDUs are loaded, the document is
    split into paragraphs, and each line produced by ``converter`` gets the
    matching tag from ``annot2tags`` appended as the last column.

    Args:
        files: Iterable of file paths; a ``.edus`` suffix is removed to get
            the base path passed to ``read_annotation`` and ``read_edus``.
        subset: ``'train'`` writes to ``TRAIN_FILE``, ``'dev'`` to
            ``DEV_FILE``; any other value writes to ``TEST_FILE``.
    """
    print(f'preprocess {subset} set')

    if subset == 'train':
        output_file = TRAIN_FILE
    elif subset == 'dev':
        output_file = DEV_FILE
    else:
        output_file = TEST_FILE

    # (trigger substring, regex, replacement), applied in this order to each token line.
    # NOTE(review): the first pattern is greedy and removes everything from the
    # first space up to the LAST tab of the line -- confirm this is intended.
    cleanups = (
        (' ', r' .*\t', '\t'),
        ('www', r'www[^\t]*', '_html_'),
        ('http', r'http[^ \t]*', '_html_'),
    )

    with open(output_file, 'w') as fo:
        for edus_path in tqdm(files):
            doc_path = edus_path.replace('.edus', '')
            annot = read_annotation(doc_path)
            edus = read_edus(doc_path)
            doc_id = doc_path.replace('data/', '')
            last_edu = 0  # EDUs already matched by tokens written so far

            paragraphs = split_by_paragraphs(  # self,
                annot['text'],
                annot['tokens'],
                annot['lemma'],
                annot['morph'],
                annot['postag'],
                annot['syntax_dep_tree'])

            for chunk in paragraphs:
                sentence = 0
                token = 0
                chunk['text'] = annot['text']
                # Only the EDUs not consumed by earlier paragraphs are offered.
                tags = annot2tags(chunk, edus[last_edu:])

                for string in converter(doc_id, chunk):
                    if string.startswith('# newdoc id ='):
                        sentence = 0
                        token = 0
                        fo.write(string + '\n')

                    elif string == '\n':
                        # Sentence separator: move on to the next row of tags.
                        fo.write(string)
                        sentence += 1
                        token = 0

                    else:
                        for trigger, pattern, replacement in cleanups:
                            if trigger in string:
                                string = re.sub(pattern, replacement, string)

                        string = prepare_token(string)
                        tag = tags[sentence][token]
                        fo.write(string + '\t' + tag + '\n')

                        if tag != '_':
                            last_edu += 1

                        token += 1

                    if token == MAX_LEN:
                        print(doc_path + ' ::: occured very long sentence; truncate to ' + str(MAX_LEN) + ' tokens.')
                        fo.write('\n')
                        # NOTE(review): this leaves the converter loop, so the
                        # rest of the current paragraph is skipped as well.
                        break


preprocess(train, subset='train')
preprocess(dev, subset='dev')
preprocess(test, subset='test')

Since we will use the CoNLL-2003 dataset reader, the data must be converted to that format from the CoNLL-U-style files we have now.

In [None]:
# Convert every split to the four-column layout: FORM, 'O', 'O', segment label.
for conll_path in (TRAIN_FILE, DEV_FILE, TEST_FILE):
    rows = []
    with open(conll_path, 'r') as src:
        for line in src:
            if line.startswith('# newdoc id ='):
                # Document headers are dropped.
                #rows.append('-DOCSTART-\t-X-\t-X-\tO')
                continue

            if line == '\n':
                rows.append(line)
                continue

            # Every token line must have exactly ten tab-separated fields.
            (_tokid, form, _lemma, _upos, _xpos, _feats,
             _head, _deprel, _deps, segmenttag) = line.split('\t')
            label = 'B-S' if segmenttag.strip() == 'BeginSeg=Yes' else 'O'
            rows.append('\t'.join([form, 'O', 'O', label]))

    target_path = conll_path.replace('.conll', '.conll2003')
    with open(target_path, 'w') as dst:
        dst.write('\n'.join(rows))

### Configs 1-5 (Outdated) 

1. Baseline model (BERT-M)

In [None]:
%%writefile models/segmenter/bertM.jsonnet


// Configuration for a named entity recognition model based on:
//   Peters, Matthew E. et al. “Deep contextualized word representations.” NAACL-HLT (2018).
{
  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      "bert": {
          "type": "bert-pretrained",
          "pretrained_model": std.extVar("BERT_VOCAB"),
          "do_lowercase": false,
          "use_starting_offsets": true
      },
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      }
    }
  },
  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
  "validation_data_path": std.extVar("TEST_A_PATH"),
  "model": {
    "type": "simple_tagger",
    "text_field_embedder": {
        "allow_unmatched_keys": true,
        "embedder_to_indexer_map": {
            "bert": ["bert", "bert-offsets"],
            "token_characters": ["token_characters"],
        },
        "token_embedders": {
            "bert": {
                "type": "bert-pretrained",
                "pretrained_model": std.extVar("BERT_WEIGHTS")
            },
            "token_characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 16
                },
                "encoder": {
                    "type": "cnn",
                    "embedding_dim": 16,
                    "num_filters": 128,
                    "ngram_filter_sizes": [3],
                    "conv_layer_activation": "relu"
                }
            }
        }
    },
    "encoder": {
        "type": "lstm",
        "input_size": 768 + 128,
        "hidden_size": 100,
        "num_layers": 1,
        "dropout": 0.5,
        "bidirectional": true
    },
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
    "optimizer": {
        "type": "bert_adam",
        "lr": 0.001
    },
    "num_serialized_models_to_keep": 3,
    "num_epochs": 10,
    "grad_norm": 5.0,
    "patience": 3,
    "cuda_device": 1
  }
}


2. CRF model (BERT-M)

In [None]:
%%writefile models/segmenter/bertM_crf.jsonnet

// Configuration for a named entity recognition model based on:
//   Peters, Matthew E. et al. “Deep contextualized word representations.” NAACL-HLT (2018).
{
  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      "bert": {
          "type": "bert-pretrained",
          "pretrained_model": "bert-base-multilingual-cased",
          "do_lowercase": false,
          "use_starting_offsets": true
      },
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      }
    }
  },
  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
  "validation_data_path": std.extVar("TEST_A_PATH"),
  "model": {
    "type": "crf_tagger",
    "dropout": 0.2,
    "calculate_span_f1": true,
    "label_encoding": "BIOUL",
    "text_field_embedder": {
        "allow_unmatched_keys": true,
        "embedder_to_indexer_map": {
            "bert": ["bert", "bert-offsets"],
            "token_characters": ["token_characters"],
        },
        "token_embedders": {
            "bert": {
                "type": "bert-pretrained",
                "pretrained_model": "bert-base-multilingual-cased",
            },
            "token_characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 16
                },
                "encoder": {
                    "type": "cnn",
                    "embedding_dim": 16,
                    "num_filters": 128,
                    "ngram_filter_sizes": [3],
                    "conv_layer_activation": "relu",
                },
                "dropout": 0.2,
            },
        }
    },
    "encoder": {
        "type": "lstm",
        "input_size": 768 + 128,
        "hidden_size": 100,
        "num_layers": 1,
        "dropout": 0.5,
        "bidirectional": true
    },
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
    "optimizer": {
        "type": "bert_adam",
        "lr": 0.001
    },
    "validation_metric": "+f1-measure-overall",
    "num_serialized_models_to_keep": 3,
    "num_epochs": 10,
    "grad_norm": 5.0,
    "patience": 3,
    "cuda_device": 1
  }
}


2. CRF model (ELMo)

ELMo embedder: Place ``model.hdf5`` and ``options.json`` files from ``http://vectors.nlpl.eu/repository/20/195.zip`` in ``models/rsv_elmo/`` folder.

In [None]:
%%writefile models/segmenter/elmo.jsonnet

// Configuration for the NER model with ELMo, modified slightly from
// the version included in "Deep Contextualized Word Representations",
// taken from AllenNLP examples
// modified for the disrpt discourse segmentation shared task -- 2019 
{
  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
      "elmo": {
        "type": "elmo_characters"
     }
    }
  },
  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
  "validation_data_path": std.extVar("TEST_A_PATH"),
  "model": {
    "type": "crf_tagger",
    "dropout": 0.2,
    "calculate_span_f1": true,
    "label_encoding": "BIOUL",
    "text_field_embedder": {
      "token_embedders": {
        "elmo":{
            "type": "elmo_token_embedder",
            "options_file": "rsv_elmo/options.json",
            "weight_file": "rsv_elmo/model.hdf5",
            "do_layer_norm": false,
            "dropout": 0.0
        },
        "token_characters": {
            "type": "character_encoding",
            "embedding": {
                "embedding_dim": 16
            },
            "encoder": {
                "type": "cnn",
                "embedding_dim": 16,
                "num_filters": 128,
                "ngram_filter_sizes": [3],
                "conv_layer_activation": "relu"
            },
            "dropout": 0.2
        }
      }
    },
    "encoder": {
      "type": "lstm",
      "input_size": 1024+128,
      "hidden_size": 100,
      "num_layers": 1,
      "dropout": 0.5,
      "bidirectional": true
    },
    "regularizer": [
      [
        "scalar_parameters",
        {
          "type": "l2",
          "alpha": 0.01,
        }
      ]
    ]
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
    "optimizer": {
        "type": "adam",
        "lr": 0.001
    },
    "validation_metric": "+f1-measure-overall",
    "num_serialized_models_to_keep": 3,
    "num_epochs": 10,
    "grad_norm": 5.0,
    "patience": 3,
    "cuda_device": 1
  }
}

3. CRF model (ELMo+fastText)

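fastText embedder: place ``http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec`` in ``models/``. Note that the configs below reference ``ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec`` as ``pretrained_file``; make sure the downloaded file name matches that value (or adjust the configs accordingly).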

In [None]:
%%writefile models/segmenter/elmo_ft.jsonnet

// Configuration for the NER model with ELMo, modified slightly from
// the version included in "Deep Contextualized Word Representations",
// taken from AllenNLP examples
// modified for the disrpt discourse segmentation shared task -- 2019 
{

  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      "tokens": {
        "type": "single_id",
        "lowercase_tokens": false
      },
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
      "elmo": {
        "type": "elmo_characters"
     }
    }
  },
  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
  "validation_data_path": std.extVar("TEST_A_PATH"),
  "model": {
    "type": "crf_tagger",
    "dropout": 0.2,
    "calculate_span_f1": true,
    "label_encoding": "BIOUL",
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
            "type": "embedding",
            "embedding_dim": 300,
            "pretrained_file": "ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec",
            "trainable": false
        },
        "elmo":{
            "type": "elmo_token_embedder",
            "options_file": "rsv_elmo/options.json",
            "weight_file": "rsv_elmo/model.hdf5",
            "do_layer_norm": false,
            "dropout": 0.0
        },
        "token_characters": {
            "type": "character_encoding",
            "embedding": {
                "embedding_dim": 16
            },
            "encoder": {
                "type": "cnn",
                "embedding_dim": 16,
                "num_filters": 128,
                "ngram_filter_sizes": [3],
                "conv_layer_activation": "relu"
            },
            "dropout": 0.25
        }
      }
    },
    "encoder": {
      "type": "lstm",
      "input_size": 1024+128+300,
      "hidden_size": 100,
      "num_layers": 2,
      "dropout": 0.5,
      "bidirectional": true
    },
    "regularizer": [
      [
        "scalar_parameters",
        {
          "type": "l2",
          "alpha": 0.01,
        }
      ]
    ]
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
    "optimizer": {
        "type": "adam",
        "lr": 0.001
    },
    "validation_metric": "+f1-measure-overall",
    "num_serialized_models_to_keep": 3,
    "num_epochs": 10,
    "grad_norm": 5.0,
    "patience": 3,
    "cuda_device": 1
  }
}

4. CRF model (ELMo + RuBERT)

RuBERT embedder: unpack ``http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz`` in ``models/``

In [None]:
%%writefile models/segmenter/elmo_rubert.jsonnet

// Configuration for the NER model with ELMo and RuBERT
{

  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      //"tokens": {
      //  "type": "single_id",
      //  "lowercase_tokens": true
      //},
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
      "elmo": {
        "type": "elmo_characters"
     },
      "bert": {
          "type": "bert-pretrained",
          "pretrained_model": std.extVar("BERT_VOCAB"),
          "do_lowercase": false,
          "use_starting_offsets": true
      },
    }
  },
  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
  "validation_data_path": std.extVar("TEST_A_PATH"),
  "model": {
    "type": "crf_tagger",
    "dropout": 0.2,
    "calculate_span_f1": true,
    "label_encoding": "BIOUL",
    "text_field_embedder": {
        "allow_unmatched_keys": true,
        "embedder_to_indexer_map": {
            "bert": ["bert", "bert-offsets"],
            "token_characters": ["token_characters"],
            "elmo": ["elmo"],
            "tokens": ["tokens"],
        },
      "token_embedders": {
        //"tokens": {
        //    "type": "embedding",
        //    "embedding_dim": 300,
        //    "pretrained_file": "ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec",
        //    "trainable": true
        //},
        "elmo":{
            "type": "elmo_token_embedder",
            "options_file": "rsv_elmo/options.json",
            "weight_file": "rsv_elmo/model.hdf5",
            "do_layer_norm": false,
            "dropout": 0.0
        },
        "bert": {
                "type": "bert-pretrained",
                "pretrained_model": std.extVar("BERT_WEIGHTS"),
                "requires_grad": true,
                "top_layer_only": false
            },
        "token_characters": {
            "type": "character_encoding",
            "embedding": {
                "embedding_dim": 16
            },
            "encoder": {
                "type": "cnn",
                "embedding_dim": 16,
                "num_filters": 128,
                "ngram_filter_sizes": [3],
                "conv_layer_activation": "relu"
            },
            "dropout": 0.2
        }
      }
    },
    "encoder": {
      "type": "lstm",
      "input_size": 1024+128+768,
      "hidden_size": 200,
      "num_layers": 2,
      "dropout": 0.5,
      "bidirectional": true
    },
    "regularizer": [
            [
                "scalar_parameters",
                {
                    "alpha": 0.01,
                    "type": "l2"
                }
            ]
    ]
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
        "optimizer": {
            "type": "bert_adam",
            "lr": 0.001
        },
    "validation_metric": "+f1-measure-overall",
    "num_serialized_models_to_keep": 3,
    "num_epochs": 10,
    "grad_norm": 5.0,
    "patience": 2,
    "cuda_device": 1
  }
}

5. RuBERT

In [None]:
%%writefile models/segmenter/rubert.jsonnet

// Configuration for the tagger with RuBERT (the ELMo parts are commented out below),
// derived from the ELMo NER configuration in the AllenNLP examples
// (the version included in "Deep Contextualized Word Representations"),
// modified for the disrpt discourse segmentation shared task -- 2019 
{

  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      //"tokens": {
      //  "type": "single_id",
      //  "lowercase_tokens": true
      //},
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
#       "elmo": {
#         "type": "elmo_characters"
#      },
      "bert": {
          "type": "bert-pretrained",
          "pretrained_model": std.extVar("BERT_VOCAB"),
          "do_lowercase": false,
          "use_starting_offsets": true
      },
    }
  },
  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
  "validation_data_path": std.extVar("TEST_A_PATH"),
  "model": {
    "type": "crf_tagger",
    "dropout": 0.2,
    "calculate_span_f1": true,
    "label_encoding": "BIOUL",
    "text_field_embedder": {
        "allow_unmatched_keys": true,
        "embedder_to_indexer_map": {
            "bert": ["bert", "bert-offsets"],
            "token_characters": ["token_characters"],
            "elmo": ["elmo"],
            "tokens": ["tokens"],
        },
      "token_embedders": {
        //"tokens": {
        //    "type": "embedding",
        //    "embedding_dim": 300,
        //    "pretrained_file": "ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec",
        //    "trainable": true
        //},
#         "elmo":{
#             "type": "elmo_token_embedder",
#             "options_file": "rsv_elmo/options.json",
#             "weight_file": "rsv_elmo/model.hdf5",
#             "do_layer_norm": false,
#             "dropout": 0.0
#         },
        "bert": {
                "type": "bert-pretrained",
                "pretrained_model": std.extVar("BERT_WEIGHTS"),
                "requires_grad": true,
                "top_layer_only": false
            },
        "token_characters": {
            "type": "character_encoding",
            "embedding": {
                "embedding_dim": 16
            },
            "encoder": {
                "type": "cnn",
                "embedding_dim": 16,
                "num_filters": 128,
                "ngram_filter_sizes": [3],
                "conv_layer_activation": "relu"
            },
            "dropout": 0.2
        }
      }
    },
    "encoder": {
      "type": "lstm",
      "input_size": 128+768,
      "hidden_size": 100,
      "num_layers": 1,
      "dropout": 0.5,
      "bidirectional": true
    },
    "regularizer": [
            [
                "scalar_parameters",
                {
                    "alpha": 0.01,
                    "type": "l2"
                }
            ]
    ]
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
        "optimizer": {
            "type": "bert_adam",
            "lr": 0.001
        },
    "validation_metric": "+f1-measure-overall",
    "num_serialized_models_to_keep": 3,
    "num_epochs": 10,
    "grad_norm": 5.0,
    "patience": 3,
    "cuda_device": 1
  }
}

### Configs 6-... (Brand new) 
Allennlp v2.9.3, etc.

In [None]:
%%writefile models/segmenter/rubert.jsonnet


// Pretrained transformer used by both the token indexer and the token embedder.
local BERT_MODEL = "DeepPavlov/rubert-base-cased";
// Data paths are relative (presumably to the models/ directory the command is run from).
local TRAIN_FILE = "segmenter/rus.rst.rrt_train.conll2003";
local DEV_FILE = "segmenter/rus.rst.rrt_dev.conll2003";

// Hyperparameters are read from external variables; trailing comments give example values.
local CNN_CHAR_HIDDEN = std.parseJson(std.extVar('CNN_CHAR_HIDDEN'));  // e.g. 128; hidden size of the character-level GRU encoder
local LSTM_HIDDEN = std.parseJson(std.extVar('LSTM_HIDDEN'));  // e.g. 100; not referenced in this config
local LR = std.parseJson(std.extVar('LR'));  // e.g. 0.001; optimizer learning rate

local NUM_EPOCHS = 30;

{

  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 30
      },
      "bert": {
          "type": "pretrained_transformer_mismatched",
          "model_name": BERT_MODEL,
          "max_length": 512,
      },
    }
  },
  "train_data_path": TRAIN_FILE,
  "validation_data_path": DEV_FILE,
  "model": {
    "type": "crf_tagger",
    "dropout": 0.2,
    "calculate_span_f1": true,
    "label_encoding": "BIOUL",
    "text_field_embedder": {
      "token_embedders": {
        "bert": {
                "type": "pretrained_transformer_mismatched",
                "model_name": BERT_MODEL,
                "train_parameters": true,
                "last_layer_only": false
            },
        "token_characters": {
            "type": "character_encoding",
            "embedding": {
                "embedding_dim": 16,
                "sparse": false,
                "vocab_namespace": "token_characters"
            },
            "encoder": {
                "type": "gru",
                "input_size": $.model.text_field_embedder.token_embedders.token_characters.embedding.embedding_dim,
                "hidden_size": CNN_CHAR_HIDDEN,
                "num_layers": 1,
                "bidirectional": true,
            },
            "dropout": 0.2
        }
      }
    },
    "encoder": {
        "type": "pass_through",
        "input_dim": 768+CNN_CHAR_HIDDEN*2,
    },
  },
  "data_loader": {
    "batch_sampler": {
        "type": "bucket",
        "batch_size": 8,
        "padding_noise": 0.0,
    },
  },
  "trainer": {
    "num_epochs": NUM_EPOCHS,
    "patience": 3,
    "grad_norm": 5.0,
    "validation_metric": "+f1-measure-overall",
    "cuda_device": 1,
    "optimizer": {
      "type": "adamw",
      "lr": LR,
      "parameter_groups": [[["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}]],
    },
  }
}

In [None]:
%%writefile models/segmenter/elmo_ft.jsonnet


local BERT_MODEL = "DeepPavlov/rubert-base-cased";
local TRAIN_FILE = "segmenter/rus.rst.rrt_train.conll2003";
local DEV_FILE = "segmenter/rus.rst.rrt_dev.conll2003";

local LSTM_HIDDEN = std.parseJson(std.extVar('LSTM_HIDDEN'));  // 100
local LR = std.parseJson(std.extVar('LR'));  // 0.001

local NUM_EPOCHS = 30;

{

  "dataset_reader": {
    "type": "conll2003",
    "tag_label": "ner",
    "coding_scheme": "BIOUL",
    "token_indexers": {
      "tokens": {
        "type": "single_id",
        "lowercase_tokens": true,
      },
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
      "elmo": {
        "type": "elmo_characters"
     },
    }
  },
  "train_data_path": TRAIN_FILE,
  "validation_data_path": DEV_FILE,
  "model": {
    "type": "crf_tagger",
    "dropout": 0.25,
    "calculate_span_f1": true,
    "label_encoding": "BIOUL",
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
            "type": "embedding",
            "embedding_dim": 300,
            "pretrained_file": "ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec",
            "trainable": false
        },
        "elmo":{
            "type": "elmo_token_embedder",
            "options_file": "rsv_elmo/options.json",
            "weight_file": "rsv_elmo/model.hdf5",
            "do_layer_norm": false,
            "dropout": 0.0
        },
        "token_characters": {
            "type": "character_encoding",
            "dropout": 0.25,
            "embedding": {
                "embedding_dim": 16
            },
            "encoder": {
                "type": "cnn",
                "conv_layer_activation": "relu",
                "embedding_dim": 16,
                "ngram_filter_sizes": [
                    3
                ],
                "num_filters": 64
            }
        },
      }
    },
    "encoder": {
        "type": "lstm",
        "input_size": 1024+$.model.text_field_embedder.token_embedders.tokens.embedding_dim+$.model.text_field_embedder.token_embedders.token_characters.encoder.num_filters,
        "hidden_size": LSTM_HIDDEN,
        "num_layers": 2,
        "dropout": 0.25,
        "bidirectional": true
    },
    "regularizer": {
        "regexes": [
            [
                "scalar_parameters",
                {
                    "alpha": 0.001,
                    "type": "l2"
                }
            ]
        ]
    },
  },
  "data_loader": {
    "batch_sampler": {
        "type": "bucket",
        "batch_size": 8,
        "padding_noise": 0.0,
    },
  },
  "trainer": {
    "num_epochs": NUM_EPOCHS,
    "patience": 3,
    "grad_norm": 5.0,
    "num_serialized_models_to_keep": 1,
    "validation_metric": "+f1-measure-overall",
    "cuda_device": 1,
    "optimizer": {
      "type": "adamw",
      "lr": LR,
    },
  }
}

In [None]:
%%writefile models/segmenter_params.json

[
  {
    "type": "int",
    "attributes": {
      "name": "LSTM_HIDDEN",
      "low": 64,
      "high": 256
    }
  },
  {
    "type": "float",
    "attributes": {
      "name": "LR",
      "low": 2e-4,
      "high": 2e-2,
      "log": true
    }
  }
]

In [None]:
%%writefile models/tune_segmenter.sh

# Hyper-parameter search for the segmenter config segmenter/${METHOD}.jsonnet.
# All paths are relative, so the script is presumably meant to be run from
# the models/ directory it is written to.

export METHOD=elmo_ft
export STUDY_NAME=segmenter_tuning_2

# Start from an empty serialization directory. `-f` and `-p` keep the script
# re-runnable: no error when the directory is missing on the first run or
# when optuna/ already exists on later runs.
rm -rf "optuna/segmenter_$METHOD"
mkdir -p "optuna/segmenter_$METHOD"

# optuna delete-study --study-name $STUDY_NAME
allennlp tune "segmenter/${METHOD}.jsonnet" segmenter_params.json --serialization-dir "optuna/segmenter_$METHOD" \
    --study-name "$STUDY_NAME" \
    --skip-if-exists \
    --metrics best_validation_f1-measure-overall \
    --direction maximize

In [None]:
def collect_optuna_results(path):
    """Print the best validation F1 of every Optuna trial found under *path*.

    Each ``trial_*`` sub-directory of *path* is expected to hold a
    ``metrics.json`` file; for every trial where that file can be read and
    contains ``best_validation_f1-measure-overall`` one line
    ``<trial dir> <score>`` is printed.  Trials without usable metrics
    (e.g. still running or crashed) are skipped silently.

    Args:
        path: Directory that contains the ``trial_*`` sub-directories.

    Returns:
        None; the results are only printed.
    """
    # Imported locally so the cell works regardless of earlier notebook
    # state: at file level `glob` is bound to the module, not the function.
    import json
    import os
    from glob import glob as list_paths

    # Sorted so the listing is stable between runs (glob order is arbitrary).
    for trial in sorted(list_paths(os.path.join(path, 'trial_*/'))):
        try:
            with open(os.path.join(trial, 'metrics.json')) as metrics_file:
                metrics = json.load(metrics_file)
            score = metrics['best_validation_f1-measure-overall']
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: a missing, unreadable or incomplete metrics file
            # just means this trial has nothing to report.
            continue
        print(trial, score)


collect_optuna_results('models/optuna/segmenter_elmo_ft/')

In [None]:
# Kernel-density plot of the best validation F1 reached by Optuna trials 5-13.
# The scores look hand-copied from the listing printed by
# collect_optuna_results — TODO confirm they are still current.
# [x] marks trial 8, the highest score in the list.
pd.Series(sorted([0.8663200891198907, # Trial 5
                  0.8804893636446136, # Trial 6
                  0.8804893636446136, # Trial 7
                  0.8928604329801436, # Trial 8 [x]
                  0.8899245036815606, # Trial 9
                  0.8914864242981554, # Trial 10
                  0.8900630329995792, # Trial 11
                  0.8923220102372698, # Trial 12
                  0.8894610998712026, # Trial 13
                 ])).plot(kind='density', bw_method=0.3, title='f1')

In [None]:
# Query the `segmenter_tuning_2` study (the STUDY_NAME used in
# tune_segmenter.sh) for its best hyper-parameters; run from models/.
! cd models && allennlp best-params --study-name segmenter_tuning_2

In [None]:
# Promote one Optuna trial to be the segmenter model used by the rest of the
# notebook (models/segmenter/elmo_ft).
best_trial = 'trial_8'  # chosen by hand; IPython expands $best_trial below
# Order matters: the old model directory is removed first so that `mv`
# renames the trial directory instead of moving it inside the old one.
! rm -r models/segmenter/elmo_ft/
! mv models/optuna/segmenter_elmo_ft/$best_trial models/segmenter/elmo_ft

In [None]:
# List the segmenter directory — presumably a sanity check after moving the
# selected trial into models/segmenter/elmo_ft.
! ls models/segmenter/

### Evaluation 

In [None]:
%%writefile models/eval_segmenter.sh
# usage:
# $ cd models
# $ sh eval_segmenter.sh {bert|elmo_ft}
#
# Runs the archived model segmenter/<name>/model.tar.gz over the dev and test
# files and stores the raw predictions next to the archive as
# predictions_dev.json / predictions_test.json.

# Without the model name every path below would silently collapse to
# "segmenter//...", so fail early with the usage line instead.
if [ -z "${1}" ]; then
    echo "usage: sh eval_segmenter.sh {bert|elmo_ft}" >&2
    exit 1
fi

export RESULT_DIR="${1}"
export DEV_FILE_PATH="rus.rst.rrt_dev.conll2003"
export TEST_FILE_PATH="rus.rst.rrt_test.conll2003"

allennlp predict --use-dataset-reader --cuda-device 0 --silent \
    --output-file "segmenter/${RESULT_DIR}/predictions_dev.json" "segmenter/${RESULT_DIR}/model.tar.gz" "segmenter/${DEV_FILE_PATH}"
allennlp predict --use-dataset-reader --cuda-device 0 --silent \
    --output-file "segmenter/${RESULT_DIR}/predictions_test.json" "segmenter/${RESULT_DIR}/model.tar.gz" "segmenter/${TEST_FILE_PATH}"

In [None]:
import torch
from torch.nn.functional import softmax
import json


def load_predictions(path, threshold=None):
    """Read token-level segment labels from a JSON-lines predictions file.

    Every non-empty line of *path* is parsed as a JSON object.  Its ``tags``
    list (no threshold) or ``logits`` list (with threshold) is concatenated
    over all lines into one flat sequence.

    Args:
        path: Predictions file, one JSON object per line.
        threshold: ``None`` to use the tags stored in the file.  Otherwise a
            number: a token is labelled ``'B-S'`` when the softmax
            probability in column 1 of its logits is strictly greater than
            *threshold*.  ``0`` is a valid threshold.

    Returns:
        A flat list of ``'B-S'`` / ``'O'`` labels.  Without a threshold,
        every stored tag other than ``'O'`` is mapped to ``'B-S'``.
    """
    # `is not None` rather than truthiness, so threshold=0 still selects the
    # probability-based branch.
    use_threshold = threshold is not None
    field = 'logits' if use_threshold else 'tags'

    values = []
    with open(path, 'r') as file:
        for line in file:
            if line.strip():
                values += json.loads(line)[field]

    if not use_threshold:
        return [tag if tag == 'O' else 'B-S' for tag in values]

    if not values:
        # Nothing predicted: an empty tensor has no column 1 to index.
        return []

    probs = softmax(torch.tensor(values), dim=-1)
    # NOTE(review): assumes column 1 of the logits is the segment-boundary
    # class — confirm against the model's label vocabulary.
    is_boundary = (probs[:, 1] > threshold).tolist()
    return ['B-S' if flag else 'O' for flag in is_boundary]

# Model sub-directory under models/segmenter/ whose predictions are evaluated.
RESULT_DIR = 'elmo_ft'
# Gold CoNLL-2003 style files the predictions are compared against.
DEV_FILE_PATH = 'models/segmenter/rus.rst.rrt_dev.conll2003'
TEST_FILE_PATH = 'models/segmenter/rus.rst.rrt_test.conll2003'

In [None]:
from sklearn.metrics import classification_report, f1_score, recall_score
import numpy as np

#### On dev 

In [None]:
# Gold dev labels: the last whitespace-separated column of every non-empty
# line.  The file is opened in a `with` block so the handle is closed
# instead of being left to the garbage collector.
with open(DEV_FILE_PATH, 'r') as gold_file:
    true = [line.strip().split()[-1] for line in gold_file if line.strip()]

# Scan probability thresholds from 1.0 downwards in steps of 0.01 (to about
# 0.6) and keep the one with the highest F1 for the 'B-S' class.  The strict
# `>` means that on ties the highest threshold wins.
best_f1 = 0.
best_threshold = 1.
for threshold in tqdm(np.arange(1., 0.6, -0.01)):
    pred = load_predictions(f'models/segmenter/{RESULT_DIR}/predictions_dev.json', threshold=threshold)
    f1 = f1_score(true, pred, pos_label='B-S')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold
print(best_f1, ':::', best_threshold)

In [None]:
# Same threshold scan as for F1, but maximising recall of the 'B-S' class.
# Relies on `true` (gold dev labels) from the previous cell.
# NOTE(review): this reuses the name `best_threshold`, so the value selected
# by the F1 sweep is overwritten once this cell runs.
# NOTE(review): lowering the threshold can only add 'B-S' predictions, so
# recall never decreases along the scan and the winner is expected to sit
# near the low end of the range — confirm this is the intended criterion.
best_recall = 0.
best_threshold = 1.
for threshold in tqdm(np.arange(1., 0.6, -0.01)):
    pred = load_predictions(f'models/segmenter/{RESULT_DIR}/predictions_dev.json', threshold=threshold)
    recall = recall_score(true, pred, pos_label='B-S')
    if recall > best_recall:
        # strict comparison: on ties the earlier (higher) threshold is kept
        best_recall = recall
        best_threshold = threshold
print(best_recall, ':::', best_threshold)

In [None]:
# NOTE(review): every other cell addresses this directory as
# models/segmenter/; the ../../ prefix looks like a leftover from a different
# working directory — confirm or drop.
! ls ../../models/segmenter/

In [None]:
# Dev-set report based on the tags stored in the predictions file (no
# probability threshold is applied here).
pred = load_predictions(f'models/segmenter/{RESULT_DIR}/predictions_dev.json')

# `true` is cut to len(pred) so both sequences have the same length; note
# that this also hides any mismatch where the gold file is longer.
print(classification_report(true[:len(pred)], pred, digits=4))

#### On test

In [None]:
# Gold test labels: the last whitespace-separated column of every non-empty
# line.  Opened in a `with` block so the file handle is closed right away.
with open(TEST_FILE_PATH, 'r') as gold_file:
    true = [line.strip().split()[-1] for line in gold_file if line.strip()]
# Model tags for the test set (no probability threshold).
pred = load_predictions(f'models/segmenter/{RESULT_DIR}/predictions_test.json')

# The two lengths should match; a difference means gold tokens and
# predictions are not aligned one to one.
print('length of true:', len(true))
print('length of pred:', len(pred))

In [None]:
# classification_report is already imported a few cells above; the repeated
# import only makes this cell runnable on its own.
from sklearn.metrics import classification_report

# Test-set report; `true` is cut to len(pred), which hides a longer gold file.
print(classification_report(true[:len(pred)], pred, digits=4))

### Playground 

In [None]:
from allennlp.predictors import Predictor

# Load the selected segmenter archive as a predictor.
# NOTE(review): cuda_device=1 hard-codes a specific GPU index — adjust for
# machines with a different device layout.
predictor = Predictor.from_path('models/segmenter/elmo_ft/model.tar.gz', cuda_device=1)

In [None]:
# Tag a pre-tokenised (space-separated) sample text with the predictor as
# loaded, i.e. before its tokenizer is replaced further below.
sentence = "Тогда пришлите мне тексты . Я соберу парсер с уже дообученными сегментацией и классификацией отношений и размечу предложения до конца недели"
tags = predictor.predict(sentence)["tags"]
# Pair every whitespace token with its tag; zip stops at the shorter
# sequence, so extra tags or tokens are dropped silently if the predictor
# tokenises differently from str.split().
tuple(zip(sentence.split(), tags))

In [None]:
from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

In [None]:
# Replace the predictor's tokenizer with plain whitespace splitting —
# presumably so the tags line up with sentence.split() in the cells below.
# `_tokenizer` is a private attribute, so this may break across library versions.
predictor._tokenizer = WhitespaceTokenizer()

In [None]:
# Display the predictor's parameters — presumably to inspect the effect of
# the tokenizer override (private API; TODO confirm it is still needed).
predictor._to_params()

In [None]:
# Same sample text as in the earlier playground cell, tagged again after the
# predictor's tokenizer has been set to WhitespaceTokenizer.
sentence = "Тогда пришлите мне тексты . Я соберу парсер с уже дообученными сегментацией и классификацией отношений и размечу предложения до конца недели"
tags = predictor.predict(sentence)["tags"]
# Pair every whitespace token with its predicted tag for display.
tuple(zip(sentence.split(), tags))