In [1]:
PARAMS = {
    "name" : "default",

    "n_runs"      : 1,
    "n_units"     : [128],
    "cfg_files"   : ["sql_full_XL.pcfg"],
    "n_tup_test"  : [128],

    "n_epochs"    : 50,
    "n_layers"    : [1],
    "batch_size"  : 128,
    "window_size" : 30,
    "val_split"   : 0.1,
    "patience"    : 3,
    "max_seq_size": 350,
    "n_seq_train" : 128,
    "n_seq_test"  : 100,
    "test_step"   : 5,
    "T_max"       : 1800,
    "DNI" : [
        {
            "version": "vanilla",
            "metric": "log_regression"
        }
    ]
}

In [2]:
import warnings
import copy
import collections
import time, timeit
import random

import numpy as np
import nltk
from nltk import Tree
from nltk import PCFG,CFG
from nltk.grammar import is_terminal, is_nonterminal, ProbabilisticProduction, Nonterminal
from nltk.probability import DictionaryProbDist

In [135]:
FEAT_NAME_PREFIX='r_'

def check_rules(ruleset):
    pass
class Grammar:

    def __init__(self, pcfg_string=None, cfg_string=None,
                       pcfg_file = None, cfg_file = None, sep_char=''):
        # Reads files
        if pcfg_file is not None:
            with open(pcfg_file, 'r') as f:
                print('Loading', pcfg_file)
                pcfg_string = f.read()
        if cfg_file is not None:
            with open(cfg_file, 'r') as f:
                print('Loading', cfg_file)
                cfg_string = f.read()
        self.sep_char = sep_char
        self.pcfg_string = pcfg_string
        self.cfg_string  = cfg_string
        self.build_grammar()
        self.wrap_terminals()
        self.clean_grammar()
        self.build_parser()
        self.sort_non_term_nodes()

    def wrap_terminals(self):
        # Gets number of lhs per non terminal symbols
        ix = self.rule_index()
        len_ix = {lhs:len(rules) for lhs,rules in ix.items()}

        counter = 0
        term_index = {}
        new_rules = []

        for rule in self.grammar.productions():

            # If Nonterm := Term or Nontern := Nonterm rule, skip
            if len_ix[rule.lhs()]==1 and len(rule.rhs())==1:
                new_rules.append(rule)
                continue

            # Otherwise creates a new rule
            new_rhs = []
            for r in rule.rhs():
                if is_nonterminal(r) :
                    new_rhs.append(r)
                else:
                    if r not in term_index:
                        new_left = Nonterminal('symb_'+str(r))
                        prule = ProbabilisticProduction(new_left,
                                                        [r],
                                                        prob=1.0)
                        term_index[r]=prule
                    new_rhs.append(new_left)

            new_rule = ProbabilisticProduction(rule.lhs(),
                                               new_rhs,
                                               prob=rule.prob())
            new_rules.append(new_rule)

        new_rules += term_index.values()

        self.grammar = PCFG(self.grammar.start(), new_rules)

    def rule_index(self):
        rules = [r for r in self.grammar.productions() if r.prob() > 0]
        rule_index = collections.defaultdict(list)
        for r in rules:
            rule_index[r.lhs()].append(r)
        return rule_index

    def sort_non_term_nodes(self):
        # Creates rules index
        rule_index = self.rule_index()

        # Gets the symbols BFS
        visited = [self.grammar.start()]
        Q = collections.deque()
        Q.append(self.grammar.start())

        while len(Q) > 0:
            # Gets first symbol in queue
            symb = Q.popleft()
            # Fetches corresponding rules
            rules = rule_index[symb]
            for r in rules:
                for symb in r.rhs():
                    if is_nonterminal(symb) and not symb in visited:
                        Q.append(symb)
                        visited.append(symb)

        visited = list(reversed(visited))

        # Alternative method for checking correctness
        alt_visited = [S for S in self.symbols() if is_nonterminal(S)]
        assert set(visited) == set(alt_visited)

        self.sorted_non_term_nodes = visited

    def build_grammar(self):
        # Loads rules
        if self.pcfg_string is not None:
            self.grammar = PCFG.fromstring(self.pcfg_string)
        elif self.cfg_string is not None:
            self.load_cfg()
        n_rules = len(self.grammar.productions())
        print('Loaded grammar with', n_rules, 'rules')

    def clean_grammar(self):
        # Removes 0-proba rules
        print('Grammar rules before removing zero-rules:', \
                len(self.grammar.productions()))
        old_grammar = self.grammar

        # Creates rules index
        rule_index = self.rule_index()

        # BFS
        arules = set()
        visited = set()
        Q = collections.deque()
        Q.append(self.grammar.start())

        while len(Q) > 0:
            # Gets first symbol in queue
            symb = Q.popleft()
            # Fetches corresponding rules
            rules = rule_index[symb]
            for r in rules:
                arules.add(r)
                for symb in r.rhs():
                    if is_nonterminal(symb) and not symb in visited:
                        Q.append(symb)
                        visited.add(symb)

        # Creates new rules
        arules = list(arules)
        check_rules(arules)
        self.grammar = PCFG(old_grammar.start(), arules)
        print('Grammar rules after removing zero-rules:', \
                len(self.grammar.productions()))

    def n_rules(self):
        return len(self.grammar.productions())

    def n_features(self):
        return len(self.non_term_nodes())*2

    def build_parser(self):
        # Creates parser
        self.parser = nltk.EarleyChartParser(self.grammar)
        print ('Created parser')

    def load_cfg(self):
        # Creates deterministic grammar
        cfg_gram = CFG.fromstring(self.cfg_string)

        # Groups by LHS
        ggroups = collections.defaultdict(list)
        for rule in cfg_gram.productions():
            ggroups[rule.lhs()].append(rule)

        # For each group computes probabilities
        prules = []
        for lhs, ruleset in ggroups.iteritems():
            N = len(ruleset)
            assert N > 0
            for rule in ruleset:
                p = 1.0 / N
                prule = ProbabilisticProduction(rule.lhs(),
                                                rule.rhs(),
                                                prob=p)
                prules.append(prule)

        # Asserts that all probas add up to 1
        check_rules(prules)

        # Returns new grammar
        self.grammar = PCFG(cfg_gram.start(), prules)

    def symbols(self):
        symbols = set()
        for rule in self.grammar.productions():
            symbols.add(rule.lhs())
            for r in rule.rhs():
                symbols.add(r)
        return symbols

    def term_nodes(self):
        return [S for S in self.symbols() if is_terminal(S)]

    def non_term_nodes(self):
        return self.sorted_non_term_nodes

    def dump_to_file(self, file):
        '''
            Trims grammar to max N rules
            The RHS of the deleted tules will be transformed into terminals
        '''
        # In case already trimmed, reinstaures old grammar

        # Walks the grammar BFS style until found N nodes
        Q = collections.deque()
        Q.append(self.grammar.start())
        rules_visited = []
        explored_rhs = set()

        while len(Q) > 0:

            # Gets first symbol in queue
            symb = Q.popleft()
            # Fetches corresponding rules
            rules = self.grammar.productions(lhs=symb)

            for rule in rules:

                # Adds rule to collection
                if not rule in rules_visited:
                    rules_visited.append(rule)

                # Adds right-hand symbols to queue
                for rsymb in rule.rhs():
                    if is_nonterminal(rsymb) and not rsymb in explored_rhs:
                        Q.append(rsymb)
                        explored_rhs.add(symb)

        with open(file, 'w+') as f:
            for rule in rules_visited:
                f.write(str(rule) +'\n')

    def pruning_dag(self):
        name_to_ix_1 = {n:2*i for i,n in enumerate(self.non_term_nodes())}
        name_to_ix_2 = {n:2*i+1 for i,n in enumerate(self.non_term_nodes())}
        to_feat_1 = lambda n: name_to_ix_1[n]
        to_feat_2 = lambda n: name_to_ix_2[n]
        return pruning.FeatureDAG(self.grammar,
            [to_feat_1, to_feat_2], is_nonterminal)

        # # Materializes the predcedence graph between rules
        # tree = collections.defaultdict(set)
        # for r in self.grammar.productions():
        #     lf = to_node(r.lhs())
        #     for rhs in r.rhs():
        #         if is_nonterminal(rhs):
        #             rf = to_node(rhs)
        #             tree[lf].add(rf)

        # first_node = to_node(self.grammar.start)

        # return tree, first_node

    ############
    # Trimming #
    ############
    def freeze_symbol(self, node):
        symb = node.symbol()
        if self.sep_char is not None and self.sep_char != '':
            return [symb]
        else:
            return [c for c in symb] + [' ']

    def trim(self, N):
        '''
            Trims grammar to max N rules
            The RHS of the deleted tules will be transformed into terminals
        '''
        # In case already trimmed, reinstaures old grammar
        self.build_grammar()

        # Walks the grammar BFS style until found N nodes
        Q = collections.deque()
        Q.append(self.grammar.start())
        n = 0
        rules_to_keep = []
        explored_rhs = set()

        while n < N and len(Q) > 0:
            # Gets first symbol in queue
            symb = Q.popleft()
            # Fetches corresponding rules
            rules = self.grammar.productions(lhs=symb)
            for rule in rules:
                # Makes sure that there's space left
                if n >= N:
                    break
                # Adds rule to collection
                if not rule in rules_to_keep:
                    rules_to_keep.append(rule)
                    n += 1
                # Adds right-hand symbols to queue
                for rsymb in rule.rhs():
                    if is_nonterminal(rsymb) and not rsymb in explored_rhs:
                        Q.append(rsymb)
                # Marks symb as explored
                explored_rhs.add(symb)

        # Casts all the unresolved rules to terminals
        for i,rule in enumerate(rules_to_keep):
            new_rhs = []
            for symb in rule.rhs():
                if is_nonterminal(symb) and not symb in explored_rhs:
                    symb = self.freeze_symbol(symb)
                else:
                    symb = [symb]
                new_rhs.extend(symb)
            new_rule = ProbabilisticProduction(rule.lhs(),
                                               new_rhs,
                                               prob=rule.prob())
            rules_to_keep[i] = new_rule

        # Groups all rules that have the lhs
        # This will be useful for everything that follows
        rules_index = collections.defaultdict(list)
        for i,rule in enumerate(rules_to_keep):
            rules_index[rule.lhs()].append((i,rule))

        # Removes infinite loops
        for lhs, rules in rules_index.iteritems():

            # Checks if the rules contains ONLY recursive rules
            only_recurs = True
            i_last = None
            for ix,rule in rules:
                is_recurs = any(s==lhs for s in rule.rhs())
                only_recurs &= is_recurs
                if is_recurs: i_last = ix

            # If so, removes the recursivity in the last rule
            if only_recurs:
                last_r = rules_to_keep[i_last]
                new_rhs = []
                for symb in last_r.rhs():
                    symb = self.freeze_symbol(symb) if symb==lhs else [symb]
                    new_rhs.extend(symb)
                rules_to_keep[i_last] = ProbabilisticProduction(
                        last_r.lhs(), new_rhs, prob=last_r.prob())

        # Rebuilds index
        rules_index = collections.defaultdict(list)
        for i,rule in enumerate(rules_to_keep):
            rules_index[rule.lhs()].append((i,rule))

        # Corrects the probabilites so that they all sum up to 1
        for lhs, rules in rules_index.iteritems():
            for r in rules:
                tot_prob = sum(r.prob() for i,r in rules)
                if abs(tot_prob - 1.0) > PCFG.EPSILON:
                    for i,r in rules:
                        rules_to_keep[i] = ProbabilisticProduction(
                            r.lhs(), r.rhs(), prob=r.prob() / tot_prob)

        # Replaces the grammar
        check_rules(rules_to_keep)
        self.grammar = PCFG(self.grammar.start(), rules_to_keep)
        print('Trimmed grammar to', len(self.grammar.productions()), 'rules')
        self.build_parser()

    ####################################
    # Sequence generation and sampling #
    ####################################
    def sample_tree(self):
        global N_RECURS
        N_RECURS = 0

        def sample_from_lhs(symb):
            # Checks recursivity
            global N_RECURS
            N_RECURS += 1
            if N_RECURS > 5000:
                raise Exception('Too many recursions!')

            # Gets the prod rules with symb the left side
            all_rules = rule_index[symb]

            # Samples a rule
            distrib = {r:r.prob() for r in all_rules}
            if abs(sum(distrib[k] for k in distrib) - 1.0) > .01:
                print(distrib)
            rule = DictionaryProbDist(distrib).generate()

            # Appends to the tree
            tree = []
            for node in rule.rhs():
                if is_terminal(node):
                    tree.append(node)
                else:
                    subtree = sample_from_lhs(node)
                    tree.append(subtree)
            return tree

        S = self.grammar.start()
        rule_index = collections.defaultdict(list)
        for r in self.grammar.productions():
            rule_index[r.lhs()].append(r)
        return sample_from_lhs(S)

    def serialize_tree(self, tree):
        buf = []
        def consume(tree):
            if type(tree)==list:
                for subtree in tree:
                    consume(subtree)
            else:
                buf.append(tree)

        consume(tree)
        out = self.sep_char.join(buf)
        return out

    def generate_sequences(self, n_sequences, len_max=500,
                           padded_size=None, padding_char='~'):
        '''
            padded_size: None means no padding
                        -1 means padding to max sequence size
                        some integer n means padding to size n
                             (truncates if necessary)
        '''
        # Generates random sentences
        len_max = 10000000 if len_max is None else len_max
        out = []
        N = 0
        while len(out) < n_sequences:
            tree = self.sample_tree()
            seq = self.serialize_tree(tree)
            if len(seq) < len_max: out.append(seq)
            N += 1
            assert N < 10000000

        M = sum(len(s) for s in out)*1.0 / len(out)
        #print 'Avg sequence size:', M
        if padded_size is None:
            return out

        # Checks max size
        if 0 <= padded_size < M:
            warnings.warn('Padding size is smaller that longest sequence'
                          ' I will truncate!')
        if padded_size == -1:
            padded_size = M

        for i,s in enumerate(out):
            if len(s) > padded_size:
                s = s[:padded_size]
            else:
                s = s.rjust(padded_size, padding_char)
            out[i] = s

        return out

    #########################
    # Parsing-based Feature #
    #########################
    def tokenize(self, expr, skipchars=['~'], ignore_errors=False):
        # Non teriminals
        terminals = set(self.term_nodes())
        L = max(len(w) for w in terminals)
        def terminal(i, expr):
            for wid in reversed(range(1, L+1)):
                j = i + wid
                if j > len(expr):
                    continue
                w = expr[i:j]
                if w in terminals:
                    return w
            return None

        tokens = []
        index = []
        i = 0
        while i < len(expr):
            if not expr[i] in skipchars:
                term = terminal(i, expr)
                if term is None:
                    if not ignore_errors:
                        raise ValueError('Could not match token', expr[i:])
                    else:
                        return [], []
                tokens.append(term)
                index.append((i,len(term)))
                i += len(term)
            else:
                i += 1

        return tokens, index

    def parse(self, tokens):
        if len(tokens) == 0:
            return []
        parse = None
        for p in self.parser.parse(tokens):
            parse = p
            break
        return parse

    # Warning: returns word level features
    def parse_to_features(self, tree, binary=True):
        n_tokens = len(tree.leaves())
        nt_symbols = self.non_term_nodes()

        rule_feats = np.zeros((n_tokens, len(nt_symbols)))
        rule2feat = {s.symbol():i for i,s in enumerate(nt_symbols)}

        def visit(tree, offset):
            if isinstance(tree, Tree):
                n_tokens = 0
                for subtree in tree:
                    n_tokens += visit(subtree, offset + n_tokens)
                symb = tree.label()
                j = rule2feat[symb]
                if binary:
                    rule_feats[offset:offset+n_tokens, j] = 1
                else:
                    rule_feats[offset:offset+n_tokens, j] += 1
                return n_tokens
            else:
                return 1

        visit(tree, 0)
        return rule_feats

    def word_to_char_feat(self, seq, lex_index, w_feats):
        ch_len = len(seq)
        w_len  = w_feats.shape[0]
        n_feats = w_feats.shape[1]
        ch_feats = np.zeros((ch_len, n_feats))

        for i_w in range(w_len):
            f_w  = w_feats[i_w,...]
            i_ch,len_ch = lex_index[i_w]
            f_ch = np.tile(f_w, len_ch).reshape((len_ch, n_feats))
            ch_feats[i_ch:i_ch+len_ch,:] = f_ch

        return ch_feats

    def extract_parse_feats(self, seq, binary=True, verbose=False):
        # Tokenizes and parses
        start_t = timeit.default_timer()
        tokens, tok_index = self.tokenize(seq)
        parse_tree = self.parse(tokens)
        if verbose:
            print(len(tokens))
        # Converts to word-, then character-level features
        w_tree_features = self.parse_to_features(parse_tree)
        
        tree_features = self.word_to_char_feat(seq, tok_index, w_tree_features)
        stop_t = timeit.default_timer()
        runtime = stop_t - start_t
        return tree_features, runtime

    def detect_bounds(self, x):
        if len(x) < 2:
            return x
        x = np.int_(x)
        x2 = np.insert(x,0,0)
        start = x2[1:] - x2[:-1] > 0
        x2 = np.append(x,0)
        end = x2[:-1] - x2[1:] > 0
        return np.logical_or(start, end)


    def sub_parse_features(self, orig_sequences, prov_index, w_size,
                            include_impulses=True,
                            name_prefix=FEAT_NAME_PREFIX):

        # Actual features
        global PARSE_TIME
        PARSE_TIME = 0
        mem_cache = collections.OrderedDict()
        CACHE_LEN = float('inf')
        out = []
        symbols = self.non_term_nodes()
        for i_S, S in enumerate(symbols):

            # Creates the feature function
            # (default arg assignment prevent late binding)
            def feat1(seq, i_S=i_S, S=S.symbol()):

                # Retrieves mother sequence
                i_seq, i_start, i_end  = prov_index[seq]

                if i_seq in mem_cache:
                    # If sequence in cache, retrieves it
                    seq_feats = mem_cache[i_seq]

                else:
                    # otherwise compute tree ....
                    moth_seq = orig_sequences[i_seq]
                    seq_feats, runtime = self.extract_parse_feats(moth_seq)
                    global PARSE_TIME
                    PARSE_TIME += runtime

                    # ... and cache it
                    mem_cache[i_seq] = np.bool_(seq_feats)
                    if len(mem_cache) > CACHE_LEN:
                        cache.popitem(last=False)

                feat_vals = seq_feats[i_start:i_end,i_S]

                # If necesary, pads with zeroes
                L = i_end - i_start
                assert L == feat_vals.shape[0]
                if L < w_size:
                    tmp = np.zeros((w_size))
                    tmp[-L:] = feat_vals
                    feat_vals = tmp

                assert feat_vals.shape[0] == len(seq)
                feat_name = 'F_' + S
                return feat_vals, feat_name

            def feat2(seq, i_S=i_S, S=S.symbol()):

                # Retrieves mother sequence
                i_seq, i_start, i_end  = prov_index[seq]

                if i_seq in mem_cache:
                    # If sequence in cache, retrieves it
                    seq_feats = mem_cache[i_seq]

                else:
                    # otherwise compute tree ....
                    moth_seq = orig_sequences[i_seq]
                    seq_feats, runtime = self.extract_parse_feats(moth_seq)
                    global PARSE_TIME
                    PARSE_TIME += runtime

                    # ... and cache it
                    mem_cache[i_seq] = np.bool_(seq_feats)
                    if len(mem_cache) > CACHE_LEN:
                        cache.popitem(last=False)

                feat_vals = seq_feats[i_start:i_end,i_S]
                feat_vals = self.detect_bounds(feat_vals)

                # If necesary, pads with zeroes
                L = i_end - i_start
                assert L == feat_vals.shape[0]
                if L < w_size:
                    tmp = np.zeros((w_size))
                    tmp[-L:] = feat_vals
                    feat_vals = tmp

                assert feat_vals.shape[0] == len(seq)
                feat_name = 'B_' + S
                return feat_vals, feat_name


            if include_impulses:
                out += [feat1,feat2]
            else:
                out += [feat1]

        def get_parse_time():
            return PARSE_TIME

        def reset():
            mem_cache.clear()
            global PARSE_TIME
            PARSE_TIME = 0

        return out, get_parse_time, reset

    #######################
    # Token-based Feature #
    #######################
    def extract_token_feats(self, seq):
        tokens, tok_index = self.tokenize(seq)
        all_tokens = self.term_nodes()
        tok2feat = {f:i for i,f in enumerate(all_tokens)}

        # Generates the word-level features
        w_feats = np.zeros((len(tokens), len(all_tokens)))
        for i,tok in enumerate(tokens):
            j = tok2feat[tok]
            w_feats[i,j] = 1

        # Expands to char-level features
        feats = self.word_to_char_feat(seq, tok_index, w_feats)
        return feats

    def token_features(self, name_prefix=''):
        # Sets buffers up
        if hasattr(self, 'token_feats'):
            print('Reinitializing the token-based features buffer')
        self.token_feats = {}

        out = []
        symbols = self.term_nodes()
        for i_S, S in enumerate(symbols):

            # Creates the feature function
            # (default arg assignment prevent late binding)
            def feat(seq, i_S=i_S, S=S):
                # If not buffered, process the sequence
                if seq not in self.token_feats:
                    self.token_feats[seq] = self.extract_token_feats(seq)
                # Fetches the value
                feat_vals = self.token_feats[seq][:,i_S]
                feat_name = name_prefix + S
                return feat_vals, feat_name

            out.append(feat)

        return out

    ##############
    # Test model #
    ##############
    def test_model(self, model, batch_size, n_batches, max_len, char2int, w_size,
                start_chars=' SELECT', end_char='$', padding_char = '~'):
        # Generates sentences
        S = []
        start_chars = start_chars.rjust(w_size, padding_char)
        for _ in range(n_batches):
            S += generator.generate_sentences(
                start_chars, end_char, max_len, model, char2int, batch_size)
        print('Generated', len(S), 'sentences')

        # Parses
        def check(s):
            tokens, _ = self.tokenize(s,ignore_errors=True)
            parse = self.parse(tokens)
            if parse is not None and len(parse) > 0:
                return True
            else:
                return False

        n_ok = 0
        for s in S: n_ok += check(s)
        return n_ok * 1.0 / len(S)

In [4]:
def slide_window(sequences, w_size=-1, step_size=1, \
                 n_test_tuples=float('Inf'), pad_char='~'):
    if w_size == -1:
        w_size = max(len(S) for S in sequences)
    X = []
    y = []
    prov_index = {}
    for i_seq, seq in enumerate(sequences):
        steps = range(step_size, len(seq), step_size)
        if len(steps) == 0:
            steps = [len(seq)-1]
        for i in steps:
            end = i
            start = max(0, i-w_size)
            s = seq[start:end]
            assert len(s) <= w_size
            x = s.rjust(w_size, pad_char)
            assert len(x) == w_size
            X.append(x)
            y.append(seq[i])
            prov_index[x] = (i_seq, start, end)
            if len(X) > n_test_tuples:
                break
        if len(X) > n_test_tuples:
                break

    assert len(X) == len(y)
    X, y = adjust_to_batch(X,y)
    print('Will use', len(X), 'sentences')
    return X, y, prov_index, i_seq

def adjust_to_batch(X, y):
    assert len(X) == len(y)
    N = len(X)
    N -= N % PARAMS['batch_size']
    assert N > 0
    X = X[:N]
    y = y[:N]
    return X,y

def generate_seq(grammar, n_seq, n_tuples=None, step=None, w_size=None):
    if n_tuples is None:
        sequences = grammar.generate_sequences(
            n_seq, len_max=PARAMS['max_seq_size'])
    else:
        sequences = []
        n_tuples_gen = 0
        while n_tuples_gen <= n_tuples:
            seqs = grammar.generate_sequences(
                n_seq, len_max=PARAMS['max_seq_size'])
            sequences += seqs
            ntups = lambda s: max((len(s)-w_size)//n_seq, 1)
            n_gen = sum(ntups(s) for s in seqs)
            n_tuples_gen += n_gen
    print ('Done. Generated a total of:', len(sequences), 'sentences')
    return sequences

In [5]:
grammar = Grammar(pcfg_file='sql_full_XL.pcfg')
n_rules = grammar.n_rules()
n_feats = grammar.n_features()

# Generates original sequences
print('\n** Generates data')
# Fixed seed for repeatability
random.seed(55555)
n_seq_test = PARAMS["n_seq_test"]
n_test_tuples = PARAMS["n_tup_test"][0]

train_sequences = generate_seq(grammar, PARAMS['n_seq_train'])
test_sequences = generate_seq(grammar, n_seq_test, \
                                n_tuples=n_test_tuples,
                                step=PARAMS['test_step'],
                                w_size=PARAMS['window_size'])

train_from, train_to, _, _ = \
    slide_window(train_sequences, PARAMS['window_size'])
test_from, test_to, prov_index, last_seq = \
    slide_window(test_sequences, PARAMS['window_size'],
                 PARAMS['test_step'], n_test_tuples=n_test_tuples)

# Adjustes n test seqs
test_sequences = test_sequences[:last_seq+1]
print('Truncated test to', len(test_sequences), 'queries')

Loading sql_full_XL.pcfg
Loaded grammar with 198 rules
Grammar rules before removing zero-rules: 280
Grammar rules after removing zero-rules: 272
Created parser

** Generates data
Done. Generated a total of: 128 sentences
Done. Generated a total of: 200 sentences
Will use 14848 sentences
Will use 128 sentences
Truncated test to 5 queries


In [291]:
PARAMS['batch_size']

128

In [6]:
len(prov_index)

125

In [7]:
PARAMS['window_size']

30

In [None]:
char2int

In [478]:
def make_dict(sequences):
    chars = set()
    for s in sequences:
        for c in s:
            chars.add(c)
    return {c:i for i,c in enumerate(list(chars))}

char2int = make_dict(train_from + test_from+train_to +test_to)

In [480]:
char2int = {' ': 72, '$': 23, "'": 12, '(': 68, ')': 8, '*': 71, '+': 33, ',': 24, '-': 26, '.': 2,
 '/': 5, '0': 22, '1': 6, '2': 43, '3': 38, '4': 14, '5': 58, '6': 66, '7': 18, '8': 55, '9': 69,
 '<': 52, '=': 57, '>': 60, 'A': 1, 'B': 47, 'C': 9, 'D': 40, 'E': 62, 'F': 59, 'G': 21, 'H': 32,
 'I': 70, 'J': 56, 'L': 15, 'M': 50, 'N': 63, 'O': 7, 'P': 31, 'R': 27, 'S': 39, 'T': 51, 'U': 25,
 'V': 29, 'W': 49, 'Y': 20, 'a': 61, 'b': 64, 'c': 44, 'd': 37, 'e': 16, 'f': 54, 'g': 11, 'h': 28,
 'i': 67, 'j': 17, 'k': 46, 'l': 3, 'm': 13, 'n': 42, 'o': 65, 'p': 41, 'q': 34, 'r': 10, 's': 19, 
't': 36, 'u': 48, 'v': 53, 'w': 30, 'x': 4, 'y': 45, 'z': 35, '~': 0}

In [10]:
class TwoDimEncoders:

    @staticmethod
    def raw_to_encoded(seqs, seqs2=None, cust_char2int=None):

        if cust_char2int is None:
            all_chars = reduce(lambda chars,seq : set(chars) | set(seq), seqs)
            if seqs2 is not None:
                all_chars |= reduce(lambda chars,seq : set(chars) | set(seq),
                                    seqs2)
            all_chars = sorted(all_chars)

            char2int =  {c:i for i, c in enumerate(all_chars)}
            print "Total vocabulary len_sequence: ", len(all_chars)
        else:
            char2int = cust_char2int

        encoded_seqs = [[char2int[x] for x in seq] for seq in seqs]
        if seqs2 is None:
            return encoded_seqs, char2int

        encoded_seqs2 = [[char2int[x] for x in seq] for seq in seqs2]
        return encoded_seqs, char2int, encoded_seqs2

    @staticmethod
    def encoded_to_bin_tensor(enc_seqs, char2int, enc_seqs2=None,
                                start_at_min=False):

        min_x = min(char2int.values()) if start_at_min else 0
        n_chars = max(char2int.values()) - min_x + 1

        seq_len = len(enc_seqs[0])

        X = np.zeros((len(enc_seqs), seq_len, n_chars), dtype=np.int)
        for i, enc_seq in enumerate(enc_seqs):
            for j, x in enumerate(enc_seq):
                k = x - min_x
                X[i, j, k] = 1

        if enc_seqs2 is None:
            return X

        X2 = np.zeros((len(enc_seqs2), seq_len, n_chars), dtype=np.int)
        for i, enc_seq in enumerate(enc_seqs2):
            for j, x in enumerate(enc_seq):
                k = x - min_x
                X2[i, j, k] = 1

        return X, X2

    @staticmethod
    def raw_to_bin_tensor(seqs, seqs2=None, cust_char2int=None):
        encoding_out = TwoDimEncoders.raw_to_encoded(seqs, seqs2, cust_char2int)
        if len(encoding_out) == 2:
            encoded_seqs, char2int = encoding_out
            encoded_seqs2 = None
        else:
            encoded_seqs, char2int, encoded_seqs2 = encoding_out

        bin_tensors = TwoDimEncoders.encoded_to_bin_tensor(
            encoded_seqs, char2int, encoded_seqs2)

        if type(bin_tensors) is not tuple:
            return bin_tensors, char2int

        out = bin_tensors + (char2int,)
        return out

X_train, X_test, char2int = \
    TwoDimEncoders.raw_to_bin_tensor(\
        train_from, test_from, cust_char2int=char2int)
    
print('X matrices:', X_train.shape, X_test.shape)

class OneDimEncoders:

    @staticmethod
    def raw_to_encoded(seq, seq2=None, cust_char2int=None):

        if cust_char2int is None:
            all_chars = set(seq)
            if seq2 is not None:
                all_chars |= set(seq2)
            all_chars = sorted(all_chars)

            char2int =  {c:i for i, c in enumerate(all_chars)}
        else:
            char2int = cust_char2int

        encoded_seq = [char2int[x] for x in seq]
        if seq2 is None:
            return encoded_seq, char2int

        encoded_seq2 = [char2int[x] for x in seq2]
        return encoded_seq, char2int, encoded_seq2

    @staticmethod
    def encoded_to_bin_tensor(enc_seq, char2int, enc_seq2=None,
                                start_at_min=False, add_dim=False):

        min_x = min(char2int.values()) if start_at_min else 0
        n_chars = max(char2int.values()) - min_x + 1

        X = np.zeros((len(enc_seq), n_chars), dtype=np.int)
        for i, x in enumerate(enc_seq):
            j = x - min_x
            X[i, j] = 1

        if add_dim:
            X = X[:,np.newaxis,:]

        if enc_seq2 is None:
            return X

        X2 = np.zeros((len(enc_seq2), n_chars), dtype=np.int)
        for i, x in enumerate(enc_seq2):
            j = x - min_x
            X2[i, j] = 1

        if add_dim:
            X2 = X2[:,np.newaxis,:]

        return X, X2

    @staticmethod
    def raw_to_bin_tensor(seq, seq2=None, cust_char2int=None, add_dim=False):
        encoding_out = OneDimEncoders.raw_to_encoded(seq, seq2, cust_char2int)
        if len(encoding_out) == 2:
            encoded_seq, char2int = encoding_out
            encoded_seq2 = None
        else:
            encoded_seq, char2int, encoded_seq2 = encoding_out

        bin_tensors = OneDimEncoders.encoded_to_bin_tensor(
            encoded_seq, char2int, encoded_seq2, add_dim=add_dim)

        if type(bin_tensors) is not tuple:
            return bin_tensors, char2int

        out = bin_tensors + (char2int,)
        return out

    
y_train, y_test, char2int = \
    OneDimEncoders.raw_to_bin_tensor(\
        train_to, test_to, cust_char2int=char2int)
print('y matrices:', y_train.shape, y_test.shape)

X matrices: (14848, 30, 73) (128, 30, 73)
y matrices: (14848, 73) (128, 73)


In [11]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM,Lambda,Input
from keras.callbacks import ModelCheckpoint,EarlyStopping

from keras.models import load_model
from os import path


def vanilla_LSTM(X, y, n_states, n_layers = 1):
    in_dim = X.shape[1:]
    out_dim = y.shape[1]

    model = Sequential()
    model.add(LSTM(n_states, return_sequences=True,input_shape=in_dim))
    for _ in range(n_layers - 1):
        model.add(LSTM(n_states, return_sequences=True))
    model.add(Lambda(lambda x: x[:,-1, :]))
    model.add(Dense(out_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    return model

def load_or_fit(cache, model, X_train, y_train,
                batch_size, n_epochs, validation_split, patience=5):

    if path.isfile(cache):
        model = load_model(cache)
    else:
        fit_model(cache, model, X_train, y_train,
            batch_size, n_epochs, validation_split, patience)
    return model

def fit_model(cache, model, X_train, y_train,
              batch_size, n_epochs, validation_split, patience=5):
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience)
    checkpointer = ModelCheckpoint(filepath=cache, monitor='val_loss',
                                   verbose=1, save_best_only=True)
    model.fit(X_train, y_train,
                batch_size       = batch_size,
                epochs           = n_epochs,
                validation_split = validation_split,
                callbacks        = [early_stopping, checkpointer],
                verbose          = 2)
    return model

Using TensorFlow backend.


In [12]:
print(X_train.shape)
print(y_train.shape)

from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM,Lambda,Input
from keras.callbacks import ModelCheckpoint,EarlyStopping
    
model = vanilla_LSTM(X_train, y_train,  PARAMS['n_units'][0],  PARAMS['n_layers'][0])

(14848, 30, 73)
(14848, 73)
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [13]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

import os
if not os.path.exists('track_history'):
    os.makedirs('track_history')

model.save('track_history/zero_model.h5')

checkpointer = ModelCheckpoint(filepath='track_history/models-{epoch:02d}-{val_loss:.2f}.hdf5', 
                               monitor='val_loss', save_best_only=True)

model.fit(X_train, y_train,
        batch_size       = PARAMS['batch_size'],
        epochs           = 3,
        validation_split = 0.1,
        callbacks        = [early_stopping, checkpointer],
        verbose          = 1)


Train on 13363 samples, validate on 1485 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f9d404fd828>

In [14]:
len(X_train)

14848

In [15]:
model.predict(X_train[1:1000])

array([[1.55200705e-05, 5.96504658e-04, 7.01373792e-05, ...,
        1.72621862e-03, 8.19365887e-05, 2.43108720e-03],
       [2.32807161e-05, 4.36387578e-04, 7.95411615e-05, ...,
        2.11861986e-03, 6.47683337e-05, 4.47439263e-03],
       [2.86140730e-05, 2.91232689e-04, 1.32407222e-04, ...,
        4.20412235e-03, 7.59399045e-05, 1.02809006e-02],
       ...,
       [5.63758767e-05, 5.41349547e-03, 1.03741422e-01, ...,
        1.92843180e-03, 1.04891807e-02, 7.82646686e-02],
       [5.72857643e-05, 6.55220915e-03, 1.04130030e-01, ...,
        1.79147313e-03, 1.03560546e-02, 9.09508541e-02],
       [7.30799075e-05, 8.02156795e-03, 8.57042000e-02, ...,
        1.99020025e-03, 8.93536024e-03, 1.02089636e-01]], dtype=float32)

In [16]:
def test_models(models, X_test, y_test, gram, char2int):
    for i,model in enumerate(models):
        model.reset_states()
        score = model.evaluate(X_test, y_test,
                               batch_size=PARAMS['batch_size'],
                               verbose=2)
        print('Test accuracy:', score[1])
        

test_models([model], X_test, y_test, grammar, char2int)

Test accuracy: 0.3359375


In [30]:
grammar = Grammar(pcfg_file='sql_full_XL.pcfg')

Loading sql_full_XL.pcfg
Loaded grammar with 198 rules
Grammar rules before removing zero-rules: 280
Grammar rules after removing zero-rules: 272
Created parser


In [32]:
pcfg_string = None
with open('sql_full_XL.pcfg', 'r') as f:
    pcfg_string = f.read()
g = PCFG.fromstring(pcfg_string)

In [33]:
g

<Grammar with 198 productions>

In [20]:
rules = [r for r in grammar.productions() if r.prob() > 0]

In [21]:
rule_index = collections.defaultdict(list)

In [22]:
for r in rules:
    rule_index[r.lhs()].append(r)

In [23]:
type(rule_index)

collections.defaultdict

In [34]:
a,b,c = grammar.sub_parse_features(test_sequences, prov_index, PARAMS['window_size'], include_impulses=False)

In [35]:
c()

In [36]:
a

[<function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=0, S='symb_OUTER')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=1, S='symb_FULL')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=2, S='symb_RIGHT')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=3, S='symb_LEFT')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=4, S='symb_INNER')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=5, S='symb_NOT')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=6, S='OUTER')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=7, S='FULL')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=8, S='RIGHT')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=9, S='LEFT')>,
 <function __main__.Grammar.sub_parse_features.<locals>.feat1(seq, i_S=10, S='symb_JOIN')>,
 <function __m

In [37]:
k = grammar.non_term_nodes()

In [38]:
k[0].symbol()

'symb_OUTER'

In [39]:
prov_index[test_from[127]]

(4, 115, 145)

In [40]:
len(test_from)

128

In [136]:
grammar = Grammar(pcfg_file='sql_full_XL.pcfg')
a,b = grammar.extract_parse_feats(test_sequences[0],verbose=True)

Loading sql_full_XL.pcfg
Loaded grammar with 198 rules
Grammar rules before removing zero-rules: 280
Grammar rules after removing zero-rules: 272
Created parser
197


In [112]:
len(a[0])

171

In [215]:
a.shape

(274, 171)

In [61]:
b

0.51073052799984

In [44]:
prov_index

{' )  SELECT pqohm.nl65p  AS bpx': (0, 20, 50),
 ' AND ..false..<>.fj. ORDER BY ': (0, 80, 110),
 ' AS u9z.* xvl9 AS y7 WHERE g2m': (2, 20, 50),
 ' AS zlall FULL JOIN rehgis AS ': (4, 65, 95),
 ' FROM )  SELECT pqohm.nl65p  A': (0, 15, 45),
 ' JOIN l5  ON q3<=to4uao.tc WHE': (0, 210, 240),
 ' ORDER BY ff DESC ) AS r08z un': (0, 100, 130),
 ' SELECT *, ix.* FROM )  SELECT': (0, 0, 30),
 ' SELECT 38.22300, opbu.*, tlf.': (1, 0, 30),
 ' SELECT ad0 FROM e2l AS u9z.* ': (2, 0, 30),
 ' SELECT mwfl.* ORDER BY v1d.rk': (3, 0, 30),
 ' SELECT zo.v7ljvk, bxs3.etz  A': (4, 0, 30),
 ' b0.ssey  AS ydfygj FROM d4 AS': (1, 45, 75),
 ' ix.* FROM )  SELECT pqohm.nl6': (0, 10, 40),
 ' l5  ON q3<=to4uao.tc WHERE f9': (0, 215, 245),
 ' nufxa WHERE cn6f.ssi4<=71.070': (1, 75, 105),
 ' pqohm.nl65p  AS bpxhg FROM yc': (0, 30, 60),
 ' q36 AS w2orpr INNER JOIN l5  ': (0, 190, 220),
 ' t8a5gu, false, wtl.* FROM vz ': (0, 150, 180),
 ' tlf.z03n2w, i6fc.*, b0.ssey  ': (1, 25, 55),
 ' w61.j0sc<>lmvn6 WHERE 01270=l'

In [45]:
a[0:30,0].shape

(30,)

In [46]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 30, 128)           103424    
_________________________________________________________________
lambda_1 (Lambda)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 73)                9417      
Total params: 112,841
Trainable params: 112,841
Non-trainable params: 0
_________________________________________________________________


In [47]:
from keras.models import Model, load_model
outputs = [model.layers[l].output for l in [0]]
spymodel = Model(inputs = model.input, outputs = outputs)

In [50]:
spymodel.predict(X_test[1:52]).shape

(51, 30, 128)

In [55]:
X_test[1:128].shape

(127, 30, 73)

In [60]:
with open('sqldata.txt', 'w') as f:
    for item in test_sequences:
        f.write("%s\n" % item)

In [276]:
lines = []
with open('sqldata.txt', 'r') as f:
    line = f.readline()
    while line:
        lines.append(line.split("\n")[0])
        line = f.readline()

In [279]:
lines == test_sequences

True

In [258]:
type(test_sequences)

list

In [237]:
from nltk.grammar import is_terminal, is_nonterminal
term_nodes = [S for S in symbols if is_terminal(S)]
terminals = set(term_nodes)
L = max(len(w) for w in terminals)
def terminal(i, expr):
    for wid in reversed(range(1, L+1)):
        j = i + wid
        if j > len(expr):
            continue
        w = expr[i:j]
        if w in terminals:
            return w
    return None

tokens = []
index = []
i = 0
skipchars=['~']
while i < len(expr):
    if not expr[i] in skipchars:
        term = terminal(i, expr)
        if term is None:
            if not ignore_errors:
                raise ValueError('Could not match token', expr[i:])
            else:
                print("null")
        tokens.append(term)
        index.append((i,len(term)))
        i += len(term)
    else:
        i += 1

In [137]:
grammar.grammar

<Grammar with 272 productions>

In [240]:
temmp = symbols

In [245]:
symbols = None

In [342]:
expr = test_sequences[0]
def generate_parsetree(inputdata):
    import warnings
    import copy
    import collections
    import time, timeit
    import random
    import numpy as np
    import nltk
    from nltk import Tree
    from nltk import PCFG,CFG
    from nltk.grammar import is_terminal, is_nonterminal, ProbabilisticProduction, Nonterminal
    from nltk.probability import DictionaryProbDist
    expr = inputdata
    gram = None
    with open('sql_full_XL.pcfg', 'r') as f:
        pcfg_string = f.read()
        from nltk import PCFG
        gram = PCFG.fromstring(pcfg_string)
    rules = [r for r in gram.productions() if r.prob() > 0]
    rule_index = collections.defaultdict(list)
    for r in rules:
        rule_index[r.lhs()].append(r)
    ix = rule_index
    len_ix = {lhs:len(rules) for lhs,rules in ix.items()}
    counter = 0
    term_index = {}
    new_rules = []
    for rule in gram.productions():
        # If Nonterm := Term or Nontern := Nonterm rule, skip
        if len_ix[rule.lhs()]==1 and len(rule.rhs())==1:
            new_rules.append(rule)
            continue
        # Otherwise creates a new rule
        new_rhs = []
        for r in rule.rhs():
            if is_nonterminal(r) :
                new_rhs.append(r)
            else:
                if r not in term_index:
                    new_left = Nonterminal('symb_'+str(r))
                    prule = ProbabilisticProduction(new_left,
                                                    [r],
                                                    prob=1.0)
                    term_index[r]=prule
                new_rhs.append(new_left)
        new_rule = ProbabilisticProduction(rule.lhs(),
                                           new_rhs,
                                           prob=rule.prob())
        new_rules.append(new_rule)
    new_rules += term_index.values()
    gram = PCFG(gram.start(), new_rules)
    rules = [r for r in gram.productions() if r.prob() > 0]
    rule_index = collections.defaultdict(list)
    for r in rules:
        rule_index[r.lhs()].append(r)
    old_grammar = gram
    # BFS
    arules = set()
    visited = set()
    Q = collections.deque()
    Q.append(gram.start())
    while len(Q) > 0:
        # Gets first symbol in queue
        symb = Q.popleft()
        # Fetches corresponding rules
        rules = rule_index[symb]
        for r in rules:
            arules.add(r)
            for symb in r.rhs():
                if is_nonterminal(symb) and not symb in visited:
                    Q.append(symb)
                    visited.add(symb)
    # Creates new rules
    arules = list(arules)
    check_rules(arules)
    gram = PCFG(old_grammar.start(), arules)

    symbols = set()
    for rule in gram.productions():
        symbols.add(rule.lhs())
        for r in rule.rhs():
            symbols.add(r)

    
    term_nodes = [S for S in symbols if is_terminal(S)]
    terminals = set(term_nodes)
    L = max(len(w) for w in terminals)
    def terminal(i, expr):
        for wid in reversed(range(1, L+1)):
            j = i + wid
            if j > len(expr):
                continue
            w = expr[i:j]
            if w in terminals:
                return w
        return None

    tokens = []
    index = []
    i = 0
    skipchars=['~']
    while i < len(expr):
        if not expr[i] in skipchars:
            term = terminal(i, expr)
            if term is None:
                if not ignore_errors:
                    raise ValueError('Could not match token', expr[i:])
                else:
                    print("null")
            tokens.append(term)
            index.append((i,len(term)))
            i += len(term)
        else:
            i += 1

    parse = None
    for p in parser.parse(tokens):
        parse = p
        break

    rules = [r for r in gram.productions() if r.prob() > 0]
    rule_index = collections.defaultdict(list)
    for r in rules:
        rule_index[r.lhs()].append(r)
    # Gets the symbols BFS
    visited = [gram.start()]
    Q = collections.deque()
    Q.append(gram.start())

    while len(Q) > 0:
        # Gets first symbol in queue
        symb = Q.popleft()
        # Fetches corresponding rules
        rules = rule_index[symb]
        for r in rules:
            for symb in r.rhs():
                if is_nonterminal(symb) and not symb in visited:
                    Q.append(symb)
                    visited.append(symb)

    visited = list(reversed(visited))

    # Alternative method for checking correctness
    alt_visited = [S for S in symbols if is_nonterminal(S)]
    assert set(visited) == set(alt_visited)

    non_term_nodes = visited
    tree = parse
    n_tokens = len(tree.leaves())
    nt_symbols = non_term_nodes

    rule_feats = np.zeros((n_tokens, len(nt_symbols)))
    rule2feat = {s.symbol():i for i,s in enumerate(nt_symbols)}

    def visit(tree, offset):
        if isinstance(tree, Tree):
            n_tokens = 0
            for subtree in tree:
                n_tokens += visit(subtree, offset + n_tokens)
            symb = tree.label()
            j = rule2feat[symb]
            rule_feats[offset:offset+n_tokens, j] = 1
            return n_tokens
        else:
            return 1

    visit(tree, 0)
    seq = expr
    lex_index = index
    w_feats = rule_feats


    ch_len = len(seq)
    w_len  = w_feats.shape[0]
    n_feats = w_feats.shape[1]
    ch_feats = np.zeros((ch_len, n_feats))

    for i_w in range(w_len):
        f_w  = w_feats[i_w,...]
        i_ch,len_ch = lex_index[i_w]
        f_ch = np.tile(f_w, len_ch).reshape((len_ch, n_feats))
        ch_feats[i_ch:i_ch+len_ch,:] = f_ch
    return [ch_feats,nt_symbols]

In [460]:
expr = test_sequences[0]
a = generate_parsetree(test_sequences[0])

In [346]:
tree = a[0]
names = a[1]

In [461]:
def extractfeature(intermiediate, inputdata):  
    tree = intermiediate[0]
    names = intermiediate[1]
    seq= inputdata
    pad_char='~'
    w_size = 30
    step_size = 5
    if w_size == -1:
        w_size = max(len(S) for S in sequences)
    X = []
    y = []
    prov_index = {}

    steps = range(step_size, len(seq), step_size)
    if len(steps) == 0:
        steps = [len(seq)-1]
    for i in steps:
        end = i
        start = max(0, i-w_size)
        s = seq[start:end]
        assert len(s) <= w_size
        x = s.rjust(w_size, pad_char)
        assert len(x) == w_size
        X.append(x)
        y.append(seq[i])
        prov_index[x] = (start, end)

    featuress = []
    for idx, name in enumerate(names):
        features = []
        for key, value in prov_index.items(): 
            start, end = value
            feature = tree[start:end,idx]
            L = end - start
            if L < 30:
                tmp = np.zeros((30))
                tmp[-L:] = feature
                feature = tmp
            features.append(feature)
        feat_name = 'F_' + name.symbol()
        featuress.append(features)
    return np.array(featuress)

In [462]:
f = extractfeature(a,test_sequences[0])
f.shape

(171, 54, 30)

In [490]:
seq = test_sequences[0]
pad_char='~'
w_size = 30
step_size = 5
n_test_tuples = 128
if w_size == -1:
    w_size = max(len(S) for S in sequences)
X = []
y = []
prov_index = {}

steps = range(step_size, len(seq), step_size)
if len(steps) == 0:
    steps = [len(seq)-1]
for i in steps:
    end = i
    start = max(0, i-w_size)
    s = seq[start:end]
    assert len(s) <= w_size
    x = s.rjust(w_size, pad_char)
    assert len(x) == w_size
    X.append(x)
    y.append(seq[i])
    prov_index[x] = (i_seq, start, end)
    if len(X) > n_test_tuples:
        break

test_from = X
test_to = y 


In [491]:
char2int = {' ': 72, '$': 23, "'": 12, '(': 68, ')': 8, '*': 71, '+': 33, ',': 24, '-': 26, '.': 2,
 '/': 5, '0': 22, '1': 6, '2': 43, '3': 38, '4': 14, '5': 58, '6': 66, '7': 18, '8': 55, '9': 69,
 '<': 52, '=': 57, '>': 60, 'A': 1, 'B': 47, 'C': 9, 'D': 40, 'E': 62, 'F': 59, 'G': 21, 'H': 32,
 'I': 70, 'J': 56, 'L': 15, 'M': 50, 'N': 63, 'O': 7, 'P': 31, 'R': 27, 'S': 39, 'T': 51, 'U': 25,
 'V': 29, 'W': 49, 'Y': 20, 'a': 61, 'b': 64, 'c': 44, 'd': 37, 'e': 16, 'f': 54, 'g': 11, 'h': 28,
 'i': 67, 'j': 17, 'k': 46, 'l': 3, 'm': 13, 'n': 42, 'o': 65, 'p': 41, 'q': 34, 'r': 10, 's': 19, 
't': 36, 'u': 48, 'v': 53, 'w': 30, 'x': 4, 'y': 45, 'z': 35, '~': 0}

In [504]:
class TwoDimEncoders:

    @staticmethod
    def raw_to_encoded(seqs, seqs2=None, cust_char2int=None):

        if cust_char2int is None:
            all_chars = reduce(lambda chars,seq : set(chars) | set(seq), seqs)
            if seqs2 is not None:
                all_chars |= reduce(lambda chars,seq : set(chars) | set(seq),
                                    seqs2)
            all_chars = sorted(all_chars)

            char2int =  {c:i for i, c in enumerate(all_chars)}
        else:
            char2int = cust_char2int

        encoded_seqs = [[char2int[x] for x in seq] for seq in seqs]
        if seqs2 is None:
            return encoded_seqs, char2int

        encoded_seqs2 = [[char2int[x] for x in seq] for seq in seqs2]
        return encoded_seqs, char2int, encoded_seqs2

    @staticmethod
    def encoded_to_bin_tensor(enc_seqs, char2int, enc_seqs2=None,
                                start_at_min=False):

        min_x = min(char2int.values()) if start_at_min else 0
        n_chars = max(char2int.values()) - min_x + 1

        seq_len = len(enc_seqs[0])

        X = np.zeros((len(enc_seqs), seq_len, n_chars), dtype=np.int)
        for i, enc_seq in enumerate(enc_seqs):
            for j, x in enumerate(enc_seq):
                k = x - min_x
                X[i, j, k] = 1

        if enc_seqs2 is None:
            return X

        X2 = np.zeros((len(enc_seqs2), seq_len, n_chars), dtype=np.int)
        for i, enc_seq in enumerate(enc_seqs2):
            for j, x in enumerate(enc_seq):
                k = x - min_x
                X2[i, j, k] = 1

        return X, X2

    @staticmethod
    def raw_to_bin_tensor(seqs, seqs2=None, cust_char2int=None):
        encoding_out = TwoDimEncoders.raw_to_encoded(seqs, seqs2, cust_char2int)
        if len(encoding_out) == 2:
            encoded_seqs, char2int = encoding_out
            encoded_seqs2 = None
        else:
            encoded_seqs, char2int, encoded_seqs2 = encoding_out

        bin_tensors = TwoDimEncoders.encoded_to_bin_tensor(
            encoded_seqs, char2int, encoded_seqs2)

        if type(bin_tensors) is not tuple:
            return bin_tensors, char2int

        out = bin_tensors + (char2int,)
        return out

X_test, char2int = TwoDimEncoders.raw_to_bin_tensor(test_from, cust_char2int=char2int)
    

class OneDimEncoders:

    @staticmethod
    def raw_to_encoded(seq, seq2=None, cust_char2int=None):

        if cust_char2int is None:
            all_chars = set(seq)
            if seq2 is not None:
                all_chars |= set(seq2)
            all_chars = sorted(all_chars)

            char2int =  {c:i for i, c in enumerate(all_chars)}
        else:
            char2int = cust_char2int

        encoded_seq = [char2int[x] for x in seq]
        if seq2 is None:
            return encoded_seq, char2int

        encoded_seq2 = [char2int[x] for x in seq2]
        return encoded_seq, char2int, encoded_seq2

    @staticmethod
    def encoded_to_bin_tensor(enc_seq, char2int, enc_seq2=None,
                                start_at_min=False, add_dim=False):

        min_x = min(char2int.values()) if start_at_min else 0
        n_chars = max(char2int.values()) - min_x + 1

        X = np.zeros((len(enc_seq), n_chars), dtype=np.int)
        for i, x in enumerate(enc_seq):
            j = x - min_x
            X[i, j] = 1

        if add_dim:
            X = X[:,np.newaxis,:]

        if enc_seq2 is None:
            return X

        X2 = np.zeros((len(enc_seq2), n_chars), dtype=np.int)
        for i, x in enumerate(enc_seq2):
            j = x - min_x
            X2[i, j] = 1

        if add_dim:
            X2 = X2[:,np.newaxis,:]

        return X, X2

    @staticmethod
    def raw_to_bin_tensor(seq, seq2=None, cust_char2int=None, add_dim=False):
        encoding_out = OneDimEncoders.raw_to_encoded(seq, seq2, cust_char2int)
        if len(encoding_out) == 2:
            encoded_seq, char2int = encoding_out
            encoded_seq2 = None
        else:
            encoded_seq, char2int, encoded_seq2 = encoding_out

        bin_tensors = OneDimEncoders.encoded_to_bin_tensor(
            encoded_seq, char2int, encoded_seq2, add_dim=add_dim)

        if type(bin_tensors) is not tuple:
            return bin_tensors, char2int

        out = bin_tensors + (char2int,)
        return out

    
y_test, char2int = OneDimEncoders.raw_to_bin_tensor(test_to, cust_char2int=char2int)


In [512]:
print(X_test.shape)
print(y_test.shape)

(54, 30, 73)
(54, 73)


In [511]:
len(test_from)

54

In [468]:
model

<keras.engine.sequential.Sequential at 0x7f9d834f5be0>

In [471]:
from keras.models import Model, load_model
newmodel = load_model("track_history/models-03-2.78.hdf5")
newmodel._make_predict_function()
layer = [0]
outputs = [newmodel.layers[l].output for l in layer]
newspymodel = Model(inputs = newmodel.input, outputs = outputs)

In [513]:
newspymodel.predict(X_test).shape

(54, 30, 128)

In [475]:
X_test.shape

(128, 30, 73)