# Automatic grammar validation

Training a neural net to determine whether a given sentence is grammatically valid or not.

The plan is:
1. Take sentences
2. Randomly perturb them
3. Train a classifier to distinguish perturbed sentences from the original ones

In [1]:
import numpy as np

## Load

In [8]:
# Build one corpus string from both source texts, separated by a blank line.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and would garble
# the Russian text.
# NOTE(review): assumes both files are stored as UTF-8 - confirm.
text = ''

with open('texts/alg.txt', encoding='utf-8') as f:
    text += f.read()

text += '\n\n'

with open('texts/testament.txt', encoding='utf-8') as f:
    text += f.read()

In [9]:
import nltk
from nltk.tokenize import TweetTokenizer

# Pre-trained Punkt sentence splitter (loaded from the NLTK data directory)
# and a word-level tokenizer; the next cell uses both to turn `text` into
# lists of tokens.
# NOTE(review): this loads the English Punkt model although the corpus printed
# in this notebook is Russian - confirm that is intended.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = TweetTokenizer()

In [10]:
# Split the corpus into sentences and every sentence into a list of tokens.
sentences = []

for sent in sent_detector.tokenize(text):
    tokens = tokenizer.tokenize(sent)
    # Drop a leading numeric token (presumably a verse or list number -
    # verify against the corpus). The emptiness check keeps tokens[0] safe.
    if tokens and tokens[0].isdigit():
        tokens = tokens[1:]
    # A sentence with no tokens left cannot be perturbed or embedded by the
    # later cells, so it is skipped instead of being stored as [].
    if tokens:
        sentences.append(tokens)

In [16]:
# Display one tokenized sentence as a sanity check of the tokenization.
sentences[2]

['Как',
 'ваш',
 'маленький',
 'GPS',
 'в',
 'считанные',
 'секунды',
 'на',
 '\xad',
 'ходит',
 'самый',
 'быстрый',
 'пуrь',
 'из',
 'несметного',
 'множества',
 'возможных',
 'маршруrов',
 '?']

In [23]:
# Tag two sentences with NLTK's Russian tagger to inspect the tag format.
nltk.pos_tag_sents(sentences[2:4], lang='rus')

[[('Как', 'CONJ'),
  ('ваш', 'A-PRO=m'),
  ('маленький', 'A=m'),
  ('GPS', 'NONLEX'),
  ('в', 'PR'),
  ('считанные', 'V'),
  ('секунды', 'S'),
  ('на', 'PR'),
  ('\xad', 'S'),
  ('ходит', 'V'),
  ('самый', 'A-PRO=m'),
  ('быстрый', 'A=m'),
  ('пуrь', 'S'),
  ('из', 'PR'),
  ('несметного', 'A=n'),
  ('множества', 'S'),
  ('возможных', 'A=pl'),
  ('маршруrов', 'S'),
  ('?', 'NONLEX')],
 [('Когда', 'CONJ'),
  ('вы', 'S-PRO'),
  ('покупаете', 'V'),
  ('что-то', 'S-PRO'),
  ('в', 'PR'),
  ('Интернете', 'S'),
  (',', 'NONLEX'),
  ('как', 'CONJ'),
  ('обеспечивается', 'V'),
  ('защита', 'S'),
  ('номера', 'S'),
  ('вашей', 'A-PRO=f'),
  ('кредитной', 'A=f'),
  ('карты', 'S'),
  ('от', 'PR'),
  ('перехвата', 'S'),
  ('злоумышленником', 'S'),
  ('?', 'NONLEX')]]

## Perturb

In [12]:
def perturb(sentence):
    """Return the tokens of `sentence` in a random order.

    The shuffling is delegated to ``np.random.permutation``, so the result is
    a NumPy array holding a shuffled copy of the tokens rather than a list.
    """
    shuffled_tokens = np.random.permutation(sentence)
    return shuffled_tokens

In [13]:
def slightly_perturb(sentence):
    """Return a copy of `sentence` with one random token moved elsewhere.

    A source position is drawn at random and the token found there is
    re-inserted at a different, randomly drawn position. Because the target
    never equals the source, the result differs from the input whenever the
    tokens are distinct; previously the token could land back in its own
    slot, producing an unchanged sentence that was still used as a negative
    example.

    Args:
        sentence: sequence of tokens; it is not modified.

    Returns:
        A new list with the same tokens. Sentences with fewer than two tokens
        have nothing to move and are returned as an unchanged copy.
    """
    tokens = list(sentence)
    n_tokens = len(tokens)
    if n_tokens < 2:
        return tokens

    remove_from = np.random.randint(n_tokens)
    # Draw among the n_tokens - 1 positions other than the source, then skip
    # over the source index so every other position is equally likely.
    insert_to = np.random.randint(n_tokens - 1)
    if insert_to >= remove_from:
        insert_to += 1

    token = tokens.pop(remove_from)
    tokens.insert(insert_to, token)
    return tokens

In [14]:
# Build one perturbed (negative) example per original sentence: a coin flip
# chooses between a full shuffle and moving a single token.
perturbed_sentences = []
for original_sentence in sentences:
    if np.random.rand() > 0.5:
        perturbed_sentences.append(perturb(original_sentence))
    else:
        perturbed_sentences.append(slightly_perturb(original_sentence))

## Embed

In [61]:
import sentencepiece as spm
from gensim.models import KeyedVectors

# Pre-trained subword resources: a SentencePiece BPE model and vectors in
# word2vec binary format. Judging by the file names these are Russian
# Wikipedia BPE embeddings (1000 merge operations, 25 dimensions) - TODO confirm.
bpe_model_location = 'ru.wiki.bpe.op1000.model'
bpe_vec_location = 'ru.wiki.bpe.op1000.d25.w2v.bin'

# Segmenter that splits text into BPE pieces.
sp = spm.SentencePieceProcessor()
sp.Load(bpe_model_location)
# Piece -> vector lookup table read from the binary word2vec file.
bpe_model = KeyedVectors.load_word2vec_format(bpe_vec_location, binary=True)

def bpe_embed(text):
    """Embed `text` as the mean of the vectors of its BPE pieces.

    Args:
        text: string to segment with the SentencePiece model `sp`.

    Returns:
        NumPy vector of length ``bpe_model.vector_size``. Pieces without a
        pre-trained vector are left out of the average; if no piece has a
        vector the result is all zeros.
    """
    embedding = np.zeros(bpe_model.vector_size)
    piece_count = 0

    for piece in sp.encode_as_pieces(text):
        # The original code assumed bytes pieces; SentencePiece bindings that
        # return str would fail on .decode(), so decode only when needed.
        if isinstance(piece, bytes):
            piece = piece.decode('utf-8')
        try:
            vector = bpe_model[piece]
        except KeyError:
            # Deliberate best effort: unknown pieces are simply skipped.
            continue
        embedding += vector
        piece_count += 1

    if piece_count:
        embedding /= piece_count

    return embedding

In [81]:
# NLTK's Russian POS tagger, obtained through a private NLTK helper so that a
# single tagger object is reused for all sentences.
tagger = nltk.tag._get_tagger('rus')
# Tag inventory that defines the one-hot axis of the POS features. Sorting
# pins the tag -> index mapping so the feature layout is the same in every
# run.
# NOTE(review): tagger.classes looks like a set in NLTK's perceptron tagger,
# whose iteration order for strings can change between interpreter runs -
# confirm.
pos_tags = sorted(tagger.classes)

def embed_sentences(sentences):
    """Lazily turn tokenized sentences into per-token feature vectors.

    All sentences are POS-tagged with `tagger`. For each one, a list with one
    vector per token is yielded: the BPE embedding of the token followed by a
    one-hot encoding of its POS tag over `pos_tags`.
    """
    tag_count = len(pos_tags)

    for tagged_sentence in tagger.tag_sents(sentences):
        token_vectors = []

        for word, tag in tagged_sentence:
            word_vector = bpe_embed(word)
            tag_one_hot = np.zeros(tag_count)
            tag_one_hot[pos_tags.index(tag)] = 1
            token_vectors.append(np.concatenate([word_vector, tag_one_hot]))

        yield token_vectors

In [82]:
# Feature sequences for every example: originals first, then perturbed ones.
X = [*embed_sentences(sentences + perturbed_sentences)]
# Labels in the same order: 1 for an original sentence, 0 for a perturbed one.
y = [1] * len(sentences) + [0] * len(perturbed_sentences)

In [114]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn
# releases. Fall back to it only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Random split with the library defaults; no seed is fixed, so the split
# changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Train

In [92]:
import torch
from torch import nn
import torch.nn.functional as F

# Normal(0, 1) distribution; validate_grammar() samples the LSTM's initial
# states from it.
normal_dist = torch.distributions.Normal(0, 1)

In [122]:
import itertools

# Sequence encoder: 136 input features per token, 32 hidden units; with
# batch_first=True the input is laid out as (batch, sequence, feature).
lstm = nn.LSTM(136, 32, batch_first=True)
# MLP head over 64 features (validate_grammar feeds it the two final 32-wide
# LSTM states concatenated); the closing Sigmoid yields a value in (0, 1).
classifier = nn.Sequential(nn.Linear(64, 64), nn.Tanh(), nn.Linear(64, 64), nn.Tanh(), nn.Linear(64, 1), nn.Sigmoid())
# One Adam optimizer over the parameters of both modules.
opt = torch.optim.Adam(itertools.chain(lstm.parameters(), classifier.parameters()))
def validate_grammar(sentences):
    """Score a batch of embedded sentences with the LSTM and the MLP head.

    Args:
        sentences: batch of equally long sentences, each a sequence of
            per-token feature vectors (the format yielded by
            embed_sentences), or an already built tensor of that layout.

    Returns:
        1-D tensor with one sigmoid output in (0, 1) per sentence. The
        training labels are 1 for original and 0 for perturbed sentences.

    The LSTM's initial hidden and cell states are sampled from `normal_dist`
    on every call, so repeated calls on the same input give different scores.
    """
    if torch.is_tensor(sentences):
        batch = sentences.float()
    else:
        # Going through one ndarray avoids the slow element-by-element
        # conversion of nested lists of arrays.
        batch = torch.as_tensor(np.asarray(sentences), dtype=torch.float32)

    # Size the states from the actual batch instead of hard-coding batch
    # size 1, so more than one sentence can be scored per call.
    state_shape = (lstm.num_layers, batch.shape[0], lstm.hidden_size)
    # nn.LSTM takes and returns its states in (hidden, cell) order.
    h0, c0 = normal_dist.sample(state_shape), normal_dist.sample(state_shape)
    _, (hn, cn) = lstm(batch, (h0, c0))
    return classifier(torch.cat((hn[0], cn[0]), dim=1))[:, 0]

In [129]:
def fit_epoch():
    """Run one pass over the training split, one optimizer step per sentence.

    Each sentence is scored on its own (batch of one), the binary
    cross-entropy against its label is back-propagated, and the gradients are
    cleared again right after the optimizer step.
    """
    for features, label in zip(X_train, y_train):
        target = torch.Tensor([label])
        prediction = validate_grammar([features])
        F.binary_cross_entropy(prediction, target).backward()
        opt.step()
        opt.zero_grad()

from sklearn.metrics import accuracy_score

def test():
    """Print classification accuracy on the training and the test split.

    Every sentence is scored on its own with validate_grammar and counted as
    valid (label 1) when the score exceeds 0.5.
    """
    def predict_labels(samples):
        # accuracy_score needs hard 0/1 labels; the raw model outputs are
        # one-element probability tensors that still carry gradient history.
        with torch.no_grad():
            return [int(validate_grammar([sample]).item() > 0.5) for sample in samples]

    print(f'train accuracy {accuracy_score(y_train, predict_labels(X_train))}')
    print(f'test accuracy  {accuracy_score(y_test, predict_labels(X_test))}')

In [None]:
# Train for a single epoch, then print train and test accuracy.
fit_epoch()
test()