# Levenshtein distance between synthetic data and student programs

The purpose of this notebook is to compute the edit distance between the synthetic data generated by our grammars and the original student programs. Synthetic data is used to train the neural network, which is then evaluated on student programs. 

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('..')

In [50]:
from autoplan.trainer import ClassifierTrainer, option_combinations
from autoplan.dataset import PrelabeledDataset, build_synthetic_dataset, build_prelabeled_dataset
from autoplan.generator import ProgramGenerator
from autoplan.vis import plot_accuracy, plot_cm, plot_loss
from autoplan.token import TokenType, PyretTokenizer, OCamlTokenizer
from scripts.rainfall_ingest import ingest_dataset

from grammars.rainfall.ocaml import Program
from grammars.rainfall.labels import GeneralRainfallLabels

from tqdm import tqdm_notebook as tqdm
import pandas as pd
import torch
import os
import numpy as np
import torch.nn as nn
import pprint
# import matplotlib.pyplot as plt
# import seaborn as sns

# Presumably the root of the local autoplan checkout; '~' is expanded to the
# current user's home directory.
REPO_DIR = os.path.expanduser('~/autoplan')

# Torch device passed to the trainer. CPU is the active choice; to run on the
# first CUDA GPU instead, use the commented alternative below.
# device = torch.device('cuda:0')
device = torch.device('cpu')

# Student dataset

In

In [40]:
# Student dataset: the OCaml solutions stored under the identifier 'T1'.
# Preprocessing is enabled and TokenType.String is passed in the exclusion
# list (presumably dropping string-literal tokens — confirm in ingest_dataset).
dataset_name = 'T1'
student_dataset = ingest_dataset(
    dataset_name,
    preprocess=True,
    exclude=[TokenType.String],
)

# Synthetic dataset

In

In [59]:
# Synthetic dataset: programs sampled from the OCaml rainfall grammar and
# labelled with GeneralRainfallLabels. The student dataset's vocab_index is
# reused (presumably so both datasets share one token-to-index mapping).
#
# NOTE(review): the recorded output of this cell shows a generated program whose
# token list is empty (`tokens: []`), and the training cell later fails with
# "Length of all samples has to be greater than 0". These look related — confirm,
# then fix the grammar/tokenizer or drop empty samples before training.
ocaml_tokenizer = OCamlTokenizer()
rainfall_generator = ProgramGenerator(grammar=Program())

synthetic_dataset = build_synthetic_dataset(
    GeneralRainfallLabels,
    N=100,
    tokenizer=ocaml_tokenizer,
    generator=rainfall_generator,
    vocab_index=student_dataset.vocab_index,
)

Generating programs...
Generated 71 unique programs.
Tokenizing programs...
0
program: let  helper_name 
        list_name addition_var counter_var =  
    
    match list_name with
    | [] -> if counter_var > 0 then addition_var /. counter_var else 0.
    | head :: tail -> helper_name [] addition_var counter_var
    | head :: tail -> if head >= 0 -> helper_name tail (head + addition_var) (counter_var + 1) else helper_name tail addition_var counter_var

;;

let rec rainfall list_name =
    helper_name list_name 0. 0.;;
    
tokens: []
choices: [('START', 0), ('strategy', 0), ('recursion', 1), ('uses_annotation', 1), ('rainfall_body_specs', 0), ('recursion_strategy', 1), ('_type', 1), ('check_empty_list', 0), ('fail_message', 2), ('helper_in_body', 1), ('raises_failwith', 1), ('if_statement', 0), ('then_statement', 1), ('check_div_by_zero', 0), ('gt_zero', 0), ('separate_sentinel_check', 1), ('recurse_empty_list', 0), ('check_positive_head', 0)]
options: {'START': [(1.0, None)], 'strat

# Model

In

In [52]:
# Options forwarded to ClassifierTrainer: a GRU recurrent module with
# 512-wide hidden state and 512-wide embeddings.
model_options = dict(
    model=nn.GRU,
    hidden_size=512,
    embedding_size=512,
)

# The classifier is trained on the synthetic dataset, on the configured device.
trainer = ClassifierTrainer(synthetic_dataset, device, model_options)

In [53]:
# Per-epoch history: the value returned by train_one_epoch() and the
# (train, validation) pair returned by trainer.eval().
losses, train_eval, val_eval = [], [], []

# NOTE(review): the recorded run of this cell stopped with
# "RuntimeError: Length of all samples has to be greater than 0 ...". The
# synthetic-dataset output earlier in the notebook shows a program with
# `tokens: []`, which is presumably the zero-length sample — confirm and fix
# upstream before re-running.
for epoch in tqdm(range(50)):
    epoch_loss = trainer.train_one_epoch()
    losses.append(epoch_loss)

    train, val = trainer.eval()
    train_eval.append(train)
    val_eval.append(val)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




RuntimeError: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0