In [6]:
# Reload imported modules before each cell runs, so edits to the local
# packages imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Put the parent of the working directory on the import path so the local
# `grammars` and `autoplan` packages can be imported.
# NOTE(review): presumably '..' is the repository root — confirm.
import sys
sys.path.append('..')

In [8]:
import pandas as pd
import os

from grammars.rainfall.labels import GeneralRainfallLabels, DetailedRainfallLabels
from autoplan.dataset import build_prelabeled_dataset, PrelabeledDataset
from autoplan.token import OCamlTokenizer, PyretTokenizer, TokenizerError

In [9]:
# Directory layout: everything is rooted at ~/autoplan.
REPO_DIR = os.path.expanduser('~/autoplan')
# Raw rainfall data (coding CSVs) lives here ...
DATA_DIR = REPO_DIR + '/data/rainfall/raw'
# ... and the per-dataset program sources live one level below.
CODE_DIR = DATA_DIR + '/Fall2013-RawData'

def read_coding_csv(name):
    """Load the Fall 2013 coding sheet for dataset `name` as a DataFrame.

    Reads `Fall2013Coding{name}.csv` from DATA_DIR with no header row and the
    first column as the index, then transposes the result so the values of
    that first column become the column labels.
    """
    csv_path = f'{DATA_DIR}/Fall2013Coding{name}.csv'
    sheet = pd.read_csv(csv_path, index_col=0, header=None)
    return sheet.transpose()

In [11]:
# Table of plan codes; read_and_join_coding joins coding sheets against its
# `Code` column.
plan_codes = pd.read_csv(DATA_DIR + '/PlanCodes-codes.csv')

def read_and_join_coding(name):
    """Load the coding sheet for `name` and attach the plan-code columns.

    Rows whose `PlanStructure` is null are dropped. The remaining rows are
    re-indexed by `PlanStructure` and joined (DataFrame.join with its default
    settings) against the module-level `plan_codes` table indexed by `Code`.

    Returns:
        The joined DataFrame, indexed by the `PlanStructure` values.
    """
    sheet = read_coding_csv(name)

    # Keep only the entries that were actually assigned a plan structure.
    has_plan = sheet['PlanStructure'].notnull()
    coded = sheet.loc[has_plan].set_index('PlanStructure')

    code_table = plan_codes.set_index('Code')
    return coded.join(code_table)

In [33]:
# Preview the joined coding table for the T1 dataset (indexed by the
# PlanStructure code, with the plan-code columns appended on the right).
df = read_and_join_coding('T1')
df

Unnamed: 0,ID,School,Course,Lang,Time,Prior,NumTests,TestsAccurate,TestNegatives?,TestEmpty?,...,randomSet,Note,show?,CombinedErrors,ErrorCount,Form,Detail Category,Gen Category,Repeat Computation,Which Tasks?
IP1,10,Brown,cs17,ml,25,little,7,y,y,y,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,14,Brown,cs17,ml,10,other course,14,y,y,y,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,15,Brown,cs17,ml,20,none,3,y,y,n,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,35,Brown,cs17,ml,30,2 years with APCS,7,y,y,n,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,50,Brown,cs17,ml,15,APCS; CS4,6,y,y,y,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,53,Brown,cs17,ml,30,matlab; python,9,y,y,y,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,61,Brown,cs17,ml,45,lua,6,y,y,y,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,85,Brown,cs17,ml,25,APCS,0,0,n,n,...,1,,,,0,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,86,Brown,cs17,ml,30,none,4,y,y,y,...,1,,,X-G-P;I;,2,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all
IP1,99,Brown,cs17,ml,5,matlab html,3,y,y,y,...,1,,,X-G-P;,1,(N & T & S & C) ; (D -> A),AccumStyle,SingleLoop,n,all


In [34]:
# Names of the coded datasets; each is the `{name}` suffix of a
# Fall2013Coding{name}.csv sheet.
datasets = [
    'HS',
    'T1',
    'T1Acc',
    'T2',
    'T3Non',
]

In [35]:
# Per-dataset ingestion settings, keyed by dataset name:
#   path      -- callable mapping an entry ID to that entry's source file
#   tokenizer -- tokenizer class to instantiate for the dataset
dataset_config = dict(
    # T1: one `.ml` file per entry, tokenized with OCamlTokenizer.
    T1=dict(
        path=lambda id: f'{CODE_DIR}/T1/{id}.ml',
        tokenizer=OCamlTokenizer,
    ),
    # T1Acc: one directory per entry holding an `.arr` file, tokenized with
    # PyretTokenizer.
    T1Acc=dict(
        path=lambda id: f'{CODE_DIR}/T1Acc/{id}/cs019-2013-rainfall/rainfall-program.current.arr',
        tokenizer=PyretTokenizer,
    ),
)

In [54]:
def ingest_dataset(name):
    """Build a prelabeled dataset from the coded programs of dataset `name`.

    For every row of the joined coding table, the source file located by
    ``dataset_config[name]['path']`` is read and trial-tokenized with the
    dataset's tokenizer. Rows whose file does not exist, or whose source
    raises TokenizerError, are skipped.

    Args:
        name: Key into ``dataset_config``; also the coding-sheet name passed
            to ``read_and_join_coding``.

    Returns:
        The result of ``build_prelabeled_dataset`` over the kept program
        sources, each labeled with its `Gen Category` parsed as a
        GeneralRainfallLabels value.

    Raises:
        KeyError: if `name` has no entry in ``dataset_config`` (raised after
            the coding table has been read).
    """
    codes = read_and_join_coding(name)
    config = dataset_config[name]
    tokenizer = config['tokenizer']()

    programs = []
    labels = []
    for _, entry in codes.iterrows():
        path = config['path'](entry.ID)
        try:
            # Context manager closes the handle right away instead of leaving
            # one open file per entry for the garbage collector.
            with open(path) as src_file:
                src = src_file.read()
        except FileNotFoundError:
            # No source file at this path: skip the entry.
            continue

        general_label = GeneralRainfallLabels.from_string(entry['Gen Category'].strip())
        # The detailed label is not used below; the call is kept so this
        # column is still run through from_string exactly as before.
        DetailedRainfallLabels.from_string(entry['Detail Category'].strip())

        try:
            # Trial run: consume the whole token stream so programs the
            # tokenizer rejects are dropped before building the dataset.
            list(tokenizer.tokenize(src))
        except TokenizerError:
            continue

        programs.append(src)
        labels.append(general_label)

    return build_prelabeled_dataset(GeneralRainfallLabels, programs, labels, tokenizer)

In [55]:
# Ingest each listed dataset and save it as {key}.pkl under data/rainfall.
# Only 'T1' is listed here.
for key in ['T1']:
    ds = ingest_dataset(key)
    ds.save(f'{REPO_DIR}/data/rainfall/{key}.pkl')

100%|██████████| 44/44 [00:00<00:00, 166.69it/s]
