In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the pipe-delimited ('|') observation/suggestion export into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../20190725_obs_sug_pipe_separated.txt', sep='|')

In [3]:
# Keep every column except the first one, then preview the result.
# NOTE(review): this rebinds `df`, so re-running the cell drops one more
# column each time -- re-run the load cell first if you need to repeat it.
df = df.loc[:, df.columns[1:]]
df.head()

Unnamed: 0,name,session,type,item,sessionset,OBS,Sug
0,Carolyn Greenberg,5157,3026,3401,48,Passage seems to be at independent reading lev...,check comprehension in similarly leveled text ...
1,Sharilyn Fetterhoff-Bacci,5157,3026,3401,50,reads carefully and with attention to punctuat...,Work on reading with expression; teach proper ...
2,Susan Barber,5157,3026,3401,50,"Good, generally accurate reader; reads in mean...",Increase oral reading stamina by reading easie...
3,Lindsay Wyman,5159,3026,3302,47,$OBS: flat intonation; passage seems slightly ...,$SUG: repeated readings to increase fluency an...
4,Sharilyn Fetterhoff-Bacci,5159,3026,3302,50,$obs: reads in longer phrases; adequate pace a...,$sug: work on reading with expression; model r...


In [4]:
# Flatten each DataFrame row into a 6-tuple of strings:
#   (session, item, type, name, observation text, suggestion text)
# The free-text fields are lower-cased, have any '$obs'/'$sug' marker (with or
# without a trailing colon) removed, and are stripped of surrounding whitespace.
train_examples = []

for _, record in df.iterrows():
    obs_text = record['OBS'].lower()
    obs_text = obs_text.replace('$obs:', '').replace('$obs', '').strip()

    sug_text = record['Sug'].lower()
    sug_text = sug_text.replace('$sug:', '').replace('$sug', '').strip()

    train_examples.append((
        str(record['session']),   # reader / "kid" identifier
        str(record['item']),      # passage identifier
        str(record['type']),      # passage type / difficulty
        str(record['name']),      # evaluator name
        str(obs_text),
        str(sug_text),
    ))

In [5]:
from xml.etree import ElementTree
from xml.etree.ElementTree import Element, SubElement, tostring

In [6]:
# Group the flat examples by reading: the first three tuple fields form the
# key, and the remaining fields are collected (in encounter order) as one
# evaluation entry per example.
reading_id_to_obs_and_sug = {}
for example in train_examples:
    reading_key, evaluation = example[:3], example[3:]
    reading_id_to_obs_and_sug.setdefault(reading_key, []).append(evaluation)

In [7]:
# Build the XML tree. Layout produced:
#
#   <root>
#     <ReadingExample>
#       <ReaderID/> <PassageID/> <PassageType/>
#       <EvaluationList>
#         <Evaluation>
#           <Evaluator/>
#           <ObservationsList> <Observation/>* </ObservationsList>
#           <SuggestionsList>  <Suggestion/>*  </SuggestionsList>
#         </Evaluation>*
#       </EvaluationList>
#     </ReadingExample>*
#   </root>
root = Element('root')
for reading_id, eval_obs_sugs in reading_id_to_obs_and_sug.items():
    kid, passage, difficulty = reading_id
    reading_example = SubElement(root, 'ReadingExample')

    # Identifying fields of the reading, in a fixed order.
    SubElement(reading_example, 'ReaderID').text = kid
    SubElement(reading_example, 'PassageID').text = passage
    SubElement(reading_example, 'PassageType').text = difficulty

    evaluation_list = SubElement(reading_example, 'EvaluationList')
    for evaluator, obs, sug in eval_obs_sugs:
        evaluation = SubElement(evaluation_list, 'Evaluation')
        SubElement(evaluation, 'Evaluator').text = evaluator.lower()

        # Split the ';'-separated free text into individual items. The filter
        # tests the *stripped* text so whitespace-only segments (e.g. "a; ;b")
        # do not become empty elements.
        obs_list = [x.strip().lower() for x in obs.split(';') if x.strip()]
        sug_list = [x.strip().lower() for x in sug.split(';') if x.strip()]

        observations_list = SubElement(evaluation, 'ObservationsList')
        for o in obs_list:
            SubElement(observations_list, 'Observation').text = o

        # Suggestions belong inside <SuggestionsList>; attaching them to
        # <Evaluation> directly would leave the list element empty.
        suggestions_list = SubElement(evaluation, 'SuggestionsList')
        for s in sug_list:
            SubElement(suggestions_list, 'Suggestion').text = s
    

In [8]:
def indent(elem, level=0):
    """Pretty-print an ElementTree element in place by adjusting whitespace.

    Sets the ``text``/``tail`` of ``elem`` and all of its descendants so the
    serialized XML has one element per line, indented two spaces per nesting
    level. Existing non-whitespace ``text``/``tail`` content is left alone.

    Args:
        elem: The element to indent (modified in place).
        level: Nesting depth of ``elem``; 0 for the root.

    Returns:
        The same ``elem`` object, for call chaining.
    """
    pad = "\n" + level * "  "
    if len(elem):
        # Whitespace before the first child: one level deeper than elem.
        if not elem.text or not elem.text.strip():
            elem.text = pad + "  "
        # Provisional tail at elem's own level; the parent call overrides it
        # if elem turns out to be the parent's last child.
        if not elem.tail or not elem.tail.strip():
            elem.tail = pad
        for child in elem:
            indent(child, level + 1)
        # Only the LAST child dedents (its tail precedes elem's closing tag);
        # earlier children keep a tail at the children's own level so the next
        # sibling lines up with them.
        if not child.tail or not child.tail.strip():
            child.tail = pad
    else:
        # Leaf: the tail positions whatever follows at this element's level.
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = pad
    return elem


In [9]:
# Sanity check: number of flattened examples built from the DataFrame rows.
len(train_examples)

222

In [10]:
# Wrap the root element in an ElementTree so it can be serialized to a file.
# The actual write is left disabled; uncomment the line below to export.
out = ElementTree.ElementTree(root)
# out.write('FINAL_OBS_AND_SUGS.xml')

In [11]:
# Build a JSON-serialisable list with one dict per reading:
#   {'ReaderID', 'PassageID', 'PassageType', 'Evaluations': [
#       {'Evaluator', 'Observations': [...], 'Suggestions': [...]}, ...]}
all_train_ex_json_format = []

for reading_id, eval_obs_sugs in reading_id_to_obs_and_sug.items():
    kid, passage, difficulty = reading_id

    evaluations = []
    for evaluator, observations, suggestions in eval_obs_sugs:
        # Split the ';'-separated free text into items. The filter tests the
        # *stripped* text so whitespace-only segments (e.g. "a; ;b") are
        # dropped instead of becoming empty strings.
        obs_list = [x.strip().lower() for x in observations.split(';') if x.strip()]
        sug_list = [x.strip().lower() for x in suggestions.split(';') if x.strip()]
        evaluations.append({
            'Evaluator': evaluator.lower(),
            'Observations': obs_list,
            'Suggestions': sug_list,
        })

    all_train_ex_json_format.append({
        'ReaderID': kid,
        'PassageID': passage,
        'PassageType': difficulty,
        'Evaluations': evaluations,
    })

In [12]:
import json
# with open('FINAL_OBS_AND_SUG.json', 'x') as f:
#     json.dump(all_train_ex_json_format, f, indent=2)