### After running the Scrapy scraper, the data needs to be processed. This wrangler is structured to handle the scraper's output file for the Examination type.

In [None]:
import json
import pandas as pd

#### PSLE

In [None]:
#Load the scraper's JSON Lines export: each non-blank line is parsed as one JSON document.
#Blank lines are skipped so a stray empty line in the export does not abort the whole load.
#The `with` block closes the file on exit, so no explicit close() call is needed.
#The name `l` is kept because the following cells read it.
with open('./scraper/export/psle.jl', 'r') as jl_file:
    l = [json.loads(line) for line in jl_file if line.strip()]

In [None]:
#Turn every parsed JSON line into a (first table, region, district, school) tuple.
#The keys read here follow the Item pipeline structure emitted by the Scrapy scraper.
list_of_dfs_with_metadata = [
    (record['tables'][0], record['region'], record['district'], record['school'])
    for record in l
]

#Keep only the table (first tuple element) of each entry
list_of_dfs = [table for table, *_ in list_of_dfs_with_metadata]

In [None]:
#Write region/district/school onto each table as new columns, remembering the position of
#every entry for which that fails. Failures are rare but expected when the pipeline did not
#yield a usable table. The known case so far is a school whose scores were all invalidated
#(*W marker): the scrape itself succeeded, but the process does not account for an empty
#school dataframe. The HTML for such a school is still available; since no other *W rows are
#saved, leaving the school out keeps the dataset consistent.
omit_error_indices = []
metadata_columns = ('region', 'district', 'school')
for position, entry in enumerate(list_of_dfs_with_metadata):
    table = entry[0]
    try:
        #entry[1:] holds the region, district and school values, in that order
        for column, value in zip(metadata_columns, entry[1:]):
            table[column] = value
    except Exception as err:
        #Report the failing entry and record its position so it can be dropped later
        print(position, err, entry)
        omit_error_indices.append(position)

In [None]:
#Show the positions of the entries whose metadata could not be attached (notebook cell output)
omit_error_indices

In [None]:
#Drop every entry whose metadata attachment failed, then stack the remaining tables.
#The kept tables are rebuilt from list_of_dfs_with_metadata (which is never mutated) rather
#than popped out of list_of_dfs one index at a time: each pop shifts the positions after it,
#so with more than one failure the later pops would remove the wrong entries.
#Rebuilding also makes this cell safe to run more than once.
indices_to_omit = set(omit_error_indices)
list_of_dfs = [
    entry[0]
    for position, entry in enumerate(list_of_dfs_with_metadata)
    if position not in indices_to_omit
]

#Build one DataFrame per remaining table and concatenate them into a single frame
all_psle = pd.concat([pd.DataFrame(table) for table in list_of_dfs])

In [None]:
#Write the combined PSLE frame to CSV; index=False leaves the DataFrame index out of the file
psle_csv_path = './CompleteDatasets/necta_psle_2018.csv'
all_psle.to_csv(psle_csv_path, index=False)