### After running the Scrapy scraper, the data needs to be processed. This wrangler is structured to handle the scraper's output file for the Examination type.

In [None]:
import json
import pandas as pd

#### PSLE

In [None]:
# Load the PSLE scraper export. The file is JSON Lines: one scraped item per line.
l = []

# The `with` block closes the file on exit, so no explicit close() is needed.
# JSON is read as UTF-8 explicitly rather than relying on the platform default.
with open('./scraper/export/psle.jl', 'r', encoding='utf-8') as psle_file:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    l.extend(json.loads(line) for line in psle_file if line.strip())

In [None]:
# Every scraped item follows the Scrapy Item pipeline structure. For each one, pair its
# first table with the region, district and school it belongs to.
list_of_dfs_with_metadata = [
    (record['tables'][0], record['region'], record['district'], record['school'])
    for record in l
]

# The tables on their own, in the same order as the tuples above.
list_of_dfs = [table for table, _region, _district, _school in list_of_dfs_with_metadata]

In [None]:
# Write region/district/school into each table as new columns.
#
# Failures are rare but expected, usually from a pipeline problem. The known case is a
# school whose scores were all invalidated (*W marker): the scrape itself succeeded, but
# the process does not account for returning an empty school dataframe. The HTML for such
# a school is still available, but since no other *W rows are saved, the school is left
# out for consistency.
#
# The position of every item that fails is recorded so it can be reviewed and excluded.
omit_error_indices = []
for num, item in enumerate(list_of_dfs_with_metadata):
    table, region, district, school = item
    try:
        table['region'] = region
        table['district'] = district
        table['school'] = school
    except Exception as e:
        # Report the position, the error and the offending item, then remember the position.
        print(num, e, item)
        omit_error_indices.append(num)

In [None]:
# Manual check: every omitted item should be a schoolwide withdrawal (or another
# asterisked marker). Expected output for each one: ['SUBJECTS', '*W'].
# Anything else means that school's scrape needs a closer look.
for omit in omit_error_indices:
    first_table = pd.read_html(list_of_dfs[omit])[0]
    print(first_table[3].unique())

In [None]:
# Run this cell only after confirming above that the omitted items are all-null schools.
# It saves their original scraped records to a JSON file.
list_of_excluded_all_null_schools = [l[omit] for omit in omit_error_indices]

with open('./CompleteDatasets/failed_scrapes_2019.json', 'w') as f:
    f.write(json.dumps(list_of_excluded_all_null_schools))

In [None]:
# Keep only the tables whose position was not flagged for omission.
filtered_dfs = []
for idx, df in enumerate(list_of_dfs):
    if idx in omit_error_indices:
        continue
    filtered_dfs.append(df)

# Turn each remaining table into a DataFrame and stack them into one.
all_psle = pd.concat([pd.DataFrame(table) for table in filtered_dfs])
all_psle.shape

In [None]:
# Write the combined PSLE table to CSV, leaving out the DataFrame index.
all_psle.to_csv('./CompleteDatasets/necta_psle_2019.csv', index=False)

#### ACSEE
Produces three CSVs: student-level results (includes private and public/national centers), exam center performance (public/national centers), exam center subject level performance (public/national centers).

In [None]:
# Load the CSEE scraper export. The file is JSON Lines: one scraped item per line.
l = []

# The `with` block closes the file on exit, so no explicit close() is needed.
# JSON is read as UTF-8 explicitly rather than relying on the platform default.
with open('./scraper/export/csee.jl', 'r', encoding='utf-8') as csee_file:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    l.extend(json.loads(line) for line in csee_file if line.strip())

In [None]:
# Quick QA -- check that the number of items scraped matches the number of links
# in the school table of the results index page.
from bs4 import BeautifulSoup
import requests

# The timeout stops the cell from hanging if the server stalls. raise_for_status()
# reports an HTTP error directly instead of letting an error page be parsed.
response = requests.get("https://onlinesys.necta.go.tz/results/2019/csee/csee.htm", timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'html.parser')

# Pick just the school table, which is the third table on this page.
# Some older acsee home pages only have one table: use soup.find_all('table')[0] there.
school_table = soup.find_all('table')[2]
# Keep only the last path segment of each link.
all_hrefs = [h['href'].split('/')[-1] for h in school_table.find_all('a')]

print('Beautiful soup found this number of links:', len(all_hrefs))
print('Scraper scraped this number of items:', len(l))

In [None]:
# Build the student-level results table from every exam center and write it to CSV.
import numpy as np  # pd.np was deprecated in pandas 1.0 and removed in 2.0

list_of_results = [center['result_table'] for center in l]

# sort=False keeps the columns in their original order rather than sorting them.
all_acsee = pd.concat([pd.DataFrame(table) for table in list_of_results], sort=False)

# The scraper uses '!' (exclamation mark) to denote an empty field/NA during processing.
# Change that back to NaN, then order the rows by exam center and candidate number.
all_acsee = all_acsee.replace({'!': np.nan}).sort_values(['exam_center', 'CNO'])

# The scraped 'F' column is renamed to its full label.
all_acsee.rename(columns={'F': 'F & HN NUTRITION'}, inplace=True)

# The empty-string column, '', comes from *W *E *R students who have no subject data.
# errors='ignore' lets the cell run even when no such column was produced.
all_acsee.drop(columns=[''], errors='ignore', inplace=True)

all_acsee.to_csv('./CompleteDatasets/necta_csee_2019.csv', index=False)

#keeping the info below for posterity, ease of access

Tanzania ina mifumo 2 ya utoaji elimu.
1. Mfumo rasmi wa utoaji elimu ambapo wanafunzi wanasoma kutoka shule ya awali, msingi, sekondari hadi chuo kikuu
2. Mfumo usio rasmi wa utoaji elimu ambapo mwanafunzi anajisomea mwenyewe kwenye vituo vya elimu na kisha anafanya mtihani na inatambulika.

Sasa unapokuja kwenye mitihani. Kituo cha mtihani yaani shule inaweza pia kuwa na kituo cha kufanya mtihani hapo sasa utaona tofauti ya namba.
Wanafunzi wa shule wana namba zinazoanza na S.(means School candidate) na wale wasio rasmi wanaanza na P(means Private)
Hivyo sio shule zote zina P.
Ukiona kuna P hapo kuna wanafunzi ambao sio wanafunzi wa shule wanafanya mtihani

* S: Results suspended pending clarification of observed anomalies either in candidates' entry details, involvement in cases of irregularities or misconduct in the examination. Results suspended due to centers' or schools' failing to meet registration requirements (i.e. centers with fewer than 35 candidates).

* E: Results withheld, pending proof of candidates' payment of requisite Examination fees.

* I: INCOMPLETE Results due to candidates' missing Continuous Assessment (CA) scores in all subjects offered.

I: Incomplete results due to candidates' missing Continuous Assessment (CA) scores in one or more subjects offered but not all.

* W: Results withheld/nullified or canceled due to proven candidate's involvement in cases of dishonesty or irregularities before, during or after the examinations.

* T: Results suspended due to candidates' attempting one or more subjects not registered for (pirate candidate).

ABS: Candidate missed to take the Exam.

FLD: Candidate failed the Exam.

X: Candidate did not appear to take the exam for the particular registered subject.

In [None]:
# Center-level rankings: stack every center's rankings table, then reshape so there is
# one row per exam_center and one column per category, holding the rank.
list_of_rankings = [center['rankings_table'] for center in l]
stacked_rankings = pd.concat(
    [pd.DataFrame(table) for table in list_of_rankings],
    sort=False,  # sort=False better for column order
)
all_acsee_rankings = stacked_rankings.pivot(
    index='exam_center', columns='category', values='rank'
).reset_index()

# Center-level division performance: stack every center's table.
list_of_div_perform = [center['div_performance_table'] for center in l]
all_acsee_div_perform = pd.concat(
    [pd.DataFrame(table) for table in list_of_div_perform],
    sort=False,  # sort=False better for column order
)

# Combine the two on exam_center. The outer join keeps centers found in only one table.
acsee_center_outcomes = pd.merge(
    all_acsee_rankings,
    all_acsee_div_perform,
    how='outer',
    on='exam_center',
)

acsee_center_outcomes.to_csv('./CompleteDatasets/necta_csee_2019_center_performance.csv', index=False)

In [None]:
# Subject-level performance per exam center: stack every center's table and write it out.
list_of_subj_perform = [center['subject_performance_table'] for center in l]

all_acsee_subj_perform = pd.concat(
    [pd.DataFrame(table) for table in list_of_subj_perform],
    sort=False,  # sort=False better for column order
)

all_acsee_subj_perform.to_csv('./CompleteDatasets/necta_csee_2019_subject_performance.csv', index=False)
