In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup 
import requests
import time

In [None]:
# Download the 2017 ACSEE index page and parse it with the lxml parser.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the real index.
response = requests.get("https://www.necta.go.tz/results/2017/acsee/index.htm", timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, "lxml")

In [None]:
# Build the absolute URL of every school's results page.
# The links are read from the third <table> of the index page. href=True keeps
# only anchors that actually carry an href attribute; an anchor without one
# would otherwise raise KeyError on anchor["href"].
schoolUrls2017 = [
    "https://www.necta.go.tz/results/2017/acsee/" + anchor["href"]
    for anchor in soup.find_all("table")[2].find_all("a", href=True)
]

In [None]:
# Sanity check: print the first three URLs, then show how many were collected.
print(schoolUrls2017[0:3])
len(schoolUrls2017)

In [None]:
#Some helper functions

def list_subjects(cell):
    """Split one "DETAILED SUBJECTS" cell into a list of per-subject strings.

    The cell text is split wherever a single quote is followed by whitespace,
    and every remaining single quote is stripped from the pieces. The order of
    the subjects in the cell is preserved.

    Parameters
    ----------
    cell : str
        Raw cell text. Any non-string value (for example the NaN that pandas
        uses for an empty cell) yields an empty list instead of raising
        TypeError inside ``re.split``.

    Returns
    -------
    list of str
        One entry per subject, with the quotes around the grade removed.
    """
    if not isinstance(cell, str):
        return []
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return [piece.replace("'", "") for piece in re.split(r"'\s", cell)]

def pull_subject(subj):
    """Return the leading run of ASCII letters and '/' characters of ``subj``.

    The pattern can match zero characters, so a string that starts with any
    other character yields an empty string rather than an error.
    """
    leading = re.search(r'(^[A-Za-z\/]*)', subj)
    return leading.group(0)

def make_subject_columns(df):
    """Spread each row's subject list into one column per subject, in place.

    For every row, each entry of the list held in the fifth column is written
    into the column named by ``pull_subject(entry)``; columns that do not exist
    yet are created by the assignment. The same frame is returned.

    Loosely based on:
    https://stackoverflow.com/questions/44663903/pandas-split-column-of-lists-of-unequal-length-into-multiple-columns
    """
    for record in df.itertuples():
        # itertuples() puts the index label at position 0, so position 5 is
        # the fifth column of the frame.
        label = record[0]
        entries = record[5]
        for entry in entries:
            target_column = pull_subject(entry)
            df.loc[label, target_column] = entry
    return df

def clean_grades(df):
    """Reduce every subject column to a single grade character, in place.

    Steps, all applied to ``df`` itself (the caller relies on the mutation):

    1. Every missing value in the frame is replaced by "X".
    2. Each column from position 5 onwards is replaced by the last character
       of its values ("X" stays "X").
    3. The "DETAILED SUBJECTS" column is removed.

    Returns the same frame for convenience.
    """
    df.fillna("X", inplace=True)
    # Series.map per column replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; the element-wise result is the same.
    for column in df.columns[5:]:
        df[column] = df[column].map(lambda value: value[-1])
    del df["DETAILED SUBJECTS"]
    return df

# Scrape every school's results table, reshape it to one column per subject,
# then concatenate all schools into a single dataframe.
SUCCESS_PAUSE_SECONDS = 6    # wait after each successfully processed school
FAILURE_PAUSE_SECONDS = 60   # longer wait after any failure before moving on

all_acsee = []
# enumerate() gives the true position of each URL; looking it up afterwards with
# list.index() is a linear scan and reports the wrong position for duplicates.
for position, school_url in enumerate(schoolUrls2017):
    try:
        alevel = pd.read_html(school_url, header=0)[0]

        # Turn each "DETAILED SUBJECTS" cell into a list of subject/grade strings.
        # Plain column assignment replaces the column outright, so it does not
        # depend on the existing column dtype being able to hold list values.
        alevel["DETAILED SUBJECTS"] = alevel["DETAILED SUBJECTS"].apply(list_subjects)

        # Split the grades and make columns for them (mutates alevel in place)
        make_subject_columns(alevel)

        # Tidy up grade data (mutates alevel in place)
        clean_grades(alevel)

        # Add this school's table to the list of all_acsee schools
        all_acsee.append(alevel)
        time.sleep(SUCCESS_PAUSE_SECONDS)
    except Exception as e:
        # Best effort: report the failing school, back off, and carry on with the next one.
        print((position, school_url, e))
        time.sleep(FAILURE_PAUSE_SECONDS)

all_acsee_df = pd.concat(all_acsee, axis=0, ignore_index=True)
all_acsee_df.head()

# Notes for tidying up all_acsee_df:
# - Column 'F' is really 'F & HN NUTRITION' (pull_subject only keeps leading letters and '/'):
#   all_acsee_df.rename(columns={'F': 'F & HN NUTRITION'}, inplace=True)
# - The empty-string column '' comes from *W / *E / *R students who have no subject data:
#   del all_acsee_df['']
# - Re-order the columns by calling .columns and copy-pasting into a subselect in the preferred order.

In [None]:
# Final column order for the exported dataset.
ordered_columns = ['CNO', 'SEX', 'AGGT', 'DIV', 'ACCOUNTANCY', 'ADV/MATHS', 'AGRICULTURE', 'ARABIC', 'BAM',
       'BIOLOGY', 'CHEMISTRY', 'COMMERCE', 'COMP/SCIENCE',
       'DIVINITY', 'ECONOMICS', 'ENGLISH', 'F & HN NUTRITION', 'FRENCH', 'G/STUDIES', 'GEOGR',
       'HISTORY', 'IS/KNOWLEDGE', 'KISWAHILI', 'PHYSICS']

# rename() ignores a missing 'F' column and drop(errors='ignore') ignores a
# missing '' column, so this cell can be re-run safely; `del all_acsee_df['']`
# raised KeyError once the column was already gone.
# Selecting ordered_columns still raises KeyError if an expected subject is missing.
all_acsee_df = (
    all_acsee_df
    .rename(columns={'F': 'F & HN NUTRITION'})
    .drop(columns=[''], errors='ignore')
    .loc[:, ordered_columns]
)

all_acsee_df.to_csv("./CompleteDatasets/necta_acsee_2017.csv", index=False)

In [None]:
# Number of columns in the final dataset
all_acsee_df.shape[1]

In [None]:
# Make long form: one row per (candidate, subject) pair, sorted by candidate number.
# Uses all_acsee_df; the name `alvl` is not assigned in any of the preceding
# cells, so this cell raised NameError on a fresh kernel.
all_acsee_df.melt(id_vars=["CNO", "SEX", "AGGT", "DIV"], var_name="Subjects", value_name="Grade").sort_values("CNO")

# Grabbing Metadata

In [None]:
import pickle
import time

acsee = pd.read_csv('CompleteDatasets/necta_acsee_2018.csv')
# The centre code is the part of each candidate number before the '/'.
center_codes = acsee['CNO'].apply(lambda x: x.split('/')[0]).unique().tolist()
centers = dict.fromkeys(center_codes)

# Only the values of existing keys are reassigned, so iterating the dict while
# filling it in is safe.
for code in centers:
    entry = {'url': 'https://www.necta.go.tz/results/2018/acsee/results/'+code.lower()+'.htm'}
    centers[code] = entry
    html_tables = pd.read_html(entry['url']) #p's don't have meta-tables
    try:
        rankings, div_perform, subj_perform = html_tables[2], html_tables[4], html_tables[6]
    except IndexError:
        # The page has fewer tables than expected: record all three as missing.
        rankings = div_perform = subj_perform = None
    entry['rankings'] = rankings
    entry['div_perform'] = div_perform
    entry['subj_perform'] = subj_perform
    print(entry)
    time.sleep(3)

# Context managers close the files deterministically; a bare open() passed to
# pickle.dump leaves the handle open and the data possibly unflushed.
with open('CompleteDatasets/centers_meta_2018.pkl', 'wb') as pickle_out:
    pickle.dump(centers, pickle_out)

# Read the file back as a round-trip check.
# NOTE: pickle.load can execute arbitrary code, so only load pickles you created yourself.
with open("CompleteDatasets/centers_meta_2018.pkl", "rb") as pickle_in:
    centers_reloaded = pickle.load(pickle_in)
centers_reloaded

Tanzania ina mifumo 2 ya utoaji elimu.
1. Mfumo rasmi wa utoaji elimu ambapo wanafunzi wanasoma kutoka shule ya awali, msingi, sekondari hadi chuo kikuu.
2. Mfumo usio rasmi wa utoaji elimu ambapo mwanafunzi anajisomea mwenyewe kwenye vituo vya elimu na kisha anafanya mtihani na inatambulika.

Sasa unapokuja kwenye mitihani. Kituo cha mtihani yaani shule inaweza pia kuwa na kituo cha kufanya mtihani hapo sasa utaona tofauti ya namba.
Wanafunzi wa shule wana namba zinazoanza na S (means School candidate) na wale wasio rasmi wanaanza na P (means Private).
Hivyo sio shule zote zina P.
Ukiona kuna P hapo kuna wanafunzi ambao sio wanafunzi wa shule wanafanya mtihani

* S: Results suspended pending clarification of observed anomalies either in candidates' entry details, involvement in cases of irregularities or misconduct in the examination. Results suspended due to centers or schools' failing to meet registration requirements (i.e. centers with less than 35 candidates).

* E: Results withheld, pending proof of candidates' payment of requisite Examination fees.

* I: INCOMPLETE Results due to candidates' missing Continuous Assessment (CA) scores in all subjects offered.

I: Incomplete results due to candidates' missing Continuous Assessment (CA) scores in one or more subjects offered but not all.

* W: Results withheld/nullified or canceled due to proven candidate's involvement in cases of dishonesty or irregularities before, during or after the examinations.

* T: Results suspended due to candidates' attempting one or more subjects not registered for (pirate candidate).

ABS: Candidate was absent and did not sit for the exam.

FLD: Candidate failed the Exam.

X: Candidate did not appear to take the exam for the particular registered subject.