In this notebook, we try to reproduce the numbers reported in an article published by The Citizen:<br>
https://www.thecitizen.co.tz/magazine/success/-Understanding-Form-Six-regional-results-puzzle/1843788-4690616-rcjhhd/index.html

In [144]:
import pandas as pd

In [145]:
# Read the NECTA ACSEE 2018 results table and preview its first rows.
ACSEE_CSV = 'CompleteDatasets/necta_acsee_2018.csv'
acsee = pd.read_csv(ACSEE_CSV)
acsee.head()

Unnamed: 0,CNO,SEX,AGGT,DIV,ACCOUNTANCY,ADV/MATHS,AGRICULTURE,ARABIC,BAM,BIOLOGY,...,ECONOMICS,ENGLISH,F & HN NUTRITION,FRENCH,G/STUDIES,GEOGR,HISTORY,IS/KNOWLEDGE,KISWAHILI,PHYSICS
0,P0101/0501,F,12,II,X,X,X,X,X,X,...,X,D,X,X,F,X,D,X,D,X
1,P0101/0502,F,17,III,X,F,X,X,X,X,...,E,X,X,X,S,E,X,X,X,X
2,P0101/0503,F,11,II,X,X,X,X,E,X,...,E,X,X,X,E,C,C,X,X,X
3,P0101/0504,F,16,III,X,X,X,X,X,X,...,X,E,X,X,F,X,S,X,E,X
4,P0101/0505,F,20,0,X,X,X,X,X,X,...,X,F,X,X,F,S,F,X,X,X


In [146]:
# Claim: 55% scored Division I or II (the article overestimates; see the result below).
divs = ['I', 'II', 'III', 'IV', '0']   # every division label that counts as a graded result
divs_pass = ['I', 'II']                # the two divisions the claim is about

# Keep only the rows whose DIV is one of the recognised division labels.
with_div = acsee[acsee['DIV'].isin(divs)]

# Share of graded candidates who obtained Division I or II.
top_two_mask = with_div['DIV'].isin(divs_pass)
int(top_two_mask.sum()) / len(with_div)

0.5048029957668512

Claim: 7 in 10 Mtwara students received Division I or II.  
To check this, we need to know which region each examination center is located in.

In [None]:
import pickle
import time
from urllib.error import URLError

acsee = pd.read_csv('CompleteDatasets/necta_acsee_2018.csv')

# Center ids are the part of each CNO before the '/'; build one metadata entry per distinct center.
center_ids = acsee['CNO'].apply(lambda x: x.split('/')[0]).unique().tolist()
centers = {}

for center_id in center_ids:
    url = 'https://www.necta.go.tz/results/2018/acsee/results/' + center_id.lower() + '.htm'
    entry = {'url': url, 'rankings': None, 'div_perform': None, 'subj_perform': None}
    try:
        html_tables = pd.read_html(url)
        # Pick all three tables before storing any, so an entry is never left half-filled.
        meta_tables = (html_tables[2], html_tables[4], html_tables[6])
    except IndexError:
        # The page has too few tables (p's don't have meta-tables): keep the None placeholders.
        status = 'no meta-tables'
    except (ValueError, URLError) as exc:
        # read_html raises ValueError when a page contains no tables at all; URLError covers
        # HTTP and network failures. Record the center with None placeholders and carry on
        # instead of aborting the whole scrape and losing what was already collected.
        status = 'fetch failed ({})'.format(exc)
    else:
        entry['rankings'], entry['div_perform'], entry['subj_perform'] = meta_tables
        status = 'ok'
    centers[center_id] = entry
    # One short progress line per center rather than dumping the scraped tables.
    print(center_id, status)
    time.sleep(3)  # pause between requests

#There's quite a bit of other meta-data that I'd want to revisit in the future without wanting to scrape again.
# The with-block guarantees the pickle file is flushed and closed.
with open('CompleteDatasets/centers_meta_2018.pkl', 'wb') as meta_file:
    pickle.dump(centers, meta_file)

In [147]:
# Derive the testing-center identifier for every row: the part of the CNO before the '/'.
acsee['centers'] = acsee['CNO'].map(lambda cno: cno.partition('/')[0])
acsee[['CNO', 'centers']]

Unnamed: 0,CNO,centers
0,P0101/0501,P0101
1,P0101/0502,P0101
2,P0101/0503,P0101
3,P0101/0504,P0101
4,P0101/0505,P0101
5,P0101/0506,P0101
6,P0101/0507,P0101
7,P0101/0508,P0101
8,P0101/0509,P0101
9,P0101/0510,P0101


In [148]:
import pickle  # imported here too, so this cell runs even when the scraping cell is skipped

import numpy as np

# Load the centers metadata that was pickled after scraping, so the region lookup
# defined below can be used without scraping again.
# NOTE: pickle.load can execute arbitrary code; only load a file this notebook produced.
with open("CompleteDatasets/centers_meta_2018.pkl", "rb") as meta_file:
    centers = pickle.load(meta_file)
def get_region(center, centers_meta):
    """Return the region stored for a center, or NaN when none is available.

    Parameters
    ----------
    center : hashable
        Center identifier, used as a key into ``centers_meta``.
    centers_meta : dict
        Maps center identifiers to dicts whose ``'rankings'`` entry is either
        a DataFrame or ``None``.

    Returns
    -------
    object
        The cell in the first row, second column of the center's rankings
        table. ``np.nan`` is returned when the center is missing from
        ``centers_meta``, when it has no rankings table, or when the table is
        too small to contain that cell.
    """
    rankings = (centers_meta.get(center) or {}).get('rankings')
    # Explicit checks instead of catching AttributeError, which could also hide unrelated bugs.
    if rankings is None:
        return np.nan
    if rankings.shape[0] < 1 or rankings.shape[1] < 2:
        return np.nan
    # Positional lookup: does not depend on how the table's rows and columns are labelled.
    return rankings.iloc[0, 1]

# All rows of a center share one region, so look each distinct center up once and
# map the result onto the rows instead of calling get_region for every row.
region_by_center = {center: get_region(center, centers) for center in acsee['centers'].unique()}
acsee['region'] = acsee['centers'].map(region_by_center)
acsee['region']

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7           NaN
8           NaN
9           NaN
10          NaN
11          NaN
12          NaN
13          NaN
14          NaN
15          NaN
16          NaN
17          NaN
18          NaN
19          NaN
20          NaN
21          NaN
22          NaN
23          NaN
24          NaN
25          NaN
26          NaN
27          NaN
28          NaN
29          NaN
          ...  
87529    KIGOMA
87530    KIGOMA
87531    KIGOMA
87532    KIGOMA
87533    KIGOMA
87534    KIGOMA
87535    KIGOMA
87536    KIGOMA
87537    KIGOMA
87538    KIGOMA
87539    KIGOMA
87540    KIGOMA
87541    KIGOMA
87542    KIGOMA
87543    KIGOMA
87544    KIGOMA
87545    KIGOMA
87546    KIGOMA
87547    KIGOMA
87548    KIGOMA
87549    KIGOMA
87550    KIGOMA
87551    KIGOMA
87552    KIGOMA
87553    KIGOMA
87554    KIGOMA
87555    KIGOMA
87556    KIGOMA
87557    KIGOMA
87558    KIGOMA
Name: region, Length: 87

In [149]:
#P center students are 'private' students, meaning they weren't formally educated within the system.
#We could try to include them, but they don't come with regional metadata.
#For this first set of calculations, let's not consider them when discussing performances of the region

from operator import itemgetter

# Claim: 7/10 of Mtwara students received Div I and Div II
# Follow-up claims: Mtwara leads in the proportion of Div I and II; there is no Division 0
# in Mtwara; Mtwara/Lindi/Geita are the top 3.
# Verdict: All True/Reproduced

# Row-level masks that do not depend on the region are computed once, outside the loop.
has_division = acsee['DIV'].isin(divs)
has_top_division = acsee['DIV'].isin(divs_pass)

# Regions present in the data, skipping rows that have no region.
regs = [name for name in acsee['region'].unique().tolist() if pd.notnull(name)]
regs_proportions = {}

for reg in regs:
    in_region = acsee['region'] == reg
    graded_count = int((in_region & has_division).sum())
    top_count = int((in_region & has_top_division).sum())
    # Percentage of the region's graded rows that are Div I or II, to two decimals.
    regs_proportions[reg] = round(top_count / graded_count * 100, 2)

# Ascending by percentage, so the leading regions appear last.
sorted(regs_proportions.items(), key=itemgetter(1))

[('KUSINI PEMBA', 32.6),
 ('MJINI MAGHARIBI', 35.25),
 ('KASKAZINI PEMBA', 35.9),
 ('DAR ES SALAAM', 46.35),
 ('MARA', 49.3),
 ('PWANI', 50.26),
 ('KIGOMA', 51.96),
 ('MWANZA', 52.18),
 ('SHINYANGA', 53.12),
 ('KATAVI', 53.4),
 ('ARUSHA', 54.08),
 ('KAGERA', 55.05),
 ('IRINGA', 55.22),
 ('SONGWE', 55.55),
 ('MBEYA', 55.8),
 ('RUVUMA', 56.53),
 ('NJOMBE', 57.07),
 ('SIMIYU', 57.45),
 ('DODOMA', 57.48),
 ('KILIMANJARO', 58.55),
 ('RUKWA', 58.71),
 ('TABORA', 59.21),
 ('TANGA', 60.33),
 ('MANYARA', 60.6),
 ('SINGIDA', 60.91),
 ('MOROGORO', 61.67),
 ('GEITA', 63.59),
 ('LINDI', 64.91),
 ('MTWARA', 68.05)]

In [150]:
#Division Breakdowns nationally - Claim (my result):
#Div I: 10% (9.5%)
#Div II: 44.4% (overestimate 40.9%)
#Div III: 40.2% (underestimate 42%)
#Div IV and 0: 4.5% (underestimate 7%)

# Only rows whose DIV is one of the recognised division labels enter the breakdown.
graded_divs = acsee.loc[acsee['DIV'].isin(divs), 'DIV']
# sort_values() accepts keyword arguments only in pandas 2.x, so the positional axis
# argument is dropped; the default already sorts the shares in ascending order.
(graded_divs.value_counts() / len(graded_divs)).sort_values()

0      0.024213
IV     0.051042
I      0.095665
II     0.409138
III    0.419942
Name: DIV, dtype: float64

In [151]:
# Claim: 2 of 10 students who scored Div 0 were from Dar
# Regions of the Div 0 rows; rows without a region are left out of both the counts and the total.
div_zero_regions = acsee.loc[acsee['DIV'] == '0', 'region']
div_zero_regions.value_counts() / int(div_zero_regions.notna().sum())

DAR ES SALAAM      0.203835
KILIMANJARO        0.072654
MWANZA             0.066599
PWANI              0.064581
MBEYA              0.064581
MARA               0.045409
TABORA             0.044400
IRINGA             0.043391
DODOMA             0.038345
MOROGORO           0.036327
TANGA              0.036327
KAGERA             0.035318
SHINYANGA          0.027245
ARUSHA             0.027245
KIGOMA             0.027245
GEITA              0.027245
RUVUMA             0.025227
NJOMBE             0.023209
RUKWA              0.016145
MJINI MAGHARIBI    0.015136
SIMIYU             0.014127
KUSINI PEMBA       0.012109
MANYARA            0.008073
SONGWE             0.008073
KASKAZINI PEMBA    0.007064
SINGIDA            0.006054
LINDI              0.003027
KATAVI             0.001009
Name: region, dtype: float64