In [1]:
import pandas as pd
import geopandas as gpd
import geohash
from geopy.distance import great_circle
import re
import numpy as np
import shapefile
import scipy.sparse

In [2]:
# shape file data obtained from https://datacatalog.worldbank.org/dataset/kenya-schools
# the dataset includes primary and secondary schools, combined from two different data sources
# note that the primary school data has ward and sub-county info, while the secondary school data has only county and lat/long
#!wget https://energydata.info/dataset/2fda191d-c3c6-4002-8c82-daa02008a9e3/resource/849830e2-fcb5-4b42-8d33-e42c7c1e90b4/download/schools.zip
#!unzip -n schools.zip   

In [3]:
# Load the schools shapefile, giving the columns we use friendlier names and
# dropping the OBJECTID and CODE columns.
gdf_schools = (
    gpd.read_file('Schools/Schools.shp')
    .rename(columns={'SCHOOL_NAM': 'name', 'X_Coord': 'long', 'Y_Coord': 'lat'})
    .drop(columns=['OBJECTID', 'CODE'])
)
# expose the row index as an explicit `id` column, for convenience
gdf_schools['id'] = gdf_schools.index
gdf_schools

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id
0,BAKWANIN,Primary,Public,Baringo,BARINGO CENTRAL,KABASIS,Baringo Central,Sacho,35.797080,0.409550,"Ministry of Education, 2016",POINT (143417.238 10045338.886),0
1,BEKIBON,Primary,Public,Baringo,BARINGO CENTRAL,TENGES,Baringo South,Marigat,35.884060,0.336400,"Ministry of Education, 2016",POINT (153107.652 10037237.735),1
2,BOKORIN,Primary,Public,Baringo,BARINGO CENTRAL,KABARNET,Baringo Central,Kapropita,35.771770,0.532180,"Ministry of Education, 2016",POINT (140602.763 10058916.014),2
3,BOROWONIN,Primary,Public,Baringo,BARINGO CENTRAL,KABARNET,Baringo Central,Kapropita,35.778640,0.444870,"Ministry of Education, 2016",POINT (141363.771 10049249.854),3
4,BOSIN,Primary,Public,Baringo,BARINGO CENTRAL,KABASIS,Baringo Central,Sacho,35.795450,0.438090,"Ministry of Education, 2016",POINT (143236.887 10048498.462),4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37925,SALVATION ARMY NAWOITORONG SEC SCH,Secondary,,TURKANA,,,,KANAMKEMER,35.620267,3.113988,"Open Data Kenya, School 2007",POINT (124253.791 10344795.181),37925
37926,MOYALE ODDA MILITARY CAMP SEC,Secondary,,MARSABIT,,,,BUTIYE,39.090567,3.463152,"Open Data Kenya, School 2007",POINT (510059.563 10382787.508),37926
37927,OBBU SEC,Secondary,,MARSABIT,,,,SOLOLO,38.644848,3.546583,"Open Data Kenya, School 2007",POINT (460555.403 10392016.569),37927
37928,NAPATA REFUGEE SEC SCH,Secondary,,TURKANA,,,,LOPUR,34.835844,3.739390,"Open Data Kenya, School 2007",POINT (37206.754 10414419.116),37928


In [4]:
# Reverse index for the schools: (name, lat, long) -> row id in the dataset.
# When several rows share the same key, the last one wins.
schools_map = dict(zip(
    zip(gdf_schools['name'], gdf_schools['lat'], gdf_schools['long']),
    gdf_schools['id'],
))

In [5]:
# (name, lat, long) combinations that occur on more than one row
dups = (
    gdf_schools
    .groupby(['name', 'lat', 'long'])
    .size()
    .to_frame('count')
    .query('count > 1')
)

In [6]:
# Show the full rows behind every duplicated (name, lat, long) key
gdf_schools.set_index(['name', 'lat', 'long']).loc[dups.index]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,Source,geometry,id
name,lat,long,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ALDAI GIRLS,0.083521,35.075228,Primary,Public,Nandi,NANDI SOUTH,KAPTUMO NORTH,Aldai,Kaptumo/ Kaboi,"Ministry of Education, 2016",POINT (62927.351 10009253.428),16181
ALDAI GIRLS,0.083521,35.075228,Secondary,Public,Nandi,NANDI SOUTH,KAPTUMO NORTH,Aldai,Kaptumo/ Kaboi,"Ministry of Education, 2016",POINT (62927.351 10009253.428),24183
ALUOR GIRLS,-0.010453,34.466090,Primary,Public,Siaya,GEM,KAMBARE,Gem,South Gem,"Ministry of Education, 2016",POINT (-5041.530 9998840.945),18396
ALUOR GIRLS,-0.010453,34.466090,Secondary,Public,Siaya,GEM,KAMBARE,Gem,South Gem,"Ministry of Education, 2016",POINT (-5041.530 9998840.945),24743
AMABUKO,-0.766851,34.930703,Primary,Public,Kisii,MASABA SOUTH,KEROKA,Nyaribari Masaba,Ichuni,"Ministry of Education, 2016",POINT (46845.849 9915024.225),7766
...,...,...,...,...,...,...,...,...,...,...,...,...
VIGURUNGANI,-4.044033,39.167243,Secondary,Public,Kwale,KINANGO,KINANGO,Kinango,Puma,"Ministry of Education, 2016",POINT (518563.936 9553003.544),22382
WATUKA,-0.284747,36.765361,Primary,Public,Nyeri,KIENI WEST,ENDARASHA,Kieni,Gatarakwa,"Ministry of Education, 2016",POINT (251280.192 9968502.784),17778
WATUKA,-0.284747,36.765361,Secondary,Public,Nyeri,KIENI WEST,ENDARASHA,Kieni,Mwiyogo/Endarasha,"Ministry of Education, 2016",POINT (251280.192 9968502.784),24585
WELKIM ACADEMY,-1.265504,36.991380,Primary,,NAIROBI SOUTH,,,,RUAI,"Open Data Kenya, School 2007",POINT (276498.987 9860037.006),33599


In [7]:
# Data downloaded from the ishamba CustomerPlantVillage table.
# We are only interested in rows where the customer has entered a school name.
data = pd.read_json('data_schools.json')
data.columns = [
    'id', 'county', 'county_raw',
    'school', 'school_raw', 'school_recognized',
    'lat', 'long', 'is_complete',
]
# nullable boolean dtype, so missing values survive the conversion
data = data.astype({'school_recognized': 'boolean'})
data = data[data['school_raw'].notna()].copy()
data

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete
0,7,NANDI,Nandi,SAMOEI BOYS SECONDARY SCHOOL,Samoei boys,True,0.108394,35.169737,True
1,5,TAITA TAVETA,Taveta,,Maho secondary,False,,,True
2,4,NAROK,Narok,NAROK HIGH,Narok,True,-1.069555,35.864660,True
3,12,MOMBASA,mombasa,STAR OF THE SEA,star if the sea,True,-4.066130,39.669400,True
4,8,NAKURU,Nakuru,BAHATI PCEA GIRLS,Bahati Girls,True,-0.144056,36.169991,True
...,...,...,...,...,...,...,...,...,...
3592,3620,,kakamega,KAMASAI,kamasai,True,0.543860,34.879540,True
3593,3619,BUNGOMA,Bungoma,MALINDA SA,Malinda Sa,True,0.769722,34.501334,False
3594,3622,BUNGOMA,Bungoma,,bridge international academy,False,,,True
3595,3623,,Nakuru,ELDAMA RAVINE BOARDING PRI,Eldama Ravine day and boarding primary school,True,0.040400,35.722010,True


In [8]:
# Where the PV service got customer confirmation for county and school, map the confirmed
# (school, lat, long) back to the row id in the schools dataset; unmatched rows stay missing.
data['school_id'] = [
    schools_map.get(key)
    for key in zip(data['school'], data['lat'], data['long'])
]
data['school_id'] = data['school_id'].astype('Int64')
data

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id
0,7,NANDI,Nandi,SAMOEI BOYS SECONDARY SCHOOL,Samoei boys,True,0.108394,35.169737,True,24123
1,5,TAITA TAVETA,Taveta,,Maho secondary,False,,,True,
2,4,NAROK,Narok,NAROK HIGH,Narok,True,-1.069555,35.864660,True,24229
3,12,MOMBASA,mombasa,STAR OF THE SEA,star if the sea,True,-4.066130,39.669400,True,13744
4,8,NAKURU,Nakuru,BAHATI PCEA GIRLS,Bahati Girls,True,-0.144056,36.169991,True,23959
...,...,...,...,...,...,...,...,...,...,...
3592,3620,,kakamega,KAMASAI,kamasai,True,0.543860,34.879540,True,16066
3593,3619,BUNGOMA,Bungoma,MALINDA SA,Malinda Sa,True,0.769722,34.501334,False,1637
3594,3622,BUNGOMA,Bungoma,,bridge international academy,False,,,True,
3595,3623,,Nakuru,ELDAMA RAVINE BOARDING PRI,Eldama Ravine day and boarding primary school,True,0.040400,35.722010,True,30613


In [9]:
# Matcher based on tf-idf vectorization of character ngrams
from typing import Tuple, List, Iterable, Set
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class Matcher:
    """
    Fuzzy name matcher based on TF-IDF vectors of character n-grams.

    The `name` column of `df` is cleaned (see `clean`) and vectorized once, at construction
    time. Queries are cleaned with the same settings and scored against every row of `df`
    by the dot product of the TF-IDF vectors.
    """

    default_remove_regexp = "'"
    default_to_space_regexp = r'[^\w]+'

    def __init__(
        self,
        ngram_range: Tuple[int, int],
        df: pd.DataFrame,
        remove_regexp=default_remove_regexp,
        to_space_regexp=default_to_space_regexp,
        stop_words: Iterable[str] = None,
    ):
        """
        :param ngram_range: (min_n, max_n) range of character n-gram sizes passed to the vectorizer
        :param df: frame to search; must have a `name` column (`match_df` also reads `lat` and `long`)
        :param remove_regexp: pattern deleted from names before matching (falsy to skip)
        :param to_space_regexp: pattern replaced by a single space before matching (falsy to skip)
        :param stop_words: words dropped from names and queries before vectorizing
        """
        self.df = df
        self.remove_regexp = remove_regexp
        self.to_space_regexp = to_space_regexp
        # always a set (the empty fallback used to be a dict)
        self.stop_words = set(stop_words) if stop_words else set()
        self.vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=ngram_range)
        corpus = self._clean(df['name'])
        self.X = self.vectorizer.fit_transform(corpus)

    @staticmethod
    def clean(
            s: pd.Series,
            remove_regexp=default_remove_regexp,
            to_space_regexp=default_to_space_regexp,
            stop_words: Set[str] = None
    ) -> pd.Series:
        """
        Normalizes a series of names: deletes `remove_regexp` matches, turns `to_space_regexp`
        matches into single spaces, lower-cases, strips, and finally drops any stop words.
        Missing values are passed through as missing.
        """
        if remove_regexp:
            s = s.str.replace(remove_regexp, '', regex=True)
        if to_space_regexp:
            s = s.str.replace(to_space_regexp, ' ', regex=True)
        s = s.str.lower().str.strip()
        if stop_words:
            # na_action='ignore' keeps missing values missing, exactly as the .str steps above do,
            # instead of failing when trying to iterate over a NaN
            s = s.str.split().map(
                lambda tokens: ' '.join(t for t in tokens if t not in stop_words),
                na_action='ignore'
            )
        return s

    def _clean(self, s: pd.Series):
        """Cleans `s` with the settings this matcher was constructed with."""
        return self.clean(
            s,
            remove_regexp=self.remove_regexp,
            to_space_regexp=self.to_space_regexp,
            stop_words=self.stop_words
        )

    def _score(self, vals: pd.Series) -> np.ndarray:
        """Returns a dense (len(self.df), len(vals)) array of scores of every row against every val."""
        Y = self.vectorizer.transform(self._clean(vals))
        return self.X.dot(Y.transpose()).toarray()

    def match(self, vals: Iterable[str], n=5):
        """
        Searches for multiple school names, returning n top matches for each of them.
        The return value is a tuple of three 2-D ndarrays: row positions, scores, and names.
        Each of the arrays has a shape (n, len(vals)) (fewer rows if `self.df` has fewer than
        n rows), with each column corresponding to one of the searched-for vals, and each row
        to one of the matches returned for that val, best match first.
        Row positions are positional indexes into `self.df`, not values of any id column.
        """
        if not isinstance(vals, pd.Series):
            # explicit dtype so that an empty input still supports the .str operations in clean()
            vals = pd.Series(vals, dtype=object)
        res = self._score(vals)
        # ascending argsort per column; take the last n rows in reverse order => best first
        ind = res.argsort(axis=0)[:-(n+1):-1, :]
        return ind, np.take_along_axis(res, ind, axis=0), self.df['name'].to_numpy()[ind]

    def match_df(self, val: str, n=5, distance_from: Tuple[float, float] = None) -> pd.DataFrame:
        """
        Searches for a single school name and returns nice dataframe with top n matches.
        The distance shown is distance in km from the top match, or from the `distance_from`
        geopoint provided as (lat, long) tuple.
        The result is a copy of the matching rows of `self.df`, best match first, with `dist`
        and `score` columns added. It is empty when n is 0.
        """
        res = self._score(pd.Series([val]))[:, 0]
        # reverse first, then truncate: unlike argsort()[-n:], this yields no rows for n == 0
        ind: np.ndarray = res.argsort()[::-1][:n]
        df: pd.DataFrame = self.df.iloc[ind].copy()
        if distance_from:
            best_lat, best_long = distance_from
        elif len(df):
            best_lat, best_long = df.iloc[0][['lat', 'long']]
        else:
            # no rows => no distances are computed, so the reference point is never used
            best_lat = best_long = None
        df['dist'] = [great_circle((t.lat, t.long), (best_lat, best_long)).km for t in df.itertuples()]
        df['score'] = res[ind]
        return df


In [10]:
# figure out stop words
from collections import Counter
import itertools
names = Matcher.clean(gdf_schools.name)
# how many times each word shows up across all cleaned school names (missing names are skipped)
c = Counter(itertools.chain.from_iterable(s.split() for s in names.dropna()))
# count as stop word anything that shows up more than 100 times
# (a plain filter also copes with the case where no word is that frequent)
stop_words = {word for word, count in c.items() if count > 100}
# add few more things which are not that common, but still not useful to match on
stop_words.update(['and', 'schools'])


In [11]:
# Candidate matchers: three n-gram ranges with stop-word removal, plus one
# 2-3-gram matcher that keeps every word, for comparison.
m23sw = Matcher((2, 3), gdf_schools, stop_words=stop_words)
m234sw = Matcher((2, 4), gdf_schools, stop_words=stop_words)
m3sw = Matcher((3, 3), gdf_schools, stop_words=stop_words)
m23 = Matcher((2, 3), gdf_schools)

## Sampling data that PV was unable to match

In [12]:
# rows where school_recognized is False, i.e. the school name the customer typed was not recognized
data.query('school_recognized == False')

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id
1,5,TAITA TAVETA,Taveta,,Maho secondary,False,,,True,
5,11,SIAYA,Siaya,,ugenya high school,False,,,True,
12,6,TRANS NZOIA,Nzoia,,Kitale,False,,,True,
13,14,NAIROBI,Nairobi,,none,False,,,True,
16,13,HOMA BAY,Honda Bay,,tangatanga,False,,,True,
...,...,...,...,...,...,...,...,...,...,...
3564,3373,LAIKIPIA,laikipia,,Ngenia primary school,False,,,True,
3570,3310,MAKUENI,Makueni,,Itetani primary and other schools.,False,,,True,
3580,3606,VIHIGA,vihiga,,sabatia primary school,False,,,True,
3591,3618,NYANDARUA,NYANDARUA,,Muungano primary,False,,,True,


#### Ngenia primary school - laikipia county

In [13]:
# The second and third matches are probably the right ones (note the matching county).
# Note that the first 3 matches all have a score of 1.0.
m23sw.match_df('Ngenia primary school', n=5)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
21410,NGENIA BOYS SEC,Secondary,Public,Kiambu,LIMURU,LIMURU,Limuru,Limuru Central,36.62555,-1.12306,"Ministry of Education, 2016",POINT (235757.480 9875760.604),21410,0.0,1.0
22429,NGENIA SECONDARY,Secondary,Public,Laikipia,LAIKIPIA EAST,DAIGA,Laikipia North,Mugogodo East,37.199142,0.070608,"Ministry of Education, 2016",POINT (299576.519 10007808.188),22429,147.257368,1.0
10075,NGENIA,Primary,Public,Laikipia,LAIKIPIA EAST,DAIGA,Laikipia North,Mugogodo East,37.213385,0.109592,"Ministry of Education, 2016",POINT (301162.401 10012119.140),10075,151.85115,1.0
2585,NGENIARI,Primary,Public,Embu,EMBU EAST,RUNYENJES,Runyenjes,Kagaari South,37.570893,-0.477126,"Ministry of Education, 2016",POINT (340965.048 9947246.710),2585,127.303579,0.761774
20530,NGENGE SEC,Secondary,Public,Embu,MBEERE SOUTH,RWIKA,Mbeere South,Mbeti South,37.52514,-0.62591,"Ministry of Education, 2016",POINT (335876.491 9930795.032),20530,114.278402,0.59283


In [14]:
# The matcher without stop words doesn't do as well, but it still finds one of the good matches (#4).
m23.match_df('Ngenia primary school', n=5)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
19699,NGENY PRIMARY SCHOOL,Primary,Public,Uasin Gishu,WARENG,TULWET,Kesses,Tulwet/Chiyat,35.316187,0.852784,"Ministry of Education, 2016",POINT (89844.527 10094454.807),19699,0.0,0.799601
14797,MAKONGENI PRIMARY SCHOOL,Primary,Public,Nairobi,MAKADARA,BURU BURU,Makadara,Maringo/Hamza,36.850929,-1.299784,"Ministry of Education, 2016",POINT (260866.892 9856232.801),14797,293.956336,0.726464
35277,NYANGE PRIMARY SCHOOL,Primary,,LAIKIPIA,,,,TIGITHI,36.9881,-0.25844,"Open Data Kenya, School 2007",POINT (276081.835 9971416.875),35277,223.221161,0.67928
10075,NGENIA,Primary,Public,Laikipia,LAIKIPIA EAST,DAIGA,Laikipia North,Mugogodo East,37.213385,0.109592,"Ministry of Education, 2016",POINT (301162.401 10012119.140),10075,226.559546,0.675887
6855,KIRIMA PRIMARY SCHOOL,Primary,Public,Kirinyaga,KIRINYAGA CENTRAL,GAKOIGO,Kirinyaga Central,Kanyekini,37.290629,-0.556819,"Ministry of Education, 2016",POINT (309770.321 9938427.168),6855,269.752471,0.633585


#### Itetani primary and other schools. - Makueni county

In [15]:
# Again, the matcher with stop words does well. Note that match #3 is from a different county, but has the same name.
m23sw.match_df('Itetani primary and other schools.', n=5)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
22867,ITETANI BOYS HIGH SCHOOL,Secondary,Public,Makueni,MBOONI WEST,TULIMANI,Mbooni,Tulimani,37.38951,-1.57894,"Ministry of Education, 2016",POINT (320836.656 9825409.850),22867,0.0,0.728384
22868,ITETANI GIRLS,Secondary,Public,Makueni,MBOONI WEST,TULIMANI,Mbooni,Tulimani,37.389908,-1.581498,"Ministry of Education, 2016",POINT (320881.141 9825127.056),22868,0.287857,0.728384
10730,ITETANI,Primary,Public,Machakos,MATUNGULU,KYANZAVI,Matungulu,Matungulu North,37.27164,-1.11397,"Ministry of Education, 2016",POINT (307683.567 9876816.283),10730,53.336874,0.728384
11676,ITETANI,Primary,Public,Makueni,MBOONI WEST,TULIMANI,Mbooni,Tulimani,37.38188,-1.58978,"Ministry of Education, 2016",POINT (319988.546 9824210.556),11676,1.473819,0.728384
36423,MATETANI SEC SCH,Secondary,,MACHAKOS,,,,KANGUNDO CENTRAL,37.32477,-1.31355,"Open Data Kenya, School 2007",POINT (313610.792 9854750.525),36423,30.374875,0.499142


In [16]:
# The matcher without stop words finds one out of 3 good matches.
m23.match_df('Itetani primary and other schools.', n=5)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
36033,MOTHERLAND PRIMARY SCHOOL,Primary,,NYANDARUA,,,,WERU,36.35937,-0.15624,"Open Data Kenya, School 2007",POINT (206060.303 9982712.305),36033,0.0,0.552335
11676,ITETANI,Primary,Public,Makueni,MBOONI WEST,TULIMANI,Mbooni,Tulimani,37.38188,-1.58978,"Ministry of Education, 2016",POINT (319988.546 9824210.556),11676,195.787558,0.546001
10730,ITETANI,Primary,Public,Machakos,MATUNGULU,KYANZAVI,Matungulu,Matungulu North,37.27164,-1.11397,"Ministry of Education, 2016",POINT (307683.567 9876816.283),10730,147.070439,0.546001
12063,GATHER PRIMARY SCHOOL,Primary,Public,Mandera,MANDERA WEST,TAKABA,Mandera West,Gither,39.91907,3.68008,"Ministry of Education, 2016",POINT (602064.378 10406817.570),12063,581.754239,0.528037
35314,LITER PRIMARY SCHOOL,Primary,,WEST POKOT,,,,LOMUT,35.61584,1.28533,"Open Data Kenya, School 2007",POINT (123301.491 10142317.589),35314,180.358223,0.500211


#### ugenya high school - Siaya county

In [17]:
# This seems to be a genuinely missing school - not surprising, since we don't have high schools in our dataset.
# Per Google Maps, the Ugenya high school in Siaya county is at lat/long 0.2170083, 34.246443.
m23sw.match_df('ugenya high', n=5)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
946,MUGENYI,Primary,Public,Bomet,KONOIN,KIMULOT,Konoin,Kimulot,35.212268,-0.537645,"Ministry of Education, 2016",POINT (78229.351 9940443.092),946,0.0,0.517679
20666,OGENYA GIRLS,Secondary,Public,Homa Bay,RACHUONYO NORTH,PALA,Karachuonyo,North Karachuonyo,34.55106,-0.35013,"Ministry of Education, 2016",POINT (4452.002 9961182.361),20666,76.420334,0.513128
8421,OGENYA,Primary,Public,Kisumu,NYANDO,NYANG'ANDE,Nyando,Kabonyo/Kanyagwal,34.869722,-0.256759,"Ministry of Education, 2016",POINT (40006.811 9971546.017),8421,49.256891,0.513128
13209,RANGENYA,Primary,Public,Migori,MIGORI,ANJEGO,Suna East,Kakrao,34.466018,-1.0139,"Ministry of Education, 2016",POINT (-4970.741 9887579.362),13209,98.431367,0.512231
3397,RANGENYA,Primary,Public,Homa Bay,NDHIWA,NDHIWA,Ndhiwa,Kanyamwa Kosewe,34.38219,-0.745814,"Ministry of Education, 2016",POINT (-14364.129 9917294.847),3397,95.15315,0.512231


#### Kitale - Nzoia county

In [18]:
# Looks like we got it. Note, however, that this is a common name: without taking county into account,
# there are more than 5 matches with a score of 1.0.
m23sw.match_df('Kitale', n=10)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
29401,HILL- KITALE,Secondary,Public,Trans Nzoia,TRANS NZOIA WEST,MILIMANI,Kiminini,Hospital,35.0383,0.994103,"Ministry of Education, 2016",POINT (58874.107 10110143.309),29401,0.0,1.0
1534,KITALE FYM,Primary,Public,Bungoma,BUNGOMA SOUTH,SANGALO,Kanduyi,Bukembe West,34.62621,0.57789,"Ministry of Education, 2016",POINT (12854.814 10064062.042),1534,65.124515,1.0
37740,FRIENDS SEC SCH KITALE,Secondary,,BUNGOMA,,,,BUKEMBE WEST,34.62612,0.57943,"Open Data Kenya, School 2007",POINT (12844.902 10064232.766),37740,65.009982,1.0
31278,KITALE ACADEMY PRI SCH,Primary,,TRANS NZOIA,,,,MATISI,35.000556,0.999072,"Open Data Kenya, School 2007",POINT (54665.202 10110698.933),31278,4.232532,1.0
26663,KITALE SCHOOL,Primary,Public,Trans Nzoia,TRANS NZOIA WEST,CENTRAL,Saboti,Saboti,35.003128,0.995929,"Ministry of Education, 2016",POINT (54951.648 10110350.356),26663,3.915632,1.0
37878,KITALE ACADEMY GIRLS SEC SCH,Secondary,,TRANS NZOIA,,,,MATISI,34.998286,0.996958,"Open Data Kenya, School 2007",POINT (54411.740 10110465.006),37878,4.460001,1.0
26733,KITALALE PREPARATORY,Primary,Private,Trans Nzoia,TRANS NZOIA WEST,KINYORO,Saboti,Kinyoro,34.912255,1.015099,"Ministry of Education, 2016",POINT (44818.558 10112487.089),26733,14.206577,0.863662
3472,KITAL,Primary,Public,Homa Bay,RACHUONYO NORTH,NYAKONGO,Karachuonyo,Central,34.60747,-0.37293,"Ministry of Education, 2016",POINT (10748.987 9958657.775),3472,159.377099,0.727811
27578,KITALE UNION,Primary,Public,Trans Nzoia,TRANS NZOIA WEST,BONDENI,Saboti,Tuwani,34.995479,1.029852,"Ministry of Education, 2016",POINT (54103.183 10114110.127),27578,6.202112,0.700071
26482,KITALE NDOGO,Primary,Public,Trans Nzoia,KWANZA,KWANZA,Kwanza,Kapomboi,34.965104,1.051358,"Ministry of Education, 2016",POINT (50718.416 10116497.415),26482,10.332221,0.689502


### Conclusion
From the small sample above, it seems that the matcher will be able to find good matches in many cases where PV was unable to. It also seems that removing stop words is somewhat helpful.

## Scoring the Matchers

The idea is to use the subset of PV data where customers have confirmed the school. We apply our matching algorithm to each of these data points, and look for the PV "confirmed match" among the matches our algorithm found. We want the PV confirmed match to show up at or near the top of our best matches, so we can use the confirmed match's rank as a measure of goodness. Ideally, $rank = 1$. If $rank > 5$, we would not even show the desired school to the customer in the top 5 choices.

In [19]:
def score_matcher(m: "Matcher", data: pd.DataFrame, n=5) -> pd.Series:
    """
    Ranks the confirmed school among the top n matches that `m` finds for each row of `data`.

    For every row, `data.school_raw` is searched for and `data.school_id` is looked up among
    the returned matches. The result holds, per row, the rank of the confirmed school
    (1 = best; tied scores share the average of their ranks), or 2*n when the confirmed
    school is not among the top n matches or `school_id` is missing.

    NOTE: `Matcher.match` returns positional row indexes of the matcher's frame, so this
    relies on `school_id` values being row positions in that frame.

    :param m: matcher to evaluate
    :param data: frame with `school_raw` (text to search for) and `school_id` columns
    :param n: number of top matches to consider
    :return: float Series of ranks with a default 0..len(data)-1 index (NOT `data.index`),
        in the same row order as `data`
    """
    ids, scores, _ = m.match(data.school_raw, n)
    # rank matches within each column (one column per searched-for value), best score first
    ranks = pd.DataFrame(scores).rank(ascending=False).to_numpy()

    # float conversion turns a missing id into NaN, which never equals any row position,
    # so such rows simply keep the "not found" default
    desired_ids = data.school_id.astype('float64').to_numpy()
    hits = ids == desired_ids[np.newaxis, :]

    ret_scores = np.full(len(data), 2 * n, dtype='float64')  # default score when desired match was not found
    row_ind, col_ind = np.nonzero(hits)
    ret_scores[col_ind] = ranks[row_ind, col_ind]

    return pd.Series(ret_scores)


In [20]:
# Score every matcher on the rows whose confirmed school could be mapped to a schools-dataset id.
data_known = data[data['school_id'].notna()].copy()
scores_m23sw = score_matcher(m23sw, data_known)
scores_m234sw = score_matcher(m234sw, data_known)
scores_m3sw = score_matcher(m3sw, data_known)
scores_m23 = score_matcher(m23, data_known)

In [21]:
# One column of ranks per matcher, summarised with extra percentiles in the upper tail
scores_df = pd.DataFrame(dict(
    m23sw=scores_m23sw,
    m234sw=scores_m234sw,
    m3sw=scores_m3sw,
    m23=scores_m23,
))
scores_df.describe(percentiles=[.75, .775, .8, .825, .85, .875, .9])

Unnamed: 0,m23sw,m234sw,m3sw,m23
count,1819.0,1819.0,1819.0,1819.0
mean,3.108851,3.087136,3.062397,2.717977
std,3.237606,3.216982,3.188363,3.055034
min,1.0,1.0,1.0,1.0
50%,1.5,1.5,1.5,1.0
75%,3.0,3.0,3.0,2.0
77.5%,3.0,3.0,3.0,3.0
80%,3.0,3.0,3.0,3.0
82.5%,5.0,5.0,4.0,3.5
85%,10.0,10.0,10.0,5.0


The matcher that does not use stop words (m23) is the best by this metric. However, we know from looking at a sample of customer input that PV could not match that removing stop words helps locate the correct matches.

The matcher without stop words likely performs slightly better by this metric because the PV matcher does not use stop words either, which biases the choices shown to the customer towards matches containing the stop words that the customer typed. This bias was then propagated into the "confirmed matches".

Aside from that, all matchers seem to perform similarly. Importantly, in about 15% of the cases the confirmed match is not present at all in the top 5 matches! This is concerning, let's investigate.

In [22]:
# Rows where the "confirmed match" did not surface in the top 5 choices of our matcher,
# i.e. those that received the "not found" score of 10.0
data_known['m23sw'] = scores_m23sw.values
data_known[data_known['m23sw'] == 10.0]

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id,m23sw
6,10,NAIROBI SOUTH,nairobi,GRANDMAK PRIMARY,bidii primary,True,-1.281340,36.952110,False,32442,10.0
15,15,NAIROBI WEST,Nairobi,MAGOSO PRIMARY,kihumbuini primary,True,-1.307890,36.792420,True,36047,10.0
44,38,NAROK,narok,OLOLULUNGA DAY,lulunga,True,-0.999134,35.663739,True,16673,10.0
53,91,MACHAKOS,machakos,MUMBUNI BOYS SEC,Mumbuni boys high school,True,-1.488434,37.263504,True,36318,10.0
75,71,MACHAKOS,Machakos,ST FRANCIS OF ASSISSI,St.Francis of Assis.,True,-1.295449,37.112777,True,10375,10.0
...,...,...,...,...,...,...,...,...,...,...,...
3569,3429,BARINGO,BARINGO,SOYMINING,EMINING,True,0.002200,35.593570,True,324,10.0
3574,3374,LAIKIPIA,laikipia,SALAMA PRI SCH,salama primary,True,0.130485,36.489768,True,30768,10.0
3578,3600,VIHIGA,Vihiga,EMABWI PRIMARY,Esaba primary,True,0.098961,34.585455,True,36089,10.0
3586,3613,NAKURU,nakuru,NDIBAI SEC,Nairobi sec,True,-0.500065,36.097987,True,23805,10.0


#### Bahati Girls, many counties

In [23]:
# Note that 16 schools here are all perfect matches. This means that without county being taken into account, there is a good chance that
# the desired school would not make it into the top 5. For example, note that no matches for Kisii, Trans Nzoia, Kilifi, Nyandarua, and Lamu counties
# are shown in the top 5.
m23sw.match_df('Bahati Girls', n=20, distance_from=(-0.144056,36.169991))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
35359,BAHATI PRI SCH,Primary,,NAIROBI NORTH,,,,MARINGO/HAMZA,36.861486,-1.288527,"Open Data Kenya, School 2007",POINT (262041.132 9857478.914),35359,148.681115,1.0
37366,BAHATI GIRLS SECONDARY SCHOOL,Secondary,,NAKURU,,,,BAHATI,36.155143,-0.155329,"Open Data Kenya, School 2007",POINT (183308.771 9982810.150),37366,2.072952,1.0
23959,BAHATI PCEA GIRLS,Secondary,Public,Nakuru,NAKURU NORTH,BAHATI,Bahati,Bahati,36.169991,-0.144056,"Ministry of Education, 2016",POINT (184962.845 9984057.921),23959,0.0,1.0
26355,BAHATI,Primary,Public,Tana River,TANA DELTA,NORTH,Garsen,Garsen North,39.995223,-1.417926,"Ministry of Education, 2016",POINT (610715.318 9843252.457),26355,448.266842,1.0
23979,BAHATI BOYS HIGH,Secondary,Private,Nakuru,NAKURU NORTH,BAHATI,Bahati,Kabatini,36.144345,-0.223465,"Ministry of Education, 2016",POINT (182106.973 9975269.485),23979,9.278961,1.0
15542,BAHATI GIRLS BOARDING,Primary,Private,Nakuru,NAKURU NORTH,BAHATI,Bahati,Bahati,36.155984,-0.158132,"Ministry of Education, 2016",POINT (183402.555 9982499.951),15542,2.208079,1.0
15506,BAHATI PCEA,Primary,Public,Nakuru,NAKURU NORTH,BAHATI,Bahati,Bahati,36.169527,-0.141518,"Ministry of Education, 2016",POINT (184911.126 9984338.773),15506,0.286891,1.0
34753,BAHATI PCEA PRIMARY,Primary,,NAKURU,,,,BAHATI,36.169527,-0.141518,"Open Data Kenya, School 2007",POINT (184911.126 9984338.773),34753,0.286891,1.0
26115,BAHATI PRIMARY,Primary,Public,Wajir,WAJIR EAST,BARWAQ,Wajir East,Barwago,40.02708,1.75172,"Ministry of Education, 2016",POINT (614241.385 10193649.601),26115,477.838993,1.0
23958,BAHATI GIRLS,Secondary,Public,Nakuru,NAKURU NORTH,BAHATI,Bahati,Bahati,36.155742,-0.153857,"Ministry of Education, 2016",POINT (183375.485 9982973.062),23958,1.923039,1.0


#### bidii primary - Nairobi county

In [24]:
# In this case, the customer typed "bidii primary" but subsequently selected "GRANDMAK PRIMARY" as the confirmed match.
# It's not clear how this happened. Perhaps the PV service did not show the correct school and the customer accepted what was available.
# Or maybe the customer made a wrong input.
# Note that the arguably correct choice is only 7.9km away from the confirmed choice.
# However, note also that there are 10 perfect choices, and that without taking county into account, we are playing roulette.
m23sw.match_df('bidii primary', n=11, distance_from=(-1.281340, 36.952110))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
26449,BIDII,Primary,Public,Trans Nzoia,KWANZA,BIDII,Kwanza,Bidii,35.015138,1.070921,"Ministry of Education, 2016",POINT (56301.556 10118657.816),26449,338.815867,1.0
14791,BIDII,Primary,Public,Nairobi,MAKADARA,BURU BURU,Makadara,Harambee,36.881399,-1.285046,"Ministry of Education, 2016",POINT (264257.568 9857865.799),14791,7.871538,1.0
28120,BIDII,Primary,Public,Uasin Gishu,ELDORET EAST,MEIBEKI,Moiben,Moiben,35.247542,0.92137,"Ministry of Education, 2016",POINT (82198.355 10102059.398),28120,309.695713,1.0
17598,BIDII,Primary,Public,Nyandarua,NYANDARUA WEST,GATHANJI,Ol Joro Orok,Gathanji,36.261455,-0.095858,"Ministry of Education, 2016",POINT (195152.054 9989392.615),17598,152.555683,1.0
34150,BIDII ACADEMY,Primary,,BUNGOMA,,,,MBAKALO,34.896221,0.742943,"Open Data Kenya, School 2007",POINT (42996.861 10082330.060),34150,320.810212,1.0
10190,BIDII,Primary,Private,Laikipia,LAIKIPIA WEST,SIPILI,Laikipia West,Ol-Moran,36.375755,0.416365,"Ministry of Education, 2016",POINT (207892.067 10046069.489),10190,199.357129,1.0
15636,BIDII,Primary,Private,Nakuru,NJORO,KIHINGO,Njoro,Mauche,36.262511,-0.098855,"Ministry of Education, 2016",POINT (195269.722 9989060.941),15636,152.208633,1.0
12738,BIDII,Primary,Public,Meru,TIGANIA EAST,THANGATHA,Tigania East,Thangatha,37.587632,-0.082017,"Ministry of Education, 2016",POINT (342822.937 9990931.890),12738,150.922165,1.0
31027,BIDII PRI SCH,Primary,,NAIROBI NORTH,,,,HARAMBEE,36.880379,-1.283137,"Open Data Kenya, School 2007",POINT (264143.887 9858076.842),31027,7.97664,1.0
31530,M.C.K. BIDII ACADEMY PRY SCH,Primary,,MERU,,,,THANGATHA,37.88937,0.1531,"Open Data Kenya, School 2007",POINT (376407.334 10016925.352),31530,190.52827,1.0


#### Mumbuni boys high school - Machakos county

In [25]:
# In this case, there are 8 perfect matches just within Machakos county. Even if we narrowed down by county, there is no guarantee that we
# would include the desired match, although it is likely that one of the shown matches would be close enough for the customer to accept.
# However, without taking county into account, we run a high risk of not showing any valid matches. For instance, consider
# the lone schools with this name in Tharaka Nithi and Makueni counties.
m23sw.match_df('Mumbuni boys high school', n=15, distance_from=(-1.488434, 37.263504))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
22309,MUMBUNI SEC,Secondary,Public,Kitui,MWINGI CENTRAL,CENTRAL,Mwingi West,Migwani,38.10163,-1.05132,"Ministry of Education, 2016",POINT (400046.507 9883783.042),22309,105.088203,1.0
9379,MUMBUNI,Primary,Public,Kitui,MWINGI CENTRAL,MWINGI,Mwingi West,Migwani,38.10127,-1.04943,"Ministry of Education, 2016",POINT (400006.389 9883991.959),9379,105.150151,1.0
35455,MUMBUNI PRIMARY,Primary,,MACHAKOS,,,,KALAMA,37.36143,-1.62465,"Open Data Kenya, School 2007",POINT (317716.013 9820352.963),35455,18.652033,1.0
35250,MUMBUNI PRIMARY,Primary,,MACHAKOS,,,,KATANGI,37.737364,-1.352925,"Open Data Kenya, School 2007",POINT (359527.697 9850424.309),35250,54.787462,1.0
22562,MUMBUNI BOYS,Secondary,Public,Machakos,MACHAKOS,MUMBUNI,Machakos Town,Mumbuni North,37.263785,-1.490057,"Ministry of Education, 2016",POINT (306837.909 9835227.481),22562,0.183153,1.0
30447,MUMBUNI PRI SCH,Primary,,MACHAKOS,,,,MBIUNI,37.46835,-1.27558,"Open Data Kenya, School 2007",POINT (329587.241 9858959.135),30447,32.843845,1.0
36318,MUMBUNI BOYS SEC,Secondary,,MACHAKOS,,,,MUMBUNI NORTH,37.263504,-1.488434,"Open Data Kenya, School 2007",POINT (306806.495 9835406.887),36318,0.0,1.0
8882,MUMBUNI,Primary,Public,Kitui,KITUI WEST,TULIA,Kitui West,Mutonguni,38.10325,-1.05125,"Ministry of Education, 2016",POINT (400226.762 9883790.832),8882,105.251503,1.0
36317,MUMBUNI GIRLS SEC,Secondary,,MACHAKOS,,,,MUMBUNI NORTH,37.261195,-1.488757,"Open Data Kenya, School 2007",POINT (306549.557 9835370.966),36317,0.259164,1.0
32966,MUMBUNI PRIMARY SCHOOL,Primary,,MACHAKOS,,,,MUMBUNI NORTH,37.263088,-1.500928,"Open Data Kenya, School 2007",POINT (306761.292 9834025.240),32966,1.390041,1.0


#### lulunga, narok county

In [26]:
# Slightly different story. In this case, the way the customer shortened the name put some other matches ahead of the correct match.
# However, taking the county into account would again fix the issue.
m23sw.match_df('lulunga', n=15, distance_from=(-0.999134, 35.663739))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,dist,score
8496,LUNGA,Primary,Public,Kisumu,SEME,OTWENYA,Seme,East Seme,34.54601,-0.05954,"Ministry of Education, 2016",POINT (3879.391 9993398.971),8496,162.360829,0.819216
18774,LUNGA,Primary,Public,Siaya,UGENYA,NYAHARWA,Ugenya,West Ugenya,34.115209,0.135973,"Ministry of Education, 2016",POINT (-44217.542 10015084.259),18774,213.488728,0.819216
22390,LUNGA LUNGA,Secondary,Public,Kwale,MSAMBWENI,LUNGALUNGA,Lunga Lunga,Vanga,39.119036,-4.542845,"Ministry of Education, 2016",POINT (513204.400 9497867.493),22390,549.996884,0.819216
1915,LUNG'A,Primary,Public,Busia,BUSIA,BUKHAYO SOUTH,Matayos,Matayos South,34.206776,0.386719,"Ministry of Education, 2016",POINT (-33981.123 10042895.107),1915,223.586898,0.819216
14562,LUNGALUNGA SDA EDUCATION,Primary,Private,Nairobi,MAKADARA,VIWANDA,Makadara,Viwandani,36.873013,-1.247807,"Ministry of Education, 2016",POINT (263320.624 9861983.904),14562,137.253537,0.745419
9905,LUNGALUNGA,Primary,Public,Kwale,MSAMBWENI,LUNGALUNGA,Lunga Lunga,Vanga,39.121939,-4.551736,"Ministry of Education, 2016",POINT (513526.258 9496884.653),9905,550.929054,0.745419
18170,LULU,Primary,Public,Samburu,SAMBURU NORTH,BARAGOI,Samburu North,Angata Nanyokie,36.754676,1.219961,"Ministry of Education, 2016",POINT (250143.577 10134946.824),18170,274.954954,0.723642
29145,LULU HIGH,Secondary,Private,Kwale,MSAMBWENI,DIANI,Msambweni,Gombato Bongwe,39.546828,-4.320131,"Ministry of Education, 2016",POINT (560677.448 9522464.833),29145,567.756062,0.723642
16673,OLOLULUNGA DAY,Primary,Public,Narok,NAROK SOUTH,OLOLULUNGA,Narok South,Ololulung'a,35.663739,-0.999134,"Ministry of Education, 2016",POINT (128602.524 9889376.848),16673,0.0,0.684901
24263,OLOLULUNGA,Secondary,Public,Narok,NAROK SOUTH,OLOLULUNGA,Narok South,Ololulung'a,35.66736,-1.002193,"Ministry of Education, 2016",POINT (129006.422 9889038.570),24263,0.527036,0.684901


### Conclusion
It seems that we need to ask the customer about the county and take that input into account - too often the same or similar school names are used in multiple counties,
and thus matching solely on the school name is not enough. However, first we need to evaluate the county names used in the schools dataset and make sure they are standardized.

# County field in schools dataset

In [27]:
gdf_schools.groupby(by=gdf_schools.County.str.lower(), dropna=False).size()

County
baringo             769
bomet              1045
bungoma            1318
busia               553
elgeyo marakwet     502
embu                731
garissa             180
homa bay           1376
isiolo              129
kajiado             697
kakamega           1398
kericho             853
kiambu             1482
kilifi              731
kirinyaga           590
kisii              1545
kisumu              980
kitui              1639
kwale               463
laikipia            515
lamu                102
machakos           1446
makueni            1345
mandera             155
marsabit            159
meru               1454
migori             1045
mombasa             660
murang'a           1104
nairobi            1041
nairobi north       469
nairobi south       207
nairobi west        449
nakuru             1390
nandi               983
narok               741
nyamira             826
nyandarua           735
nyeri               838
samburu             165
siaya               917
taita tav

A few things to note:
 - 31 entries have no county set
 - Some entries have Nairobi sub-county (north/south/west), while some just state "Nairobi"
 - There are some slight differences in spelling ("tharaka-nithi" vs "tharaka nithi")
 - This list was forced to lowercase - there are additional differences in all caps vs first letter capitalized

To assess the consistency between the marked county and the school location (lat/long), we can use counties shapefile.
We use the dataset found here, since it has much higher resolution than the one we use in ishamba:
https://hub.arcgis.com/datasets/Esri-EA::gis-gisadmin-iebc-counties?geometry=-6.983%2C-7.359%2C82.797%2C7.976

In [28]:
#!mkdir -p counties/arcgis
#!cd counties/arcgis && curl --compressed 'https://prod-hub-indexer.s3.amazonaws.com/files/071bc497268b4643b68fcdbde2b13a7e/0/full/4326/071bc497268b4643b68fcdbde2b13a7e_0_full_4326.zip' > data.zip && unzip data.zip

In [29]:
# Note that we make sure to use the same CRS projection that was used in schools dataset
gdf_counties = gpd.read_file('counties/arcgis/GIS_GISADMIN_IEBC_counties.shp').rename(columns={'COUNTY_NAM': 'county'}).to_crs(gdf_schools.crs)
gdf_counties.head()

Unnamed: 0,FID,UNIT_AREA,UNIT_PERIM,DISTRICT,COUNT_,county,CODE,SHAPE_Leng,SHAPE_Area,geometry
0,1,0.69621,5.468653,Baringo,172.0,Baringo,30,6.533348,0.884748,"POLYGON ((142052.029 10183272.681, 142122.980 ..."
1,2,0.11007,2.030379,Bomet,109.0,Bomet,36,3.151607,0.193246,"POLYGON ((107391.541 9955787.802, 107892.876 9..."
2,3,0.161656,1.858956,Siaya,49.0,Siaya,41,2.959541,0.286025,"POLYGON ((-27463.364 10034163.846, -27402.887 ..."
3,4,0.167501,2.892534,Bungoma,108.0,Bungoma,39,3.198102,0.245166,"POLYGON ((11989.362 10122424.631, 11999.559 10..."
4,5,0.106271,2.197821,Kericho,106.0,Kericho,35,3.765482,0.209301,"POLYGON ((107386.433 9955798.869, 107372.272 9..."


In [30]:
# Now we compute the distance matrix between each county and each school. We'll use this to assign schools to counties, but also to find cases
# where a school is close to the border of two or more counties. All distances are in meters.

In [31]:
import multiprocessing

def dist(cg):
    """Return the distance from every school geometry in `gdf_schools` to the single geometry `cg`."""
    return gdf_schools.distance(cg)
# One task per county geometry, spread over a pool of worker processes. Pool.map returns its
# results in input order, so row i of `distances` holds the distances of all schools to the
# i-th county of gdf_counties.
# NOTE(review): the workers read the notebook-global `gdf_schools`; presumably this relies on
# the worker processes inheriting it (fork start method) - confirm on the target platform.
with multiprocessing.Pool() as pool:
    distances = np.array(pool.map(dist, gdf_counties.geometry))

In [32]:
distances.shape

(47, 37930)

In [33]:
# assign computed county to each record, by taking closest county.
# argmin yields row *positions* within `distances` (one row per county, in gdf_counties order),
# so the county names are looked up positionally rather than by index label; a label lookup
# would only be correct while gdf_counties keeps its default 0..n-1 index.
gdf_schools['county_computed'] = gdf_counties.county.to_numpy()[distances.argmin(axis=0)]
# distance to that closest county; 0.0 means the school lies inside the county polygon
gdf_schools['distance_computed'] = distances.min(axis=0)
gdf_schools

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed
0,BAKWANIN,Primary,Public,Baringo,BARINGO CENTRAL,KABASIS,Baringo Central,Sacho,35.797080,0.409550,"Ministry of Education, 2016",POINT (143417.238 10045338.886),0,Baringo,0.0
1,BEKIBON,Primary,Public,Baringo,BARINGO CENTRAL,TENGES,Baringo South,Marigat,35.884060,0.336400,"Ministry of Education, 2016",POINT (153107.652 10037237.735),1,Baringo,0.0
2,BOKORIN,Primary,Public,Baringo,BARINGO CENTRAL,KABARNET,Baringo Central,Kapropita,35.771770,0.532180,"Ministry of Education, 2016",POINT (140602.763 10058916.014),2,Baringo,0.0
3,BOROWONIN,Primary,Public,Baringo,BARINGO CENTRAL,KABARNET,Baringo Central,Kapropita,35.778640,0.444870,"Ministry of Education, 2016",POINT (141363.771 10049249.854),3,Baringo,0.0
4,BOSIN,Primary,Public,Baringo,BARINGO CENTRAL,KABASIS,Baringo Central,Sacho,35.795450,0.438090,"Ministry of Education, 2016",POINT (143236.887 10048498.462),4,Baringo,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37925,SALVATION ARMY NAWOITORONG SEC SCH,Secondary,,TURKANA,,,,KANAMKEMER,35.620267,3.113988,"Open Data Kenya, School 2007",POINT (124253.791 10344795.181),37925,Turkana,0.0
37926,MOYALE ODDA MILITARY CAMP SEC,Secondary,,MARSABIT,,,,BUTIYE,39.090567,3.463152,"Open Data Kenya, School 2007",POINT (510059.563 10382787.508),37926,Marsabit,0.0
37927,OBBU SEC,Secondary,,MARSABIT,,,,SOLOLO,38.644848,3.546583,"Open Data Kenya, School 2007",POINT (460555.403 10392016.569),37927,Marsabit,0.0
37928,NAPATA REFUGEE SEC SCH,Secondary,,TURKANA,,,,LOPUR,34.835844,3.739390,"Open Data Kenya, School 2007",POINT (37206.754 10414419.116),37928,Turkana,0.0


A handful of schools have a closest distance > 0, which means they don't fall into any county. However, the distance is generally quite small, and these seem to be schools on the country border.

In [34]:
gdf_schools.query('distance_computed > 0')['distance_computed'].describe()

count      15.000000
mean      364.876741
std       327.207352
min        72.691738
25%       120.152987
50%       271.568205
75%       537.056420
max      1289.874248
Name: distance_computed, dtype: float64

To compare the computed county values with the original ones, we need to standardize the county names. Given the different spellings in the schools dataset, we'll standardize on the names coming from the counties shapefile.

In [35]:
from fuzzywuzzy import process as fuzz_process

# Map every distinct, non-empty County spelling used in the schools dataset to the
# best-matching county name from the counties shapefile (first element of extractOne's result).
schools_county_names_map = {
    name: fuzz_process.extractOne(name, gdf_counties.county)[0]
    for name in gdf_schools.County.unique()
    if name
}
pd.options.display.max_rows = 10
# show the raw spelling -> standardized name pairs for a visual sanity check
pd.DataFrame(list(schools_county_names_map.items()), columns=['key', 'value']).head(100)

Unnamed: 0,key,value
0,Baringo,Baringo
1,Bomet,Bomet
2,Bungoma,Bungoma
3,Busia,Busia
4,Elgeyo Marakwet,Elgeyo Marakwet
...,...,...
92,GARISSA,Garissa
93,ELGEYO MARAKWET,Elgeyo Marakwet
94,KWALE,Kwale
95,ISIOLO,Isiolo


In [36]:
# note that we use lowercase 'county' for the normalized county name;
# records whose County is missing (not present in the map) get an empty string
gdf_schools['county'] = [schools_county_names_map.get(name, '') for name in gdf_schools.County]

In [37]:
# pick out the discrepancies: schools whose (normalized) marked-up county differs from the
# computed one. Schools without a marked-up county are excluded - there is nothing to compare.
discrepancies = gdf_schools[(gdf_schools.county != gdf_schools.county_computed) & (gdf_schools.county != '')].copy()
# Also, add the distance column to the original county. The county -> geometry lookup is built
# once here instead of being rebuilt with set_index() for every row inside the lambda.
county_geometry = gdf_counties.set_index('county')['geometry']
discrepancies['distance_orig'] = discrepancies.apply(lambda t: t.geometry.distance(county_geometry.loc[t.county]), axis=1)
# smallest discrepancies first; reassign rather than sort in place
discrepancies = discrepancies.sort_values(by='distance_orig')
discrepancies

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,distance_orig
28425,NGEGE PRIMARY,Primary,Public,Kisumu,KISUMU WEST,OTONGLO,Kisumu West,Central Kisumu,35.006470,-0.370600,"Ministry of Education, 2016",POINT (55266.689 9958937.185),28425,Kericho,0.0,Kisumu,0.593701
3646,KAROGO VICTORY ACADEMY,Primary,Private,Homa Bay,RACHUONYO SOUTH,NYANG'IELA,Kasipul,West Kasipul,34.686420,-0.599020,"Ministry of Education, 2016",POINT (19575.581 9933600.926),3646,Kisii,0.0,Homa Bay,1.052797
4734,EKATSOMBERO,Primary,Public,Kakamega,KHWISERO,EAST KHWISERO,Khwisero,Kisa East,34.663875,0.126698,"Ministry of Education, 2016",POINT (17034.670 10014044.357),4734,Vihiga,0.0,Kakamega,1.062374
32816,ST. MARY'S DUNDORI ACADEMY PRIMARY,Primary,,NAKURU,,,,DUNDORI,36.232189,-0.258902,"Open Data Kenya, School 2007",POINT (191894.397 9971349.901),32816,Nyandarua,0.0,Nakuru,1.372169
24336,NYARIACHO SEC,Secondary,Public,Nyamira,MASABA NORTH,GIRANGO,Kitutu Masaba,Gachuba,34.870869,-0.734189,"Ministry of Education, 2016",POINT (40167.808 9918637.417),24336,Kisii,0.0,Nyamira,2.957688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6379,MWANGAZA,Primary,Public,Kilifi,BAHARI (KILIFI),CHONYI,Kilifi South,Chasimba,37.581318,0.336191,"Ministry of Education, 2016",POINT (342122.705 10037170.737),6379,Isiolo,0.0,Kilifi,389375.929138
28352,KIFARU PRIMARY,Primary,Public,Nairobi,EMBAKASI,KAYOLE,Embakasi West,Umoja II,40.666016,-1.258361,"Ministry of Education, 2016",POINT (685367.711 9860853.912),28352,Garissa,0.0,Nairobi,396366.330247
6752,MIJOMBONI FELLOWSHIP,Primary,Private,Kilifi,MALINDI,WATAMU,Kilifi North,Dabaso,35.331591,-0.821983,"Ministry of Education, 2016",POINT (91558.777 9908958.346),6752,Bomet,0.0,Kilifi,498897.990973
26173,MNGAMA,Primary,Public,Taita Taveta,MWATATE,BURA,Mwatate,Chawia,34.707036,0.428531,"Ministry of Education, 2016",POINT (21863.341 10047499.745),26173,Kakamega,0.0,Taita Taveta,504792.589102


In [38]:
discrepancies.distance_orig.describe()

count       282.000000
mean      15074.393574
std       70990.889402
min           0.593701
25%          63.339698
50%         167.670623
75%        1240.592634
max      613694.912584
Name: distance_orig, dtype: float64

In [39]:
len(discrepancies) / len(gdf_schools)

0.007434748220406011

282 entries (0.7%) in the schools dataset have a county discrepancy, with more than 75% of them being less than 2 km away from the marked-up county. Investigation of a small sample of entries with small discrepancies showed that this is likely due to slight inaccuracy in the county border positions in the shapefile - in other words, the schools dataset is correct.

However, investigation of a sample of schools with large discrepancies (e.g. 30km and above) showed variety of causes, including school mistaken for another school with a similar name, or school location seeming completely wrong. Given this, we'll take the following approach:
 1. any schools where computed county is different from marked up county (in schools dataset), but with school location being within 5km of the marked up county, will be preserved as is (the county from the schools dataset will be used for the purpose of matching and will be shown to the customer)
 1. any schools with discrepancy > 5km will be removed from the dataset
 

In [40]:
# brief look at the schools without county marked in the schools dataset
gdf_schools[gdf_schools.county == ''][['name', 'lat', 'long', 'county_computed']]

Unnamed: 0,name,lat,long,county_computed
29611,AMANI ACADEMY,-0.756917,35.071082,Nyamira
29704,GENGA DOK PRI,-0.637399,34.643327,Kisii
29969,SERWET PRI SCH,-0.854350,35.044550,Bomet
30778,SOY PRECIOUS HOPE ACADEMY PRI SCH,0.686049,35.155891,Uasin Gishu
30789,NYANSORE ACADEMY PRI,-0.776781,34.626739,Kisii
...,...,...,...,...
35159,MALKAMARI PRI SCH,4.239490,40.695390,Mandera
35963,AGAPE PRI,0.466177,34.100797,Busia
36021,MOYALE JUNIOR ACADEMY PRI,3.525868,39.060895,Marsabit
36103,OMAR FARUQ MADRASA,3.949480,41.853330,Mandera


Google map search on a sample of schools:
 - found NYANSORE ACADEMY, good location and county, but extremely close to county border
 - found SERWET PRI SCH, good location and county, but extremely close to county border
 - found MOYALE JUNIOR ACADEMY, good location and county, extremely close to country border (Ethiopia)
 
Given the above, we will fill in the county for the 31 schools with the missing county, rather than throw those schools away.
 

## New Matcher Version with Filtering by County

In [41]:
class Matcher:
    """
    Fuzzy school-name matcher with optional filtering by county.

    School names from `df['name']` are cleaned (see `clean`) and vectorized with a character
    n-gram TF-IDF model. A query is scored against every school by the dot product of the
    TF-IDF vectors. When counties are supplied with a query, the scores of all schools that
    do not belong to the requested county are multiplied by zero.
    """

    # characters matching this regexp are deleted outright (e.g. "LUNG'A" -> "LUNGA")
    default_remove_regexp = "'"
    # runs of characters matching this regexp are collapsed into a single space
    default_to_space_regexp = r'[^\w]+'

    def __init__(
        self,
        ngram_range: Tuple[int, int],
        df: pd.DataFrame,
        counties: Iterable[str],
        county_school_matrix: scipy.sparse.spmatrix = None,
        remove_regexp=default_remove_regexp,
        to_space_regexp=default_to_space_regexp,
        stop_words: Iterable[str] = None,
    ):
        """
        Fit the TF-IDF model on the school names.

        :param ngram_range: (min_n, max_n) sizes of the character n-grams
        :param df: schools; must have a 'name' column, a 'county' column (unless
            `county_school_matrix` is given) and 'lat'/'long' columns for `match_df`
        :param counties: all county names that can be used for filtering
        :param county_school_matrix: optional precomputed sparse membership matrix of shape
            (number of counties, number of schools); it is row-indexed, so it must support
            row indexing (e.g. CSR). Built from `df.county` when omitted.
        :param remove_regexp: pattern deleted from names during cleaning
        :param to_space_regexp: pattern replaced by a space during cleaning
        :param stop_words: words dropped from names during cleaning; they are compared
            against lower-cased words, so they should be given in lower case
        """
        self.df = df
        self.remove_regexp = remove_regexp
        self.to_space_regexp = to_space_regexp
        # always a set (the original fell back to an empty dict literal)
        self.stop_words = set(stop_words) if stop_words else set()
        self.vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=ngram_range)
        counties = np.array(counties)
        # row number of each county in county_school_matrix
        self.counties_name_to_id = {county_name: i for i, county_name in enumerate(counties)}
        corpus = self._clean(df['name'])
        self.X = self.vectorizer.fit_transform(corpus)
        if county_school_matrix is None:
            # broadcast comparison -> boolean (n_counties, n_schools) membership matrix
            county_school_matrix = scipy.sparse.csr_matrix(
                df.county.to_numpy()[np.newaxis, :] == counties[:, np.newaxis])
        self.county_school_matrix = county_school_matrix

    @staticmethod
    def clean(
            s: pd.Series,
            remove_regexp=default_remove_regexp,
            to_space_regexp=default_to_space_regexp,
            stop_words: Set[str] = None
    ) -> pd.Series:
        """
        Normalize a Series of names: delete `remove_regexp` matches, turn `to_space_regexp`
        matches into single spaces, lower-case, strip, and finally drop any stop words.
        Falsy regexps / stop words disable the corresponding step.
        """
        if remove_regexp:
            s = s.str.replace(remove_regexp, '', regex=True)
        if to_space_regexp:
            s = s.str.replace(to_space_regexp, ' ', regex=True)
        s = s.str.lower().str.strip()
        if stop_words:
            # stop words are removed as whole, whitespace-delimited words
            s = s.str.split().apply(
                lambda words: ' '.join(w for w in words if w not in stop_words)
            )
        return s

    def _clean(self, s: pd.Series):
        """Apply `clean` with the cleaning settings this matcher was constructed with."""
        return self.clean(
            s,
            remove_regexp=self.remove_regexp,
            to_space_regexp=self.to_space_regexp,
            stop_words=self.stop_words
        )

    def _match(self, vals: Iterable[str], counties: Iterable[str] = None):
        """
        Score every school against every searched-for value.

        Returns a dense ndarray of shape (number of schools, len(vals)). If `counties` is
        given (one county per val), scores of schools outside that county are zeroed.
        Raises KeyError for a county name the matcher was not constructed with.
        """
        if not isinstance(vals, pd.Series):
            vals = pd.Series(vals)
        if counties is not None and not isinstance(counties, pd.Series):
            counties = pd.Series(counties)
        Y = self.vectorizer.transform(self._clean(vals))
        ret: scipy.sparse.spmatrix = self.X.dot(Y.transpose())
        if counties is not None:
            county_ids = [self.counties_name_to_id[county_name] for county_name in counties]
            # rows picked per query -> (len(vals), n_schools); transposed to line up with `ret`
            ret = ret.multiply(self.county_school_matrix[county_ids].transpose())
        return ret.toarray()

    def match(
            self,
            vals: Iterable[str],
            n=5,
            counties: Iterable[str] = None
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Searches for multiple school names, returning n top matches for each of them.
        The return value is a tuple of three 2-D ndarrays: ids, scores, and names.
        Each of the arrays has a shape (n, len(vals)), with each column corresponding to
        one of the searched-for vals, and each row to one of the matches returned for that
        val, best match first. The ids are row positions within the dataframe the matcher
        was constructed with.

        If counties is passed in, it must be an Iterable of the same length as vals, with desired
        county names corresponding to those used to train the model.
        """
        res = self._match(vals, counties)
        # ascending argsort, read backwards from the end: the n highest scores, best first
        ind = res.argsort(axis=0)[:-(n+1):-1, :]
        return ind, np.take_along_axis(res, ind, axis=0), self.df['name'].to_numpy()[ind]

    def match_df(
            self,
            val: str,
            n=5,
            distance_from: Tuple[float, float] = None,
            county: str = None
    ) -> pd.DataFrame:
        """
        Searches for a single school name and returns nice dataframe with top n matches.
        The distance shown is distance in km from the top match, or from the `distance_from`
        geopoint provided as (lat, long) tuple.

        The result is a copy of the matching rows (best match first) with two extra columns,
        'dist' and 'score'. With n=0 an empty dataframe is returned.
        """
        res = self._match([val], [county] if county is not None else None)
        res = res[:, 0]
        # Reverse first, then cut: argsort()[-n:] would return *every* row for n == 0.
        ind: np.ndarray = res.argsort()[::-1][:n]
        df: pd.DataFrame = self.df.iloc[ind].copy()
        if distance_from:
            ref_lat, ref_long = distance_from
        elif len(df):
            ref_lat, ref_long = df.iloc[0][['lat', 'long']]
        else:
            # no rows to measure, so the reference point is never used
            ref_lat = ref_long = None
        df['dist'] = [great_circle((t.lat, t.long), (ref_lat, ref_long)).km for t in df.itertuples()]
        df['score'] = res[ind]
        return df


In [42]:
# Build the matcher variants to compare. The variable name encodes the configuration:
# the digits are the character n-gram sizes (e.g. 23 -> ngram_range=(2, 3)) and the "sw"
# suffix means that stop words are removed from the names before vectorizing.
# All variants share the same schools dataframe and the county names from the counties shapefile.
m23sw = Matcher(ngram_range=(2,3), df=gdf_schools, counties=gdf_counties.county, stop_words=stop_words)
m234sw = Matcher(ngram_range=(2,4), df=gdf_schools, counties=gdf_counties.county, stop_words=stop_words)
m3sw = Matcher(ngram_range=(3,3), df=gdf_schools, counties=gdf_counties.county, stop_words=stop_words)
m23 = Matcher(ngram_range=(2,3), df=gdf_schools, counties=gdf_counties.county, stop_words=None)

In [43]:
def score_matcher2(m: Matcher, data: pd.DataFrame, n=5) -> pd.Series:
    """
    Score a county-aware matcher against records whose correct school is known.

    For each row of `data` the matcher is queried with `school_raw`, restricted to the row's
    `county`, and the top `n` candidates are ranked by similarity (tied scores share their
    average rank). The score of a row is the rank at which its `school_id` shows up among
    those candidates, or `2 * n` when it does not show up at all. Lower is better.

    Returns a Series (default integer index) with one score per row of `data`, in row order.
    """
    match_ids, match_scores, _names = m.match(data.school_raw, n, counties=data.county)
    # one column per query; rank 1 is the highest-scoring candidate of that query
    rank_table = pd.DataFrame(match_scores).rank(ascending=False)

    wanted_ids = data.school_id.to_numpy()
    # a zero difference marks the (candidate row, query column) cell holding the wanted school
    hit_mask = (match_ids - wanted_ids[np.newaxis, :]) == 0

    missing_score = 2 * n  # penalty used when the wanted school is not among the candidates
    result = [missing_score] * len(data)
    for rank_row, query_col in zip(*np.nonzero(hit_mask)):
        result[query_col] = rank_table.iloc[rank_row, query_col]

    return pd.Series(result)


In [44]:
# The new matcher filters by county, so the confirmed county of each known record has to be
# expressed in our adopted standard (the county names of the counties shapefile) first.
# Records without a confirmed county are left out.
data_known2 = data_known.loc[data_known.county.notna()].copy()
data_known2['county'] = data_known2.county.map(
    lambda name: fuzz_process.extractOne(name, gdf_counties.county)[0]
)
data_known2

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id,m23sw
0,7,Nandi,Nandi,SAMOEI BOYS SECONDARY SCHOOL,Samoei boys,True,0.108394,35.169737,True,24123,2.5
2,4,Narok,Narok,NAROK HIGH,Narok,True,-1.069555,35.864660,True,24229,1.5
3,12,Mombasa,mombasa,STAR OF THE SEA,star if the sea,True,-4.066130,39.669400,True,13744,2.0
4,8,Nakuru,Nakuru,BAHATI PCEA GIRLS,Bahati Girls,True,-0.144056,36.169991,True,23959,3.0
6,10,Nairobi,nairobi,GRANDMAK PRIMARY,bidii primary,True,-1.281340,36.952110,False,32442,10.0
...,...,...,...,...,...,...,...,...,...,...,...
3588,3614,Homa Bay,Homa Bay,WACHARA,Wachara,True,-0.812414,34.306304,True,3411,2.5
3589,3616,Homa Bay,Homabay,LORATENG,Loorateng,True,-0.603800,34.529400,True,3054,1.0
3590,3621,Uasin Gishu,Uasin Gishu,MATUNDA RC,Matunda rc,True,0.844360,35.136296,True,19551,3.0
3593,3619,Bungoma,Bungoma,MALINDA SA,Malinda Sa,True,0.769722,34.501334,False,1637,3.0


In [45]:
scores_m23sw = pd.Series(score_matcher2(m23sw, data_known2))
scores_m234sw = pd.Series(score_matcher2(m234sw, data_known2))
scores_m3sw = pd.Series(score_matcher2(m3sw, data_known2))
scores_m23 = pd.Series(score_matcher2(m23, data_known2))

In [46]:
scores_df = pd.DataFrame({'m23sw': scores_m23sw, 'm234sw': scores_m234sw, 'm3sw': scores_m3sw, 'm23': scores_m23})
scores_df.describe(percentiles=[.85, .875, .9, .925, .95])

Unnamed: 0,m23sw,m234sw,m3sw,m23
count,1805.000000,1805.000000,1805.000000,1805.000000
mean,2.337119,2.327978,2.308310,1.786704
std,2.678662,2.661089,2.629851,1.972072
min,1.000000,1.000000,1.000000,1.000000
50%,1.500000,1.500000,1.500000,1.000000
...,...,...,...,...
87.5%,3.000000,3.000000,3.000000,2.000000
90%,10.000000,10.000000,5.000000,3.000000
92.5%,10.000000,10.000000,10.000000,3.850000
95%,10.000000,10.000000,10.000000,5.000000


Mean scores have improved significantly. Let's look more closely at percentile at which the desired match disappears from the top 5.

In [47]:
data_known2['m23sw_without_county'] = data_known['m23sw']
data_known2['m23sw'] = scores_m23sw.to_numpy()
(data_known2['m23sw'] < 10).mean(),  (data_known2['m23sw_without_county'] < 10).mean()

(0.8958448753462603, 0.8254847645429363)

Now the difference seems more obvious - we went from having the desired top 5 match 83% of the time to 90% of the time.
Let's sample some of the cases where the desired match did not surface in top 5.

In [48]:
# This is part of the data where "confirmed match" did not surface up in the top 5 choices by our matcher
pd.options.display.min_rows = 20
data_known2.query('m23sw == 10.0')

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id,m23sw,m23sw_without_county
6,10,Nairobi,nairobi,GRANDMAK PRIMARY,bidii primary,True,-1.281340,36.952110,False,32442,10.0,10.0
15,15,Nairobi,Nairobi,MAGOSO PRIMARY,kihumbuini primary,True,-1.307890,36.792420,True,36047,10.0,10.0
83,115,Nyeri,Nyeri,NAROMORU BOYS,Karima boys,True,-0.129194,37.051321,True,24555,10.0,10.0
131,80,Nakuru,Nakuru,KIPTORORO PRIMARY SCHOOL,Kipsyenan primary school,True,-0.377160,35.503790,True,15162,10.0,10.0
150,136,Kisumu,KISUMU.,MUSLIM SECONDARY SCHOOL,KIBIGORI SECONDARY SCHOOL.,True,-0.092187,34.763203,True,25869,10.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3552,3583,Trans Nzoia,TRANSNZOIA,NAMANDA PRIMARY SCHOOL,Amka pri. school,True,0.828798,34.852002,False,29948,10.0,10.0
3557,3428,Nakuru,Nakuru,NAKURU EAST,NAKURU,True,-0.287809,36.149345,True,15443,10.0,10.0
3578,3600,Vihiga,Vihiga,EMABWI PRIMARY,Esaba primary,True,0.098961,34.585455,True,36089,10.0,10.0
3586,3613,Nakuru,nakuru,NDIBAI SEC,Nairobi sec,True,-0.500065,36.097987,True,23805,10.0,10.0


In [49]:
m23sw.match_df('migunde primary', n=5, county='Homa Bay', distance_from=(-0.642830, 34.449650))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
3500,MIGUNDE,Primary,Public,Homa Bay,RACHUONYO NORTH,GOT OYARO,Karachuonyo,North Karachuonyo,34.5188,-0.34753,"Ministry of Education, 2016",POINT (851.200 9961468.908),3500,Homa Bay,0.0,Homa Bay,33.724104,1.0
3498,MIGENI,Primary,Public,Homa Bay,RACHUONYO NORTH,HOMA,Karachuonyo,West Karachuonyo,34.488,-0.40752,"Ministry of Education, 2016",POINT (-2583.200 9954815.816),3498,Homa Bay,0.0,Homa Bay,26.510501,0.383533
29936,MIGERA ACADEMY PRY SCH,Primary,,HOMA BAY,,,,KABUOCH NORTH,34.51553,-0.752197,"Open Data Kenya, School 2007",POINT (519.984 9916602.573),29936,Homa Bay,0.0,Homa Bay,14.19673,0.36878
3499,MIGINGO,Primary,Public,Homa Bay,RACHUONYO NORTH,NYAKONGO,Karachuonyo,Central,34.62181,-0.39813,"Ministry of Education, 2016",POINT (12350.792 9955865.009),3499,Homa Bay,0.0,Homa Bay,33.268459,0.356858
3721,ORINDE,Primary,Public,Homa Bay,RACHUONYO SOUTH,RINGA,Kabondo Kasipul,Kojwach,34.79884,-0.51282,"Ministry of Education, 2016",POINT (32112.465 9943164.226),3721,Homa Bay,0.0,Homa Bay,41.43025,0.316959


In [50]:
m23sw.match_df('Matili', n=5, county='Trans Nzoia', distance_from=(1.222506, 34.824607))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
26680,MATISI PRIMARY SCHOOL,Primary,Public,Trans Nzoia,TRANS NZOIA WEST,CENTRAL,Saboti,Matisi,34.975998,1.040638,"Ministry of Education, 2016",POINT (51931.939 10115308.012),26680,Trans Nzoia,0.0,Trans Nzoia,26.310328,0.449619
27741,MATISI,Secondary,Public,Trans Nzoia,TRANS NZOIA WEST,CENTRAL,Saboti,Matisi,34.975486,1.04073,"Ministry of Education, 2016",POINT (51874.847 10115318.280),27741,Trans Nzoia,0.0,Trans Nzoia,26.266079,0.449619
32342,CHEPTILIL PRI SCH,Primary,,TRANS NZOIA,,,,SABOTI,34.827618,0.95096,"Open Data Kenya, School 2007",POINT (35368.482 10105390.904),32342,Trans Nzoia,0.0,Trans Nzoia,30.196436,0.390778
27722,ST PETERS MITO MBILI GIRLS,Secondary,Public,Trans Nzoia,TRANS NZOIA EAST,KACHIBORA,Cherangany,Cherangany/ Suwerwa,35.238807,0.981418,"Ministry of Education, 2016",POINT (81231.648 10108711.969),27722,Trans Nzoia,0.0,Trans Nzoia,53.283375,0.330496
26598,MITO MBILI,Primary,Public,Trans Nzoia,TRANS NZOIA EAST,KACHIBORA,Cherangany,Cherangany/ Suwerwa,35.245904,1.003217,"Ministry of Education, 2016",POINT (82025.707 10111125.743),26598,Trans Nzoia,0.0,Trans Nzoia,52.804397,0.330496


In [51]:
m23sw.match_df('Karima boys', n=5, county='Nyeri', distance_from=(-0.129194, 37.051321))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
18046,KARIMA,Primary,Public,Nyeri,NYERI SOUTH,KARIMA,Othaya,Karima,36.978933,-0.521668,"Ministry of Education, 2016",POINT (275068.146 9942303.865),18046,Nyeri,0.0,Nyeri,44.377242,1.0
24659,KARIMA BOYS,Secondary,Public,Nyeri,NYERI SOUTH,KARIMA,Othaya,Karima,36.982371,-0.517915,"Ministry of Education, 2016",POINT (275450.800 9942719.066),24659,Nyeri,0.0,Nyeri,43.898539,1.0
18077,CONSOLATA KARIMA,Primary,Private,Nyeri,NYERI SOUTH,KARIMA,Mathira,Iriaini,36.987167,-0.51891,"Ministry of Education, 2016",POINT (275984.820 9942609.190),18077,Nyeri,0.0,Nyeri,43.917715,0.538511
24603,KIRIMARA HIGH,Secondary,Public,Nyeri,MATHIRA EAST,MATHIRA EAST,Mathira,Konyu,37.133671,-0.500796,"Ministry of Education, 2016",POINT (292295.269 9944617.450),24603,Nyeri,0.0,Nyeri,42.322745,0.491025
17978,KIRIMARA,Primary,Public,Nyeri,NYERI CENTRAL,MUNICIPALITY NORTH,Nyeri Town,Gatitu/ Muruguru,37.133357,-0.501045,"Ministry of Education, 2016",POINT (292260.264 9944589.859),17978,Nyeri,0.0,Nyeri,42.342242,0.491025


In [52]:
m23sw.match_df('Kipsyenan primary school', n=5, county='Nakuru', distance_from=(-0.377160, 35.503790))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
24027,KIPSYENAN SEC,Secondary,Public,Nakuru,RONGAI,KAMPI YA MOTO,Rongai,Soin,35.934613,-0.095238,"Ministry of Education, 2016",POINT (158735.647 9989458.143),24027,Nakuru,0.0,Nakuru,57.250346,1.0
15669,KIPSYENAN,Primary,Public,Nakuru,RONGAI,MAIKONGENI,Rongai,Soin,35.934142,-0.091901,"Ministry of Education, 2016",POINT (158683.184 9989827.520),15669,Nakuru,0.0,Nakuru,57.410707,1.0
15581,KENANA,Primary,Public,Nakuru,NJORO,NJORO CENTRAL,Njoro,Njoro,35.891144,-0.303215,"Ministry of Education, 2016",POINT (153895.969 9966435.900),15581,Nakuru,0.0,Nakuru,43.8489,0.31161
15430,LENANA,Primary,Public,Nakuru,NAKURU,BARUT,Nakuru Town East,Menengai,36.077851,-0.279944,"Ministry of Education, 2016",POINT (174699.834 9969017.217),15430,Nakuru,0.0,Nakuru,64.740569,0.291122
15156,KIPSAPTA,Primary,Public,Nakuru,KURESOI,TEMOYETTA,Kuresoi North,Kiptororo,35.53145,-0.39651,"Ministry of Education, 2016",POINT (113808.880 9956092.753),15156,Nakuru,0.0,Nakuru,3.753494,0.288422


In [53]:
m23sw.match_df('KIBIGORI SECONDARY SCHOOL.', n=5, county='Kisumu', distance_from=(-0.092187, 34.763203))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
8133,KIBIGORI,Primary,Public,Kisumu,MUHORONI,MIWANI,Muhoroni,Chemelil,35.04475,-0.07503,"Ministry of Education, 2016",POINT (59527.737 9991686.997),8133,Kisumu,0.0,Kisumu,31.364683,1.0
22020,KIBIGORI MIXED SEC.,Secondary,Public,Kisumu,MUHORONI,CHEMELIL,Muhoroni,Chemelil,35.04475,-0.07503,"Ministry of Education, 2016",POINT (59527.737 9991686.997),22020,Kisumu,0.0,Kisumu,31.364683,1.0
8134,KIBIGORI RAILWAY,Primary,Public,Kisumu,MUHORONI,MIWANI,Muhoroni,Chemelil,35.0497,-0.07045,"Ministry of Education, 2016",POINT (60079.826 9992194.489),8134,Kisumu,0.0,Kisumu,31.948587,0.672409
8003,KIBOS,Primary,Public,Kisumu,KISUMU EAST,RAGUMO,Kisumu East,Kolwa Central,34.803737,-0.067823,"Ministry of Education, 2016",POINT (32640.497 9992483.234),8003,Kisumu,0.0,Kisumu,5.258724,0.262512
22059,NDORI B.C. MIXED SEC. SCH,Secondary,Public,Kisumu,NYAKACH,SIGOTI,Nyakach,South East Nyakach,34.94913,-0.35894,"Ministry of Education, 2016",POINT (48869.873 9960226.306),22059,Kisumu,0.0,Kisumu,36.155576,0.262306


In [54]:
m23sw.match_df('Amka pri. school', n=5, county='Trans Nzoia', distance_from=(0.828798, 34.852002))

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
26446,AMUKA,Primary,Public,Trans Nzoia,KWANZA,KAISAGAT,Kwanza,Kwanza,35.022539,1.090907,"Ministry of Education, 2016",POINT (57129.868 10120871.169),26446,Trans Nzoia,0.0,Trans Nzoia,34.769724,0.397261
37901,AMUKA SEC SCH,Secondary,,TRANS NZOIA,,,,KAPOMBOI,35.022122,1.089606,"Open Data Kenya, School 2007",POINT (57083.171 10120727.082),37901,Trans Nzoia,0.0,Trans Nzoia,34.623204,0.397261
26445,AMANI,Primary,Public,Trans Nzoia,KWANZA,CHEPCHOINA,Endebess,Chepchoina,35.18657,1.12211,"Ministry of Education, 2016",POINT (75426.095 10124304.036),26445,Trans Nzoia,0.0,Trans Nzoia,49.470565,0.290294
26552,AMANI,Primary,Public,Trans Nzoia,TRANS NZOIA EAST,MAKUTANO,Cherangany,Kaplamai,35.187482,1.11944,"Ministry of Education, 2016",POINT (75527.355 10124008.117),26552,Trans Nzoia,0.0,Trans Nzoia,49.351935,0.290294
26641,AMAGORO,Primary,Public,Trans Nzoia,TRANS NZOIA WEST,WAITALUK,Kiminini,Waitaluk,35.079398,0.925134,"Ministry of Education, 2016",POINT (63448.714 10102496.636),26641,Trans Nzoia,0.0,Trans Nzoia,27.458081,0.213864


In [55]:
m23sw.match_df('Cheptobot', n=5, county='Trans Nzoia')

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
26568,CHEPTOBOT,Primary,Public,Trans Nzoia,TRANS NZOIA EAST,CHEPSIRO,Cherangany,Cherangany/ Suwerwa,35.2971,1.033933,"Ministry of Education, 2016",POINT (87737.742 10114521.428),26568,Trans Nzoia,0.0,Trans Nzoia,0.0,1.0
37864,CHEPTIR SEC SCH,Secondary,,TRANS NZOIA,,,,MOTOSIET,35.188915,0.953224,"Open Data Kenya, School 2007",POINT (75664.997 10105595.032),37864,Trans Nzoia,0.0,Trans Nzoia,15.006977,0.433921
27724,ST. JOSEPHS MIXED DAY CHEPTIL,Secondary,Public,Trans Nzoia,TRANS NZOIA EAST,CHEPSIRO,Cherangany,Motosiet,34.972358,0.978433,"Ministry of Education, 2016",POINT (51517.372 10108415.861),27724,Trans Nzoia,0.0,Trans Nzoia,36.627784,0.428238
28214,CHEPTIRET PRIMARY,Primary,Public,Trans Nzoia,TRANS NZOIA EAST,KACHIBORA,Cherangany,Cherangany/ Suwerwa,34.87124,0.876,"Ministry of Education, 2016",POINT (40225.136 10097077.997),28214,Trans Nzoia,0.0,Trans Nzoia,50.498863,0.421587
32342,CHEPTILIL PRI SCH,Primary,,TRANS NZOIA,,,,SABOTI,34.827618,0.95096,"Open Data Kenya, School 2007",POINT (35368.482 10105390.904),32342,Trans Nzoia,0.0,Trans Nzoia,53.005392,0.36717


### Conclusion
In all sampled cases, it seems that our algorithm would produce a high-quality match. It seems likely that PV's algorithm did not produce a good match, so the customer was forced to pick among the offered choices; alternatively (but less likely), the customer made a wrong selection.

## County name matching

Let's look at how well we can match county names based on customer input. Since the PV service used slightly different spellings for a few county names, we'll convert those to our new names first.

In [56]:
# Keep only the rows for which the PV service reported a county.
data_county_known = data[data.county.notna()].copy()

# Convert the PV service's county spelling to our canonical county name
# (the first element of the extractOne result is the best-matching name).
data_county_known['county2'] = [
    fuzz_process.extractOne(pv_name, gdf_counties.county)[0]
    for pv_name in data_county_known.county
]

# Fuzzy-match the raw customer input; keep the matched name and its score.
raw_matches = [
    fuzz_process.extractOne(raw_name, gdf_counties.county)
    for raw_name in data_county_known.county_raw
]
data_county_known['county_matched'] = [match[0] for match in raw_matches]
data_county_known['ratio'] = [match[1] for match in raw_matches]

In [57]:
# Inspect the result: county2 is the canonicalized PV county; county_matched and ratio come from the raw customer input
data_county_known

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id,county2,county_matched,ratio
0,7,NANDI,Nandi,SAMOEI BOYS SECONDARY SCHOOL,Samoei boys,True,0.108394,35.169737,True,24123,Nandi,Nandi,100
1,5,TAITA TAVETA,Taveta,,Maho secondary,False,,,True,,Taita Taveta,Taita Taveta,90
2,4,NAROK,Narok,NAROK HIGH,Narok,True,-1.069555,35.864660,True,24229,Narok,Narok,100
3,12,MOMBASA,mombasa,STAR OF THE SEA,star if the sea,True,-4.066130,39.669400,True,13744,Mombasa,Mombasa,100
4,8,NAKURU,Nakuru,BAHATI PCEA GIRLS,Bahati Girls,True,-0.144056,36.169991,True,23959,Nakuru,Nakuru,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3590,3621,UASIN GISHU,Uasin Gishu,MATUNDA RC,Matunda rc,True,0.844360,35.136296,True,19551,Uasin Gishu,Uasin Gishu,100
3591,3618,NYANDARUA,NYANDARUA,,Muungano primary,False,,,True,,Nyandarua,Nyandarua,100
3593,3619,BUNGOMA,Bungoma,MALINDA SA,Malinda Sa,True,0.769722,34.501334,False,1637,Bungoma,Bungoma,100
3594,3622,BUNGOMA,Bungoma,,bridge international academy,False,,,True,,Bungoma,Bungoma,100


In [58]:
# Distribution of the fuzzy-match scores, with extra detail in the upper percentiles
data_county_known.ratio.describe(percentiles=[.75, .85, .95, .98, .99])

count    2635.000000
mean       98.439848
std         3.912056
min        59.000000
50%       100.000000
            ...     
85%       100.000000
95%       100.000000
98%       100.000000
99%       100.000000
max       100.000000
Name: ratio, Length: 11, dtype: float64

In [59]:
# Rows where matching the raw customer input picked a different county than the PV service did
county_disagrees = data_county_known.county2.ne(data_county_known.county_matched)
data_county_known.loc[county_disagrees]

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id,county2,county_matched,ratio
440,252,KIRINYAGA,Embu and kirinyaga,,ngurubani,False,,,True,,Kirinyaga,Embu,90
1115,1089,NAIROBI NORTH,Kajiado north,,Primary,False,,,True,,Nairobi,Kajiado,90
1310,279,KERICHO,Keringet,KERINGET,Keringet estate,True,-0.10039,35.60881,True,5568.0,Kericho,Kirinyaga,59
3412,3589,NAIROBI NORTH,Laikipia North,,Miamoja primary school,False,,,False,,Nairobi,Laikipia,90


In [60]:
# The 20 county matches with the lowest fuzzy-match score
data_county_known.sort_values(by='ratio').head(20)

Unnamed: 0,id,county,county_raw,school,school_raw,school_recognized,lat,long,is_complete,school_id,county2,county_matched,ratio
1310,279,KERICHO,Keringet,KERINGET,Keringet estate,True,-0.100390,35.608810,True,5568,Kericho,Kirinyaga,59
688,2047,VIHIGA,VihC-gC,MUHUNDU PRIMARY SCH.,boyuf primary school,True,0.164430,34.836980,True,30134,Vihiga,Vihiga,62
1159,1086,TAITA TAVETA,Taita Voi,NDILE,Ndile,True,-3.326020,38.473540,False,26257,Taita Taveta,Taita Taveta,67
3430,3416,KERICHO,KIRECHO,SIMBOIYON,SIMBOYON,True,-0.077780,35.594950,True,5596,Kericho,Kericho,71
2285,2232,NYAMIRA,NYAMARAGA,,1,,,,False,,Nyamira,Nyamira,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3304,3579,UASIN GISHU,washi gishu,TUIYO,Tuiyo premary,True,0.480899,35.169098,True,25122,Uasin Gishu,Uasin Gishu,82
16,13,HOMA BAY,Honda Bay,,tangatanga,False,,,True,,Homa Bay,Homa Bay,82
2501,2553,NYANDARUA,Nyandawa,NYANDARUA HIGH,Nyandarua haye,True,-0.198629,36.376038,True,24462,Nyandarua,Nyandarua,82
867,804,KISUMU,kisumo,MUHORONI FACTORY,muhoroni primary,True,-0.151335,35.174371,True,8161,Kisumu,Kisumu,83


### Conclusion
Looks like matching using the fuzzywuzzy `process` module works pretty well.

# Distance matrix matching improvement

In [61]:
# First, let's see how frequently a school falls within a short distance of a county border.
# A school is counted when it lies within the given radius (in meters) of more than one county;
# the result is (count for 3km, count for 1km).
tuple(((distances < radius_m).sum(axis=0) > 1).sum() for radius_m in (3000, 1000))

(8825, 3123)

About 8,800 schools fall within 3km of a county border; of those, about 3,100 fall within 1km. That's a significant percentage of our dataset, and implies that when offering matches, we should also include schools falling within a short distance of the selected county.

In [62]:
# Construct the county-school membership matrix. Schools closer than 7.5km to a county get a
# weight that drops linearly from 1 (distance 0) towards 0.7 (distance 7.5km); all others get 0.
within_cutoff = distances < 7500
border_weight = 1 - distances * 0.3 / 7500
county_school_matrix = np.where(within_cutoff, border_weight, 0)

In [63]:
# Most county/school pairs have weight 0, so keep the matrix in sparse CSR form
county_school_matrix = scipy.sparse.csr_matrix(county_school_matrix)
county_school_matrix

<47x37930 sparse matrix of type '<class 'numpy.float64'>'
	with 62855 stored elements in Compressed Sparse Row format>

In [64]:
# Construct a new matcher that uses the county-school membership matrix
# instead of the strict county assignment.
m23sw_m = Matcher(
    ngram_range=(2, 3),
    df=gdf_schools,
    counties=gdf_counties.county,
    stop_words=stop_words,
    county_school_matrix=county_school_matrix,
)

In [65]:
# Let's pick a school that is a few km (between 4 and 5km) away from some county.
# argwhere returns (row, column) index pairs of the distance matrix; we take the first pair
# and use it below as (county position, school position).
county_id, school_id = np.argwhere((4000 < distances) & (distances < 5000))[0]

In [66]:
# Name and (canonical) county of the picked school, looked up by row position
gdf_schools[['name', 'county']].iloc[school_id]

name             CHEGILET
county    Elgeyo Marakwet
Name: 2206, dtype: object

In [67]:
# county_id is a row *position* in the distance matrix (it comes from np.argwhere), so the
# county name must be looked up by position; plain [] indexing is label-based and is only
# right while gdf_counties keeps its default 0..n-1 index.
gdf_counties.county.iloc[county_id]

'Baringo'

In [68]:
# The strict county matcher finds the school (despite the misspelled query 'cheglet')
# under its own county, 'Elgeyo Marakwet'...
m23sw.match_df('cheglet', county='Elgeyo Marakwet', n=3)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
2206,CHEGILET,Primary,Public,Elgeyo Marakwet,KEIYO,KAPTUM,Keiyo North,Emsoo,35.59659,0.83061,"Ministry of Education, 2016",POINT (121100.911 10091970.707),2206,Elgeyo Marakwet,0.0,Elgeyo Marakwet,0.0,0.590961
25659,CHEGILET SECINDARY SCHOOL,Secondary,Public,Elgeyo Marakwet,KEIYO,KAPTUM,Keiyo North,Emsoo,35.601344,0.827696,"Ministry of Education, 2016",POINT (121630.519 10091647.549),25659,Elgeyo Marakwet,0.0,Elgeyo Marakwet,0.619978,0.394966
2205,CHEBONET,Primary,Public,Elgeyo Marakwet,KEIYO,KAMARINY,Keiyo North,Kamariny,35.46816,0.65398,"Ministry of Education, 2016",POINT (106769.339 10072422.938),2205,Elgeyo Marakwet,0.0,Elgeyo Marakwet,24.282736,0.277454


In [69]:
# ... but not under Baringo, the county picked above that lies a few km away from the school
m23sw.match_df('cheglet', county='Baringo', n=3)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
475,LEGETETWET,Primary,Public,Baringo,MOGOTIO,SOI,Mogotio,Mogotio,35.897977,0.037486,"Ministry of Education, 2016",POINT (154652.686 10004149.450),475,Baringo,0.0,Baringo,0.0,0.210319
146,CHEMURA,Primary,Public,Baringo,BARINGO NORTH,MUCHUKWO,Baringo North,Barwessa,35.62408,0.586429,"Ministry of Education, 2016",POINT (124145.213 10064931.477),146,Baringo,0.0,Baringo,68.215731,0.189518
22,KAPCHEREBET,Primary,Public,Baringo,BARINGO CENTRAL,KABARNET,Baringo Central,Kapropita,35.7192,0.45235,"Ministry of Education, 2016",POINT (134739.424 10050080.907),22,Baringo,0.0,Baringo,50.231711,0.181023


In [70]:
# However, the matcher using the distance matrix will find it under Baringo too
# (with a lower score than under its own county - presumably scaled by the membership weight)...
m23sw_m.match_df('cheglet', county='Baringo', n=5)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
2206,CHEGILET,Primary,Public,Elgeyo Marakwet,KEIYO,KAPTUM,Keiyo North,Emsoo,35.59659,0.83061,"Ministry of Education, 2016",POINT (121100.911 10091970.707),2206,Elgeyo Marakwet,0.0,Elgeyo Marakwet,0.0,0.491829
25659,CHEGILET SECINDARY SCHOOL,Secondary,Public,Elgeyo Marakwet,KEIYO,KAPTUM,Keiyo North,Emsoo,35.601344,0.827696,"Ministry of Education, 2016",POINT (121630.519 10091647.549),25659,Elgeyo Marakwet,0.0,Elgeyo Marakwet,0.619978,0.333457
475,LEGETETWET,Primary,Public,Baringo,MOGOTIO,SOI,Mogotio,Mogotio,35.897977,0.037486,"Ministry of Education, 2016",POINT (154652.686 10004149.450),475,Baringo,0.0,Baringo,94.343845,0.210319
146,CHEMURA,Primary,Public,Baringo,BARINGO NORTH,MUCHUKWO,Baringo North,Barwessa,35.62408,0.586429,"Ministry of Education, 2016",POINT (124145.213 10064931.477),146,Baringo,0.0,Baringo,27.323224,0.189518
22,KAPCHEREBET,Primary,Public,Baringo,BARINGO CENTRAL,KABARNET,Baringo Central,Kapropita,35.7192,0.45235,"Ministry of Education, 2016",POINT (134739.424 10050080.907),22,Baringo,0.0,Baringo,44.21482,0.181023


In [71]:
# ...but it won't find the school under distant counties, such as Nairobi
m23sw_m.match_df('cheglet', county='Nairobi', n=3)

Unnamed: 0,name,LEVEL,Status,County,DISTRICT,ZONE,SUB_COUNTY,Ward,long,lat,Source,geometry,id,county_computed,distance_computed,county,dist,score
14964,CHELETA,Primary,Public,Nairobi,WESTLANDS,PARKLANDS,Westlands,Karura,36.822975,-1.222174,"Ministry of Education, 2016",POINT (257747.695 9864814.589),14964,Nairobi,0.0,Nairobi,0.0,0.395634
36164,CHELETE PRI SCH,Primary,,NAIROBI NORTH,,,,KARURA,36.822173,-1.219351,"Open Data Kenya, School 2007",POINT (257658.188 9865126.827),36164,Nairobi,0.0,Nairobi,0.32632,0.390017
34077,CHILDREN ANGLES OF GOD ACADEMY PRI,Primary,,NAIROBI WEST,,,,GATINA,36.75578,-1.27757,"Open Data Kenya, School 2007",POINT (250271.934 9858680.806),34077,Nairobi,0.0,Nairobi,9.682108,0.235306


# Curating the schools dataset

This final section is self-sufficient - one should be able to start a fresh kernel and execute just this section.
The end result is a cleaned-up schools dataset ready to be used for matching, along with the distance matrix.

In [72]:
import pandas as pd
import geopandas as gpd
import geohash
from geopy.distance import great_circle
import re
import numpy as np
import shapefile
import scipy
import scipy.sparse

In [73]:
# shape file data obtained from https://datacatalog.worldbank.org/dataset/kenya-schools
# dataset includes primary and secondary school, combined from two different data sources
# note that primary school data has ward and sub-county info, but secondary school data has only county and lat/long marked up
#!wget https://energydata.info/dataset/2fda191d-c3c6-4002-8c82-daa02008a9e3/resource/849830e2-fcb5-4b42-8d33-e42c7c1e90b4/download/schools.zip
#!unzip -n schools.zip   

In [74]:
# Load the schools shapefile, give the name/coordinate columns friendlier names, keep only the
# columns needed for matching, and drop records that repeat the same name, County and coordinates.
gdf_schools = (
    gpd.read_file('Schools/Schools.shp')
    .rename(columns={'SCHOOL_NAM': 'name', 'X_Coord': 'long', 'Y_Coord': 'lat'})
    .drop(columns=['OBJECTID', 'CODE', 'LEVEL', 'Status', 'DISTRICT', 'ZONE', 'Source'])
    .drop_duplicates(subset=['name', 'County', 'lat', 'long'])
)
gdf_schools

Unnamed: 0,name,County,SUB_COUNTY,Ward,long,lat,geometry
0,BAKWANIN,Baringo,Baringo Central,Sacho,35.797080,0.409550,POINT (143417.238 10045338.886)
1,BEKIBON,Baringo,Baringo South,Marigat,35.884060,0.336400,POINT (153107.652 10037237.735)
2,BOKORIN,Baringo,Baringo Central,Kapropita,35.771770,0.532180,POINT (140602.763 10058916.014)
3,BOROWONIN,Baringo,Baringo Central,Kapropita,35.778640,0.444870,POINT (141363.771 10049249.854)
4,BOSIN,Baringo,Baringo Central,Sacho,35.795450,0.438090,POINT (143236.887 10048498.462)
...,...,...,...,...,...,...,...
37925,SALVATION ARMY NAWOITORONG SEC SCH,TURKANA,,KANAMKEMER,35.620267,3.113988,POINT (124253.791 10344795.181)
37926,MOYALE ODDA MILITARY CAMP SEC,MARSABIT,,BUTIYE,39.090567,3.463152,POINT (510059.563 10382787.508)
37927,OBBU SEC,MARSABIT,,SOLOLO,38.644848,3.546583,POINT (460555.403 10392016.569)
37928,NAPATA REFUGEE SEC SCH,TURKANA,,LOPUR,34.835844,3.739390,POINT (37206.754 10414419.116)


In [75]:
#!mkdir -p counties/arcgis
#!cd counties/arcgis && curl --compressed 'https://prod-hub-indexer.s3.amazonaws.com/files/071bc497268b4643b68fcdbde2b13a7e/0/full/4326/071bc497268b4643b68fcdbde2b13a7e_0_full_4326.zip' > data.zip && unzip data.zip

In [76]:
# Load the county boundaries and reproject them into the same CRS as the schools dataset,
# so that geometries from the two datasets can be compared directly.
gdf_counties = (
    gpd.read_file('counties/arcgis/GIS_GISADMIN_IEBC_counties.shp')
    .rename(columns={'COUNTY_NAM': 'county'})
    .to_crs(gdf_schools.crs)
)
gdf_counties.head()

Unnamed: 0,FID,UNIT_AREA,UNIT_PERIM,DISTRICT,COUNT_,county,CODE,SHAPE_Leng,SHAPE_Area,geometry
0,1,0.69621,5.468653,Baringo,172.0,Baringo,30,6.533348,0.884748,"POLYGON ((142052.029 10183272.681, 142122.980 ..."
1,2,0.11007,2.030379,Bomet,109.0,Bomet,36,3.151607,0.193246,"POLYGON ((107391.541 9955787.802, 107892.876 9..."
2,3,0.161656,1.858956,Siaya,49.0,Siaya,41,2.959541,0.286025,"POLYGON ((-27463.364 10034163.846, -27402.887 ..."
3,4,0.167501,2.892534,Bungoma,108.0,Bungoma,39,3.198102,0.245166,"POLYGON ((11989.362 10122424.631, 11999.559 10..."
4,5,0.106271,2.197821,Kericho,106.0,Kericho,35,3.765482,0.209301,"POLYGON ((107386.433 9955798.869, 107372.272 9..."


In [77]:
# Assign the canonical county name (spelling used in gdf_counties) to each school, where possible.
from fuzzywuzzy import process as fuzz_process

# Fuzzy-match every distinct, non-empty County value once.
schools_county_names_map = {}
for raw_county in gdf_schools.County.unique():
    if raw_county:
        best_match = fuzz_process.extractOne(raw_county, gdf_counties.county)
        schools_county_names_map[raw_county] = best_match[0]

# Lowercase 'county' holds the normalized name; schools without a usable County value get ''.
gdf_schools['county'] = [
    schools_county_names_map.get(raw_county, '') for raw_county in gdf_schools.County
]

In [78]:
# Now we compute the distance matrix between each county and each school. We'll use this to assign schools to counties, but also to find cases
# where a school is close to the border of two or more counties. All distances are in meters.

In [79]:
import multiprocessing

def dist(cg):
    """Return the distance from every school in the global `gdf_schools` to the geometry `cg`."""
    return gdf_schools.distance(cg)
# One task per county geometry, run in a process pool. Stacking the per-county results gives
# an array with one row per county and one column per school.
# NOTE(review): `dist` reads the global `gdf_schools` inside the worker processes - presumably
# this relies on the workers inheriting the notebook's globals (fork start method); confirm
# before running on a platform that spawns workers instead.
with multiprocessing.Pool() as pool:
    distances = np.array(pool.map(dist, gdf_counties.geometry))

In [80]:
# Assign a computed county to each record by taking the closest county.
# argmin/min over axis 0 run across the county rows of `distances`, giving one value per school.
# argmin yields row *positions*, so the county names are looked up with .iloc; label-based []
# indexing would only be right while gdf_counties keeps its default 0..n-1 index.
gdf_schools['county_computed'] = gdf_counties.county.iloc[distances.argmin(axis=0)].to_numpy()
gdf_schools['distance_computed'] = distances.min(axis=0)
gdf_schools

Unnamed: 0,name,County,SUB_COUNTY,Ward,long,lat,geometry,county,county_computed,distance_computed
0,BAKWANIN,Baringo,Baringo Central,Sacho,35.797080,0.409550,POINT (143417.238 10045338.886),Baringo,Baringo,0.0
1,BEKIBON,Baringo,Baringo South,Marigat,35.884060,0.336400,POINT (153107.652 10037237.735),Baringo,Baringo,0.0
2,BOKORIN,Baringo,Baringo Central,Kapropita,35.771770,0.532180,POINT (140602.763 10058916.014),Baringo,Baringo,0.0
3,BOROWONIN,Baringo,Baringo Central,Kapropita,35.778640,0.444870,POINT (141363.771 10049249.854),Baringo,Baringo,0.0
4,BOSIN,Baringo,Baringo Central,Sacho,35.795450,0.438090,POINT (143236.887 10048498.462),Baringo,Baringo,0.0
...,...,...,...,...,...,...,...,...,...,...
37925,SALVATION ARMY NAWOITORONG SEC SCH,TURKANA,,KANAMKEMER,35.620267,3.113988,POINT (124253.791 10344795.181),Turkana,Turkana,0.0
37926,MOYALE ODDA MILITARY CAMP SEC,MARSABIT,,BUTIYE,39.090567,3.463152,POINT (510059.563 10382787.508),Marsabit,Marsabit,0.0
37927,OBBU SEC,MARSABIT,,SOLOLO,38.644848,3.546583,POINT (460555.403 10392016.569),Marsabit,Marsabit,0.0
37928,NAPATA REFUGEE SEC SCH,TURKANA,,LOPUR,34.835844,3.739390,POINT (37206.754 10414419.116),Turkana,Turkana,0.0


A handful of schools have a closest distance > 0, which means they don't fall into any county. However, the distance is generally quite small, and these seem to be schools on the country border.

In [81]:
# Summary of the closest-county distance for the schools that fall outside every county polygon
outside_all_counties = gdf_schools.distance_computed > 0
gdf_schools.loc[outside_all_counties, 'distance_computed'].describe()

count      15.000000
mean      364.876741
std       327.207352
min        72.691738
25%       120.152987
50%       271.568205
75%       537.056420
max      1289.874248
Name: distance_computed, dtype: float64

In [82]:
# Records without a labelled county (empty string) take the geometrically computed county instead
county_missing = gdf_schools.county == ''
gdf_schools['county'] = gdf_schools.county.mask(county_missing, gdf_schools.county_computed)

In [83]:
# Compute the distance between each school and the county it is labelled with.
gdfc = gdf_counties.set_index('county')

# Row position of every school's county in the distance matrix (rows follow gdf_counties order).
county_pos = gdfc.index.get_indexer(gdf_schools.county)
if (county_pos < 0).any():
    # Keep failing loudly, as the label-based lookup did, if a county name is unknown.
    raise KeyError('some schools refer to a county that is not present in gdf_counties')

# These distances are already part of `distances` (one row per county, one column per school),
# so pick them out by position instead of re-measuring every school row by row.
gdf_schools['distance_orig'] = distances[county_pos, np.arange(len(gdf_schools))]

In [84]:
# Keep only the schools that lie less than 20 km from the county they are labelled with,
# and drop the helper columns that are not needed for matching.
mask = gdf_schools.distance_orig < 20000
helper_columns = ['geometry', 'county_computed', 'distance_computed', 'distance_orig']
df_schools_final = gdf_schools.loc[mask].drop(columns=helper_columns)

# County-school membership weights for the kept schools only: 1 at distance 0, dropping
# linearly towards 0.7 at 7.5km, and 0 from 7.5km on. Stored in sparse CSR form.
# NOTE(review): a kept school lying 7.5-20km from its labelled county gets weight 0 for that
# county - confirm that the matcher handles such schools as intended.
kept_distances = distances[:, mask.to_numpy()]
county_school_matrix = scipy.sparse.csr_matrix(
    np.where(kept_distances < 7500, 1 - kept_distances * 0.3 / 7500, 0)
)

We save everything needed to recreate the model. Note that we save the list of counties not because their precise spellings matter, but because their order matters: the order of counties passed to the `Matcher` constructor must match the order of the county rows in the `county_school_matrix` saved below.

In [86]:
# Save results
# df_schools_final.to_csv('df_schools.csv', index=False)
# scipy.sparse.save_npz('county_school_matrix', county_school_matrix)
# gdf_counties.county.to_csv('counties.csv', index=False)