In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [35]:
def convert_html_to_df(html, max_results=25):
    """Convert one parsed search-results page into a DataFrame of schools.

    Result cards are looked up by id ``search-result__description--<i>`` for
    ``i = 0 .. max_results - 1``; the scan stops at the first missing id.

    Parameters
    ----------
    html : object
        Parsed page (e.g. a ``BeautifulSoup`` instance). It must support
        ``find(id=...)``; every result it returns must support ``find('h2')``
        and ``find_all('p')``, whose items expose a ``.text`` attribute.
    max_results : int, default 25
        Maximum number of result cards to read from the page.

    Returns
    -------
    pandas.DataFrame
        One row per result with columns ``ranking``, ``school``, ``district``,
        ``rating``, ``grade``, ``student`` and ``ratio``. ``ranking`` is -1
        and the other parsed fields are ``None`` when the matching paragraph
        is absent from the card.
    """
    columns = {
        'ranking': [],
        'school': [],
        'district': [],
        'rating': [],
        'grade': [],
        'student': [],
        'ratio': [],
    }

    for i in range(max_results):
        result = html.find(id=f'search-result__description--{i}')
        if result is None:
            print(f'less than {max_results} results on this page: i = {i}')
            break

        row = {
            'ranking': -1,
            'school': result.find('h2').text,
            'district': None,
            'rating': None,
            'grade': None,
            'student': None,
            'ratio': None,
        }

        for paragraph in result.find_all('p'):
            text = paragraph.text
            # Every recognised paragraph carries one trailing punctuation
            # character, which the slices below drop together with the label.
            if text.endswith(', NJ,'):
                row['district'] = text[:-5]
            elif text.startswith('Overall Niche Grade:'):
                row['rating'] = text[21:-1]
            elif text.startswith('Students:'):
                row['student'] = int(text[10:-1].replace(',', ''))
            elif text.startswith('Student-Teacher Ratio:'):
                row['ratio'] = text[23:-1]
            elif text.endswith('Best Public Elementary Schools in New Jersey.'):
                # Skip the leading '#' and the 46-character
                # ' Best Public ... New Jersey.' tail. Thousands separators
                # are removed so ranks such as '#1,234' parse, matching how
                # the student count is handled.
                row['ranking'] = int(text[1:-46].replace(',', ''))
            elif (
                'Niche users give it an average review of' in text
                or text == 'Blue checkmark.'
                or text == 'Public School,'
                or text.startswith('Featured Review: ')
                or (text.startswith('Read') and text.endswith('reviews.'))
            ):
                # Known boilerplate that carries no field of interest.
                continue
            else:
                # Catch-all: any paragraph not matched above is stored as the
                # grade span, so the last unmatched paragraph wins.
                # NOTE(review): an unexpected paragraph will overwrite the
                # real grade span; confirm against the live markup.
                row['grade'] = text[:-1]

        for key, value in row.items():
            columns[key].append(value)

    return pd.DataFrame(columns)

In [36]:
def convert_url_to_df(url, headers=None, timeout=30):
    """Download one search-results page and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the results page to fetch.
    headers : dict, optional
        Request headers. When omitted, the browser-like headers recorded
        below (user agent, referer and a captured cookie) are sent.
    timeout : float or tuple, default 30
        Passed to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        The frame built by ``convert_html_to_df``, or ``None`` when the
        request fails or the response status is not 200 (the problem is
        printed together with the URL).
    """
    if headers is None:
        # NOTE(review): this is a captured browser session cookie embedded in
        # the notebook. It is presumably expired by now and would be better
        # supplied through the ``headers`` argument or an environment
        # variable - confirm before sharing the notebook.
        # The cookie is assembled from adjacent literals so that it forms a
        # single header value with no embedded line breaks.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            'referer': 'https://www.niche.com/k12/search/best-public-elementary-schools/s/new-jersey/',
            'cookie': (
                'xid=b3c20c19-42db-4094-8d50-7142d9a23143; '
                '_gcl_au=1.1.1764786410.1656946529; '
                'niche_npsSurvey=0; '
                'niche_fullStory=0; '
                'niche_singleFirstPageview=1; '
                'niche_singleK12Pageview=1; '
                'ab.storage.deviceId.97a5be8e-e2ba-4f2c-9159-9ae910fa9648=%7B%22g%22%3A%225c9a69cf-c5ab-baad-f527-456aadad4cb1%22%2C%22c%22%3A1656946530441%2C%22l%22%3A1656946530441%7D; '
                '_rdt_uuid=1656946530523.7700245a-62e8-4b91-925f-c8a35a497bb1; '
                '_tt_enable_cookie=1; '
                '_ttp=04063741-b25b-4435-91e2-dc86a96924d0; '
                'pxcts=593ceeca-fba9-11ec-895c-4b6172526250; '
                '_pxvid=593ce386-fba9-11ec-895c-4b6172526250; '
                '_scid=4af15d02-9fe8-4cb3-aeeb-ae86ca8e1cc8; '
                '__qca=P0-1408674187-1656946531008; '
                'niche_singleCollegePageview=1; '
                'niche_singlePTLPageview=1; '
                'niche_singleProfilePageview=1; '
                'experiments=profile_sticky_header_cta%7Ccontrol%7Chomepage_hero_ctas%5E%5E%5E%240%7C1%7C2%7C1%5D; '
                '_sctr=1|1659412800000; '
                '_clck=1yx3ysj|1|f3u|0; '
                'niche_cookieConsent=true; '
                '_gid=GA1.2.1766029719.1659930534; '
                'navigation=%7B%22location%22%3A%7B%22guid%22%3A%22ed4102c4-b16f-4ccd-8d8b-ccfeb8fa21ea%22%2C%22type%22%3A%22State%22%2C%22name%22%3A%22New%20Jersey%22%2C%22url%22%3A%22new-jersey%22%7D%2C%22navigationMode%22%3A%22full%22%2C%22vertical%22%3A%22k12%22%2C%22mostRecentVertical%22%3A%22k12%22%2C%22suffixes%22%3A%7B%22colleges%22%3A%22%2Fs%2Fnew-jersey%2F%22%2C%22graduate-schools%22%3A%22%2Fs%2Fnew-jersey%2F%22%2C%22k12%22%3A%22%2Fs%2Fnew-jersey%2F%22%2C%22places-to-live%22%3A%22%2Fs%2Fnew-jersey%2F%22%2C%22places-to-work%22%3A%22%2Fs%2Fnew-jersey%2F%22%7D%7D; '
                'recentlyViewed=entityHistory%7CentityName%7CEdith%2BA.%2BBogert%2BElementary%2BSchool%7CentityGuid%7Cc3f1e481-e7a9-45e8-8379-be1cacb6c702%7CentityType%7CSchool%7CentityFragment%7Cedith-a-bogert-elementary-school-upper-saddle-river-nj%7CDeerfield%2BElementary%2BSchool%7C7ad5353c-ff72-42d1-8f94-508d8df39c17%7Cdeerfield-elementary-school-short-hills-nj%7CNut%2BSwamp%2BElementary%2BSchool%7C5f7989d6-bf5c-4f35-959e-ec45a62e5c2a%7Cnut-swamp-elementary-school-middletown-nj%7CPrinceton%7C5c4fa648-274a-4126-9732-60041dff7862%7CTown%7Cprinceton-mercer-nj%7CsearchHistory%7CNew%2BJersey%7Ced4102c4-b16f-4ccd-8d8b-ccfeb8fa21ea%7CState%7Cnew-jersey%7CNew%2BYork%2BCity%2BArea%7C9b68e2cd-0da9-4ec6-ac31-33e2d89920aa%7CMetroArea%7Cnew-york-city-metro-area%7CMercer%2BCounty%7C6c9953c4-b55c-49b7-90ae-6ecd61528343%7CCounty%7Cmercer-county-nj%5E%5E%5E%240%7C%40%241%7C2%7C3%7C4%7C5%7C6%7C7%7C8%5D%7C%241%7C9%7C3%7CA%7C5%7C6%7C7%7CB%5D%7C%241%7CC%7C3%7CD%7C5%7C6%7C7%7CE%5D%7C%241%7CF%7C3%7CG%7C5%7CH%7C7%7CI%5D%5D%7CJ%7C%40%241%7CK%7C3%7CL%7C5%7CM%7C7%7CN%5D%7C%241%7CO%7C3%7CP%7C5%7CQ%7C7%7CR%5D%7C%241%7CS%7C3%7CT%7C5%7CU%7C7%7CV%5D%5D%5D; '
                '_pxff_rf=1; '
                '_pxff_fp=1; '
                '_px3=88457ccddb09b720a170979208e5eb6ea985f132182af9d072e8940eee869deb:DYCNjd640VicSVFmIS+v8kaSLivrLLJGWBgSaJ3MB8O/+0gHap7l4zF0RwbtCDzaNzyJzIHeQ1o527wCtVC3/Q==:1000:RGmL4DHscgIliGrIsT0/cBvhb1NwsF0Vo74zxm92AykLxIj1FMFtSY4XKg6mJ4nSwXSdrZ+6qF8aPYoR7RWA1DH0t07/fwN5RXlELaf4Fvl0p+KkL1C9jjo9pjQQUbRP3g99ykhpKztfCbgOVy6iJYtZQcjlMmuY9OajDlKWZpaI+KssPhRRq6r8r1UbFgnUNmazjTJEglJ1gF2S0NTSWQ==; '
                'niche_sessionPageCount=671; '
                'ab.storage.sessionId.97a5be8e-e2ba-4f2c-9159-9ae910fa9648=%7B%22g%22%3A%22193435aa-e77e-bd96-41e2-db7f34eccd24%22%2C%22e%22%3A1659973723131%2C%22c%22%3A1659970641548%2C%22l%22%3A1659971923131%7D; '
                '_ga=GA1.2.1464354257.1656946530; '
                '_dc_gtm_UA-2431522-39=1; '
                '_uetsid=05b17d7016cd11ed88bc958605365a04; '
                '_uetvid=58d37b10fba911ec9be649c2d4f458c1; '
                '_clsk=86bou1|1659971924341|14|1|n.clarity.ms/collect; '
                '_ga_4TVMRNQ02W=GS1.1.1659970639.36.1.1659971928.0'
            ),
        }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as bad status codes so one
        # unreachable page does not abort a multi-page scrape.
        print(exc, url)
        return None

    if response.status_code != 200:
        print(response.status_code, url)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    return convert_html_to_df(soup)

In [16]:
url_base = (
    'https://www.niche.com/k12/search/best-public-elementary-schools/s/new-jersey/?'
)
# Number of result pages to request; pages are numbered from 1.
# NOTE(review): 64 pages at up to 25 results each matches the 1,587 rows in
# the recorded output - re-check if the listing grows.
NUM_PAGES = 64
urls = [f'{url_base}page={page}' for page in range(1, NUM_PAGES + 1)]

# convert_url_to_df returns None for a page that could not be fetched. Drop
# those here and report how many were lost, instead of relying on pd.concat
# to discard them silently later.
fetched = [convert_url_to_df(url) for url in urls]
dfs = [page_df for page_df in fetched if page_df is not None]
if len(dfs) < len(urls):
    print(f'{len(urls) - len(dfs)} of {len(urls)} pages could not be loaded')

In [40]:
# Stack the per-page frames row-wise and renumber the rows 0..n-1 in one step.
df_all = pd.concat(dfs, ignore_index=True)
# df_all = df_all[df_all.rating != 'A minus']

In [41]:
# Display the combined table of all scraped pages as the cell output.
df_all

Unnamed: 0,ranking,school,district,rating,grade,student,ratio
0,-1,Washington School,Millburn Township School District,A+,5,388,
1,2,Princeton Charter School,PRINCETON,A+,K-8,426,10 to 1
2,3,Alpine Elementary School,ALPINE,A+,K-8,151,7 to 1
3,4,Wyoming Elementary School,Millburn Township School District,A+,K-4,331,12 to 1
4,5,Hartshorn Elementary School,Millburn Township School District,A+,K-4,397,11 to 1
...,...,...,...,...,...,...,...
1583,-1,Ocean Regional School,TOMS RIVER,,2-11,22,1 to 1
1584,-1,NuView Academy,PISCATAWAY,,K-12,21,4 to 1
1585,-1,Regional Day School,MORRISTOWN,,"PK, K-12",17,2 to 1
1586,-1,Union Regional School,SCOTCH PLAINS,,2-12,14,


In [42]:
# Save the combined table to the working directory. The row index is written
# too (to_csv default), so it appears as an unnamed first column in the file.
df_all.to_csv('niche_ranking.csv')

In [None]:
# Split out the schools whose overall rating is exactly 'A+' or exactly 'A'.
df_a_plus = df_all.loc[df_all['rating'].eq('A+')]
df_a = df_all.loc[df_all['rating'].eq('A')]

In [None]:
# Count schools per district for each rating; selecting the 'school' column
# before counting avoids aggregating columns that are thrown away anyway.
a_plus_per_district = df_a_plus.groupby('district')[['school']].count()
a_per_district = df_a.groupby('district')[['school']].count()

In [None]:
# Label each count column with the rating it represents.
a_plus_per_district = a_plus_per_district.rename(columns={'school': 'A+'})
a_per_district = a_per_district.rename(columns={'school': 'A'})

In [None]:
# Outer-join the two count tables on their district index so a district that
# appears in only one of them is still kept (its other count is NaN).
count_per_district = pd.merge(
    a_plus_per_district,
    a_per_district,
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# A missing count means the district has no school with that rating: use 0,
# then convert both count columns back to integers.
count_per_district = count_per_district.fillna(0).astype({'A+': int, 'A': int})

In [None]:
# Score each district: an 'A+' school counts ten times as much as an 'A' school.
count_per_district = count_per_district.assign(
    **{'weighted count': count_per_district['A+'].mul(10).add(count_per_district['A'])}
)

In [None]:
# Order districts from the highest weighted count to the lowest.
count_per_district = count_per_district.sort_values('weighted count', ascending=False)

In [None]:
# Display the per-district counts as the cell output.
count_per_district

In [None]:
# Save the per-district counts to the working directory; the district index
# is written as the first column of the file.
count_per_district.to_csv('count_per_district.csv')