# Scraping the Florida Scorecard Website

In [1]:
import os
from os import path

import requests
import pandas as pd

## Assert folders are in place

In [2]:
folders = [
    '../data/intermediary/scorecard/equity/',
    '../data/input/scorecard/raw_equity/',
    '../data/input/scorecard/enrollment',
    '../data/intermediary/scorecard/characteristics/',
    '../data/intermediary/scorecard/names/',
    '../data/intermediary/scorecard/enrollment/'
]

for folder in folders:
    if path.exists(folder):
        print("{folder} is already here!".format(folder=folder))
    else:
        try:
            os.makedirs(folder)
        except OSError:
            print("I couldn't make {folder}!".format(folder=folder))
        else:
            print("{folder} successfully made!".format(folder=folder))

../data/intermediary/scorecard/equity/ successfully made!
../data/input/scorecard/raw_equity/ successfully made!
../data/input/scorecard/enrollment successfully made!
../data/intermediary/scorecard/characteristics/ successfully made!
../data/intermediary/scorecard/names/ successfully made!
../data/intermediary/scorecard/enrollment/ successfully made!


## Gather scorecard base data

In [3]:
url = "https://edudata.fldoe.org/ReportCards/data/rc_base.csv"
folder = '../data/input/scorecard/'
base_filename = folder + url.rsplit('/', 1)[-1]

if not path.exists(base_filename):
    r = requests.get(url)
    with open(base_filename, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)

## Collect and clean district-specific equity data

In [4]:
districts = [
    "broward",
    'hillsborough',
    'miami-dade'
]   

In [5]:
raw_keys = pd.read_csv(base_filename)
raw_keys = raw_keys[['district_number', 'district_name', 'school_number', 'school_name_l', 'school_name_s']]

for district in districts:
    # Get district information
    keys = raw_keys[raw_keys['district_name'] == district.upper()]
    district_number = keys.iloc[0]['district_number']

    url = 'https://edudata.fldoe.org/ReportCards/data/Edqual/Expeff/{:d}.csv'.format(district_number)
    folder = '../data/input/scorecard/raw_equity/'

    # Download raw csv
    district_file = folder + '{:02d}.csv'.format(district_number)
    
    if not path.exists(district_file):
        online_file = url.format(district_number)
        print(online_file)
        r = requests.get(online_file)
    
        with open(district_file, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)

    raw_equity = pd.read_csv(district_file) 
    
    # Clean and format
    equity = raw_equity.drop_duplicates(subset=['s'])[['s', 'd', 'c']]
    equity = equity[equity['s'] != 0]
    equity['school_number'] = equity['s']
    equity['district_number'] = equity['d']
    equity[['minority','poverty', 'title_i']] = equity.c.apply( 
       lambda x: pd.Series(str(x).split("/"))) 

    equity.drop(['c', 's', 'd'], axis='columns', inplace=True)
    
    # Save cleaned csv
    equity.to_csv('../data/intermediary/scorecard/equity/' + district + '.csv', index=False)
    

https://edudata.fldoe.org/ReportCards/data/Edqual/Expeff/6.csv
https://edudata.fldoe.org/ReportCards/data/Edqual/Expeff/29.csv
https://edudata.fldoe.org/ReportCards/data/Edqual/Expeff/13.csv


## Collect and clean Miami-Dade 2020 enrollment data

In [6]:
url = 'https://edudata.fldoe.org/ReportCards/data/Enrollment/13.csv'
folder = '../data/input/scorecard/enrollment/'

district_file = folder + 'miami_dade.csv'
    
if not path.exists(district_file):
    r = requests.get(url)

    with open(district_file, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)
enrollment = pd.read_csv(district_file) 
    
enrollment = enrollment[enrollment.y == 1920][
    enrollment.g.str.contains('Total')][
    enrollment.s != 0][['s', 'enn']]
    
enrollment = enrollment.rename(columns={
    's': 'school_number',
    'enn': 'total_students'
})

enrollment.to_csv('../data/intermediary/scorecard/enrollment/' + 'miami-dade' + '.csv', index=False)

  from ipykernel import kernelapp as app
  app.launch_new_instance()


## Clean additional district-specific characteristic data

In [7]:
def get_grade(row):
    grade = row['grade_1819']
    pip = row['improvement_rating']
    
    if grade in ['A', 'B']:
        return grade
    elif grade in ['C', 'D', 'F']:
        return 'C or below'
    elif pip in ['C', 'I', 'M', 'U']:
        return 'Improvement Plan'
    else:
        return "N/A"

In [8]:
raw_keys = pd.read_csv(base_filename)

for district in districts:
    keys = raw_keys[raw_keys['district_name'] == district.upper()]
    characteristics = keys[['district_number', 'school_number', 'school_type', 'alt_schl', 'charter', 'grade_1819', 'improvement_rating', 'federal_index', 'CEP_Percentage']]
    characteristics['grade'] = characteristics.apply(lambda row : get_grade(row), axis = 1)
    columns = [
        'district_number',
        'school_number',
        'school_type',
        'grade',
        'charter',
        'alt_schl',
        'federal_index',
        'CEP_Percentage'
    ]
    characteristics[columns].to_csv('../data/intermediary/scorecard/characteristics/' + district + '.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Gather school names for crosswalk development

In [9]:
raw_keys = pd.read_csv(base_filename)

for district in districts:
    raw_keys = raw_keys[['district_number', 'district_name', 'school_number', 'school_name_l', 'school_name_s']]
    keys = raw_keys[raw_keys['district_name'] == district.upper()]
    keys.to_csv('../data/intermediary/scorecard/names/' + district + '.csv', index=False)