In [1]:
import pandas as pd
import time

In [4]:
from ucd_sta_221_project.api.college_scorecard import (
    get_scorecards_by_state,
    get_latest_student_scorecard_data_by_state,
    get_scorecard_by_college,
)

from ucd_sta_221_project.api.cccco import (
    get_ccc_colleges,
    get_ccc_districts,
    get_ccc_programs
)

# Exploring the API

In [3]:
import os
import requests

url = "https://api.data.gov/ed/collegescorecard/v1/schools"

# Candidate field filter. Requesting a parent key (e.g. "2018.student.enrollment")
# returns every element nested under it, so leaves such as
# "2018.student.enrollment.undergrad_12_month" need not be listed one by one.
fields = [
    "school.name",
    "2018.student.size",
    "2018.student.enrollment",
    # "2018.student.demographics.over_23_at_entry",
    # "2018.student.demographics.first_generation",
    # "2018.student.demographics.median_hh_income",
    # "2018.student.demographics.student_faculty_ratio",
    # "2018.student.FAFSA_applications",
]

params = {
    "api_key": os.getenv("SCORECARDAPI"),
    "school.name": "Solano College",
    # Left disabled on purpose: the cells below drill into the full record
    # ("latest", "school", "location", ...). Enable to restrict the response.
    # "fields": ",".join(fields)
}

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP failure (e.g. a rejected API key) here
# rather than as a KeyError on "metadata" further down.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()  # parse the body once and reuse it
print(data["metadata"])

{'page': 0, 'total': 1, 'per_page': 20}


In [12]:
data_results = data.get("results")[0]
data_results

{'2018.student.size': 7972,
 '2018.student.enrollment.all': None,
 '2018.student.enrollment.undergrad_12_month': 13835,
 '2018.student.enrollment.grad_12_month': None,
 'school.name': 'Solano Community College'}

In [14]:
# Shape of the response, as recorded from earlier inspection: `data` has the
# keys 'metadata' and 'results', and 'results' is a list of dicts (one here).
# data.keys() # dict_keys(['metadata', 'results'])
# len(data.get("results")) # 1 - list of dicts

# Work with the first (only) school record from here on.
data_results = data.get("results")[0]
data_results.keys() # dict_keys(['latest', 'school', 'location', 'id', 'ope6_id', 'ope8_id', 'fed_sch_cd'])

dict_keys(['latest', 'school', 'location', 'id', 'ope6_id', 'ope8_id', 'fed_sch_cd'])

In [15]:
# Drill down into the values of data_results: one variable per top-level key
# of the school record, with a note on what each value was observed to hold.

# "latest": dict whose keys in this response were
#     'school', 'student', 'cost', 'aid', 'earnings', 'completion',
#     'repayment', 'admissions', 'academics', 'programs'
# (inspect with data_results_latest.keys())
data_results_latest = data_results.get("latest")

# "school": school metadata
data_results_school = data_results.get("school")

# "location": lat-long dict
data_results_location = data_results.get("location")

# "id": int
data_results_id = data_results.get("id")

# "ope6_id": str OPE6 ID
data_results_ope6_id = data_results.get("ope6_id")

# "ope8_id": str OPE8 ID
data_results_ope8_id = data_results.get("ope8_id")

# "fed_sch_cd": str Federal School Code
data_results_fed_sch_cd = data_results.get("fed_sch_cd")

In [16]:
# `Results > Latest > School` contains more info than `Results > School`.
# Print both key views side by side (the `=` f-string specifier echoes the
# expression text, so the padding inside the braces aligns the two lines).

data_results_latest_school = data_results_latest.get("school")
print(f"{data_results_latest_school.keys() = }")
print(f"{data_results_school.keys()        = }")

# Set differences of the two key views, as recorded when this was explored:
#   data_results_latest_school.keys() - data_results_school.keys()
#       -> {'peps_ownership', 'state_fips'}   (only present under `latest`)
#   data_results_school.keys() - data_results_latest_school.keys()
#       -> set()                              (nothing is unique to `school`)

data_results_latest_school.keys() = dict_keys(['zip', 'city', 'name', 'alias', 'state', 'locale', 'address', 'dolflag', 'branches', 'men_only', 'operating', 'ownership', 'region_id', 'accreditor', 'school_url', 'state_fips', 'women_only', 'main_campus', 'online_only', 'endowment', 'carnegie_basic', 'faculty_salary', 'ownership_peps', 'peps_ownership', 'accreditor_code', 'ft_faculty_rate', 'sector', 'carnegie_undergrad', 'degree_urbanization', 'under_investigation', 'price_calculator_url', 'carnegie_size_setting', 'minority_serving', 'religious_affiliation', 'open_admissions_policy', 'title_iv', 'degrees_awarded', 'tuition_revenue_per_fte', 'instructional_expenditure_per_fte', 'institutional_characteristics'])
data_results_school.keys()        = dict_keys(['zip', 'city', 'name', 'alias', 'state', 'locale', 'address', 'dolflag', 'branches', 'men_only', 'operating', 'ownership', 'region_id', 'accreditor', 'school_url', 'women_only', 'main_campus', 'online_only', 'endowment', 'carnegie_bas

In [17]:
# Keep a handle on the "student" sub-dict of `latest`; the keys seen in this
# response are recorded in the comment that follows.
data_results_latest_student = data_results_latest.get("student")
# data_results_latest_student.keys()
# dict_keys(
#     [
#         'size',
#         'grad_students',
#         'size_category',
#         'enrollment',
#         'share_25_older',
#         'part_time_share',
#         'demographics',
#         'FAFSA_applications',
#         'fafsa_sent',
#         'part_time_share_2000',
#         'family_income',
#         'share_firstgeneration',
#         'parents_education_level',
#         'share_lowincome',
#         'valid_dependency_status',
#         'students_with_pell_grant',
#         'share_first',
#         'share_independent_students',
#         'share_highincome',
#         'undergrads_non_degree_seeking',
#         'share_middleincome',
#         'avg_dependent_income',
#         'retention_rate',
#         'avg_independent_income',
#         'share_dependent_lowincome',
#         'share_independent_lowincome',
#         'share_dependent_highincome',
#         'share_independent_highincome',
#         'share_dependent_middleincome',
#         'share_firstgeneration_parents',
#         'share_independent_middleincome',
#         'undergrads_with_pell_grant_or_federal_student_loan',
#         'retention_rate_suppressed',
#         'dcs_undergrads_with_pell_grant_or_federal_student_loan',
#         'ftft_undergrads_with_pell_grant_or_federal_student_loan',
#         'dcs_undergrads_with_pell_grant_or_federal_student_loan_pooled',
#         'ftft_undergrads_with_pell_grant_or_federal_student_loan_pooled'
#     ]
# )

In [8]:
# Keep a handle on the "cost" sub-dict of `latest`; the keys seen in this
# response are recorded in the comment that follows.
data_results_latest_cost = data_results_latest.get("cost")
# data_results_latest_cost.keys()
# dict_keys(
#     [
#         'booksupply',
#         'tuition',
#         'roomboard',
#         'title_iv',
#         'avg_net_price',
#         'otherexpense',
#         'attendance',
#         'net_price',
#         'program_reporter'
#     ]
# )

In [9]:
# Keep a handle on the "aid" sub-dict of `latest`; the keys seen in this
# response are recorded in the comment that follows.
data_results_latest_aid = data_results_latest.get("aid")
# data_results_latest_aid.keys()
# dict_keys(
#     [
#         'loan_principal',
#         'pell_grant_rate',
#         'federal_loan_rate',
#         'dcs_pell_grant_rate',
#         'plus_loan_pct_lower',
#         'plus_loan_pct_upper',
#         'plus_loan_pct_years',
#         'ftft_pell_grant_rate',
#         'dcs_federal_loan_rate',
#         'cumulative_debt',
#         'ftft_federal_loan_rate',
#         'median_debt',
#         'students_with_any_loan',
#         'portfolio',
#         'dcs_pell_grant_rate_pooled',
#         'plus_loan_pct_lower_pooled',
#         'plus_loan_pct_upper_pooled',
#         'ftft_pell_grant_rate_pooled',
#         'dcs_federal_loan_rate_pooled',
#         'plus_debt',
#         'ftft_federal_loan_rate_pooled',
#         'median_debt_suppressed',
#         'dcs_undergrads_pell_federal_loan_years',
#         'ftft_undergrads_pell_federal_loan_years',
#         'plus_debt_suppressed'
#     ]
# )

In [10]:
# Keep a handle on the "earnings" sub-dict of `latest`; the keys seen in this
# response are recorded in the comment that follows.
data_results_latest_earnings = data_results_latest.get("earnings")
# data_results_latest_earnings.keys()
# dict_keys(
#     [
#         'student_count',
#         '6_yrs_after_entry',
#         '7_yrs_after_entry',
#         '10_yrs_after_entry',
#         '11_yrs_after_entry',
#         '1_yr_after_completion',
#         '4_yrs_after_completion',
#         '5_yrs_after_completion',
#         '8_yrs_after_entry',
#         '9_yrs_after_entry',
#         '3_yrs_after_completion'
#     ]
# )

In [11]:
# Keep a handle on the "completion" sub-dict of `latest`; the (long) list of
# keys seen in this response is recorded in the comment that follows.
data_results_latest_completion = data_results_latest.get("completion")
# data_results_latest_completion.keys()
# dict_keys(
#     [
#         'consumer_rate',
#         'pooled_yrs_used',
#         'pooled_yrs_used_100',
#         'pooled_yrs_used_200',
#         'title_iv',
#         'separation_date',
#         '2_yr_completion',
#         '3_yr_completion',
#         '4_yr_completion',
#         '6_yr_completion',
#         '8_yr_completion',
#         'rate_suppressed',
#         'pooled_yrs_used_pell_150',
#         'completion_rate_4yr_100nt',
#         'completion_rate_4yr_150nt',
#         'completion_rate_4yr_200nt',
#         'outcome_years',
#         'completion_cohort_4yr_100nt',
#         'completion_cohort_4yr_150nt',
#         'completion_cohort_4yr_200nt',
#         'transfer_rate',
#         'completion_rate_4yr_150_aian',
#         'completion_rate_4yr_150_nhpi',
#         'completion_rate_4yr_150_asian',
#         'completion_rate_4yr_150_black',
#         'completion_rate_4yr_150_white',
#         'completion_rate_l4yr_150_aian',
#         'completion_rate_l4yr_150_nhpi',
#         'outcome_cohort_nopell',
#         'completion_cohort_4yr_150_aian',
#         'completion_cohort_4yr_150_nhpi',
#         'completion_rate_l4yr_150_asian',
#         'completion_rate_l4yr_150_black',
#         'completion_rate_l4yr_150_white',
#         'completion_cohort_4yr_150_asian',
#         'completion_cohort_4yr_150_black',
#         'completion_cohort_4yr_150_white',
#         'completion_rate_4yr_150_2ormore',
#         'outcome_years_pell',
#         'completion_rate_4yr_150_hispanic',
#         'completion_rate_4yr_150nt_pooled',
#         'completion_rate_4yr_200nt_pooled',
#         'completion_rate_l4yr_150_2ormore',
#         'completion_cohort_4yr_150_2ormore',
#         'completion_rate_l4yr_150_hispanic',
#         'completion_cohort_4yr_150_hispanic',
#         'completion_cohort_4yr_150nt_pooled',
#         'completion_cohort_4yr_200nt_pooled',
#         'completion_rate_four_year_150_pell',
#         'completion_rate_4yr_150_api_pre2010',
#         'completion_rate_less_than_4yr_100nt',
#         'completion_rate_less_than_4yr_150nt',
#         'completion_rate_less_than_4yr_200nt',
#         'completion_cohort_four_year_150_pell',
#         'completion_rate_4yr_150_aian_pre2010',
#         'completion_rate_4yr_150_race',
#         'completion_rate_four_year_100_pooled',
#         'completion_rate_l4yr_150_api_pre2010',
#         'completion_cohort_4yr_150_api_pre2010',
#         'completion_cohort_less_than_4yr_100nt',
#         'completion_cohort_less_than_4yr_150nt',
#         'completion_cohort_less_than_4yr_200nt',
#         'completion_rate_4yr_150_black_pre2010',
#         'completion_rate_4yr_150_white_pre2010',
#         'completion_rate_l4yr_150_aian_pre2010',
#         'completion_rate_l4yr_150_race',
#         'completion_rate_lt_four_year_150_pell',
#         'completion_cohort_4yr_150_aian_pre2010',
#         'completion_cohort_4yr_150_race',
#         'completion_cohort_four_year_100_pooled',
#         'completion_rate_l4yr_150_black_pre2010',
#         'completion_rate_l4yr_150_white_pre2010',
#         'outcome_percentage',
#         'completion_cohort_4yr_150_black_pre2010',
#         'completion_cohort_4yr_150_white_pre2010',
#         'completion_cohort_lt_four_year_150_pell',
#         'completion_rate_lt_four_year_100_pooled',
#         'outcome_cohort',
#         'completion_cohort_less_than_4yr_150_aian',
#         'completion_cohort_less_than_4yr_150_nhpi',
#         'completion_rate_4yr_150_hispanic_pre2010',
#         'completion_cohort_less_than_4yr_150_asian',
#         'completion_cohort_less_than_4yr_150_black',
#         'completion_cohort_less_than_4yr_150_white',
#         'completion_cohort_lt_four_year_100_pooled',
#         'completion_rate_4yr_150_nonresident',
#         'completion_rate_four_year_150_loan_nopell',
#         'completion_rate_four_year_150_pell_pooled',
#         'completion_rate_l4yr_150_hispanic_pre2010',
#         'rate_suppressed_pell',
#         'completion_cohort_4yr_150_hispanic_pre2010',
#         'completion_rate_l4yr_150_nonresident',
#         'completion_rate_less_than_4yr_150nt_pooled',
#         'completion_rate_less_than_4yr_200nt_pooled',
#         'completion_cohort_4yr_150_nonresident',
#         'completion_cohort_four_year_150_loan_nopell',
#         'completion_cohort_four_year_150_pell_pooled',
#         'completion_cohort_less_than_4yr_150_2ormore',
#         'completion_rate_four_year_150_noloan_nopell',
#         'outcome_percentage_pell',
#         'completion_cohort_less_than_4yr_150_hispanic',
#         'completion_cohort_less_than_4yr_150nt_pooled',
#         'completion_cohort_less_than_4yr_200nt_pooled',
#         'completion_rate_lt_four_year_150_loan_nopell',
#         'completion_rate_lt_four_year_150_pell_pooled',
#         'outcome_cohort_pell',
#         'completion_cohort_four_year_150_noloan_nopell',
#         'transfer_rate_suppressed',
#         'completion_cohort_lt_four_year_150_loan_nopell',
#         'completion_cohort_lt_four_year_150_pell_pooled',
#         'completion_rate_lt_four_year_150_noloan_nopell',
#         'outcome_percentage_nopell',
#         'completion_cohort_less_than_4yr_150_api_pre2010',
#         'completion_cohort_less_than_4yr_150_aian_pre2010',
#         'completion_cohort_less_than_4yr_150_race',
#         'completion_cohort_lt_four_year_150_noloan_nopell',
#         'completion_cohort_less_than_4yr_150_black_pre2010',
#         'completion_cohort_less_than_4yr_150_white_pre2010',
#         'completion_cohort_less_than_4yr_150_hispanic_pre2010',
#         'completion_cohort_less_than_4yr_150_nonresident',
#         'outcome_percentage_suppressed',
#         'outcome_percentage_pell_suppressed'
#     ]
# )

In [12]:
# Keep a handle on the "repayment" sub-dict of `latest`; the keys seen in this
# response are recorded in the comment that follows.
data_results_latest_repayment = data_results_latest.get("repayment")
# data_results_latest_repayment.keys()
# dict_keys(
#     [
#         '2_yr_default_rate',
#         '3_yr_default_rate',
#         'repayment_date',
#         '1_yr_repayment',
#         '3_yr_repayment',
#         '5_yr_repayment',
#         '7_yr_repayment',
#         '2_yr_default_rate_denom',
#         '3_yr_default_rate_denom',
#         '1_yr_db_pp_repayment',
#         '4_yr_db_pp_repayment',
#         '5_yr_db_pp_repayment',
#         '10_yr_db_pp_repayment',
#         '1_yr_bb_pp_repayment',
#         '1_yr_db_fed_repayment',
#         '20_yr_db_pp_repayment',
#         '2_yr_bb_pp_repayment',
#         '3_yr_bb_pp_repayment',
#         '4_yr_bb_pp_repayment',
#         '4_yr_db_fed_repayment',
#         '5_yr_db_fed_repayment',
#         '10_yr_db_fed_repayment',
#         '1_yr_bb_fed_repayment',
#         '20_yr_db_fed_repayment',
#         '2_yr_bb_fed_repayment',
#         '3_yr_bb_fed_repayment',
#         '4_yr_bb_fed_repayment',
#         '3_yr_repayment_suppressed',
#         '2_yr_bb_fed_repayment_suppressed',
#         'repayment_cohort'
#     ]
# )

In [13]:
# Keep a handle on the "admissions" sub-dict of `latest`. The keys seen in this
# response are listed below.
data_results_latest_admissions = data_results_latest.get("admissions")
# data_results_latest_admissions.keys()
# dict_keys(
#     [
#         'test_requirements',
#         'admission_rate', # Has key `'by_ope_id'`. TODO: can we see the rate by transfer institution?
#         'act_scores',
#         'sat_scores',
#         'admission_rate_suppressed'
#     ]
# )

In [19]:
# Keep a handle on the "academics" sub-dict of `latest`. The keys seen in this
# response are listed below.
data_results_latest_academics = data_results_latest.get("academics")
# data_results_latest_academics.keys()
# dict_keys(
#     [
#         'program',
#         'program_available',
#         'program_percentage', # Shows the distribution of programs (candidate input for a %STEM measure)
#         'program_reporter'
#     ]
# )

In [37]:
data_results_latest_programs = data_results_latest.get("programs")
# len(data_results_latest_programs.get("cip_4_digit")) # 102

# Fall back to an empty list when "programs" or "cip_4_digit" is absent/None,
# so a school without program data yields an empty frame instead of an error.
programs = (data_results_latest_programs or {}).get("cip_4_digit") or []

# One row per program entry. A missing "credential" gives a None credential
# title rather than an AttributeError from calling .get() on None.
df_programs = pd.DataFrame(
    [
        {
            "cip_code": program.get("code"),
            "title": program.get("title"),
            "ope6_id": program.get("ope6_id"),
            "credential": (program.get("credential") or {}).get("title"),
        }
        for program in programs
    ],
    # Explicit columns keep the schema (and column order) stable even when
    # `programs` is empty.
    columns=["cip_code", "title", "ope6_id", "credential"],
)

df_programs

Unnamed: 0,cip_code,title,ope6_id,credential
0,0106,Applied Horticulture and Horticultural Busines...,001292,Undergraduate Certificate or Diploma
1,0106,Applied Horticulture and Horticultural Busines...,001292,Associate's Degree
2,0502,"Ethnic, Cultural Minority, Gender, and Group S...",001292,Associate's Degree
3,0901,Communication and Media Studies.,001292,Associate's Degree
4,0904,Journalism.,001292,Associate's Degree
...,...,...,...,...
97,5217,Insurance.,001292,Undergraduate Certificate or Diploma
98,5217,Insurance.,001292,Associate's Degree
99,5218,"General Sales, Merchandising and Related Marke...",001292,Undergraduate Certificate or Diploma
100,5218,"General Sales, Merchandising and Related Marke...",001292,Associate's Degree


# Attempting to Gather Relevant Data

In [39]:
import os
import requests

url = "https://api.data.gov/ed/collegescorecard/v1/schools"

fields = [
    "school.name",
    "2018.admissions",
]

## From {year}.aid
# '{year}.aid.ftft_pell_grant_rate',
# '{year}.aid.ftft_federal_loan_rate',
# '{year}.aid.pell_grant_rate',
# '{year}.aid.federal_loan_rate',
# '{year}.aid.loan_principal',
# '{year}.aid.median_debt.income',
# '{year}.aid.median_debt.pell_grant',
# '{year}.aid.median_debt.no_pell_grant',
# '{year}.aid.median_debt.first_generation_students',
# '{year}.aid.median_debt.non_first_generation_students',

## From {year}.student.demographics
# "{year}.student.demographics.race_ethnicity.white",
# "{year}.student.demographics.race_ethnicity.black",
# "{year}.student.demographics.race_ethnicity.hispanic",
# "{year}.student.demographics.race_ethnicity.asian",
# "{year}.student.demographics.race_ethnicity.aian",
# "{year}.student.demographics.race_ethnicity.nhpi",
# "{year}.student.demographics.race_ethnicity.two_or_more",
# "{year}.student.demographics.race_ethnicity.non_resident_alien",
# "{year}.student.demographics.race_ethnicity.unknown",
# "{year}.student.demographics.men",
# "{year}.student.demographics.women",
# "{year}.student.demographics.student_faculty_ratio",
# "{year}.student.demographics.faculty.race_ethnicity.two_or_more",
# "{year}.student.demographics.faculty.race_ethnicity.aian",
# "{year}.student.demographics.faculty.race_ethnicity.asian",
# "{year}.student.demographics.faculty.race_ethnicity.black",
# "{year}.student.demographics.faculty.race_ethnicity.hispanic",
# "{year}.student.demographics.faculty.race_ethnicity.nhpi",
# "{year}.student.demographics.faculty.race_ethnicity.non_resident_alien",
# "{year}.student.demographics.faculty.race_ethnicity.unknown",
# "{year}.student.demographics.faculty.race_ethnicity.white",
# "{year}.student.demographics.faculty.men",
# "{year}.student.demographics.faculty.women",

## From {year}.student.enrollment
# "{year}.student.enrollment.undergrad_12_month",

## From {year}.cost
# '{year}.cost.avg_net_price.public',
# '{year}.cost.net_price.public.by_income_level.0-30000',
# '{year}.cost.net_price.public.by_income_level.30001-48000',
# '{year}.cost.net_price.public.by_income_level.48001-75000',
# '{year}.cost.net_price.public.by_income_level.75001-110000',
# '{year}.cost.net_price.public.by_income_level.110001-plus',
# '{year}.cost.title_iv.public.by_income_level.0-30000',
# '{year}.cost.title_iv.public.by_income_level.30001-48000',
# '{year}.cost.title_iv.public.by_income_level.48001-75000',
# '{year}.cost.title_iv.public.by_income_level.75001-110000',
# '{year}.cost.title_iv.public.by_income_level.110001-plus',
# '{year}.cost.attendance.academic_year',
# '{year}.cost.tuition.in_state',
# '{year}.cost.tuition.out_of_state',
# '{year}.cost.booksupply',
# '{year}.cost.roomboard.oncampus',
# '{year}.cost.otherexpense.oncampus',
# '{year}.cost.roomboard.offcampus',
# '{year}.cost.otherexpense.offcampus',
# '{year}.cost.otherexpense.withfamily',

## From {year}.academics
# '{year}.academics.program_percentage', # Shows distribution of programs (use for %STEM)

## From {year}.admissions
# '{year}.admissions.admission_rate.overall'

# Query a single school by name, restricted to the `fields` selected above.
params = {
    "api_key": os.getenv("SCORECARDAPI"),
    "school.name": "University of California Davis",
    "fields": ",".join(fields)
}

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP failure here rather than as a KeyError on
# "metadata" below.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()  # parse the body once and reuse it
print(data["metadata"])

{'page': 0, 'total': 1, 'per_page': 20}


In [40]:
data_results = data.get("results")[0]
data_results

{'2018.admissions.admission_rate.overall': 0.4107,
 '2018.admissions.admission_rate.by_ope_id': 0.4107,
 '2018.admissions.sat_scores.25th_percentile.critical_reading': 580,
 '2018.admissions.sat_scores.75th_percentile.critical_reading': 670,
 '2018.admissions.sat_scores.25th_percentile.math': 580,
 '2018.admissions.sat_scores.75th_percentile.math': 730,
 '2018.admissions.sat_scores.25th_percentile.writing': None,
 '2018.admissions.sat_scores.75th_percentile.writing': None,
 '2018.admissions.sat_scores.midpoint.critical_reading': 625,
 '2018.admissions.sat_scores.midpoint.math': 655,
 '2018.admissions.sat_scores.midpoint.writing': None,
 '2018.admissions.act_scores.25th_percentile.cumulative': 24,
 '2018.admissions.act_scores.75th_percentile.cumulative': 32,
 '2018.admissions.act_scores.25th_percentile.english': 23,
 '2018.admissions.act_scores.75th_percentile.english': 32,
 '2018.admissions.act_scores.25th_percentile.math': 23,
 '2018.admissions.act_scores.75th_percentile.math': 30,
 '

In [None]:
url = "https://api.data.gov/ed/collegescorecard/v1/schools"

# Only the name and the identifier columns are needed here.
fields = [
    "school.name",
    "ope6_id",
    "ope8_id",
    "fed_sch_cd"
]

# OPE6 IDs of the 10 UCs, sent to the API as one comma-separated filter value.
uc_ope6_ids = [
    "001313",
    "001312",
    "001314",
    "001321",
    "001320",
    "041271",
    "001315",
    "001316",
    "001317",
    "001319",
]

params = {
    "api_key": os.getenv("SCORECARDAPI"),
    "ope6_id": ",".join(uc_ope6_ids),
    "fields": ",".join(fields),
    "per_page": 100,
}

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP failure here rather than as a KeyError on
# "metadata" below.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()  # parse the body once and reuse it
print(data["metadata"])

{'page': 0, 'total': 10, 'per_page': 100}


In [76]:
# For community colleges, passing only the name to the API is not enough,
# because the API matches names fuzzily. Passing the OPE ID would be best, but
# there is no reliable public source that lists those IDs per California
# Community College. The CCCCO API gives college names and locations, so we use
# those instead, with `distance=1` (intended as 1 mile -- TODO confirm the unit
# the Scorecard API assumes).

# Get the CCC list:
cc_colleges_df = get_ccc_colleges()
cc_colleges_df

Unnamed: 0,CollegeID,CollegeName,DistrictID,StreetAddress,City,County,Zip,ZipPlus4,MailingAddress,MailingCity,MailingZip,Phone,WebsiteURL,Latitude,Longitude,LogoURL,District
0,021,Cuyamaca College,020,900 Rancho San Diego Parkway,El Cajon,San Diego,92019,4304,900 Rancho San Diego Parkway,El Cajon,92019,619.660.4000,www.cuyamaca.edu,32.744890,-116.935229,CuyamacaCollegeLogo.jpg,
1,022,Grossmont College,020,8800 Grossmont College Drive,El Cajon,San Diego,92020,1799,8800 Grossmont College Drive,El Cajon,92020,619.644.7000,www.grossmont.edu,32.817897,-117.005640,GrossmontCollegelogo.jpg,
2,031,Imperial Valley College,030,380 East Aten Road,Imperial,Imperial,92251,9787,380 East Aten Road,Imperial,92251,760.352.8320,www.imperial.edu,32.825859,-115.502999,ImperialValleyCollegeLogocopy.jpg,
3,051,MiraCosta College,050,1 Barnard Drive,Oceanside,San Diego,92056,3899,1 Barnard Drive,Oceanside,92056,760.757.2121,www.miracosta.edu,33.188864,-117.301064,Mira_Costa_College_Logo_4c.png,
4,061,Palomar College,060,1140 West Mission Road,San Marcos,San Diego,92069,1487,1140 West Mission Road,San Marcos,92069,760.744.1150,www.palomar.edu,33.147015,-117.183980,PalomarCollegeLogo.jpg,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,963,Norco College,960,2001 Third Street,Norco,Riverside,92860,2600,2001 Third Street,Norco,92860,951.372.7000,www.norcocollege.edu,33.915421,-117.568755,NorcoCollegeLogo.jpg,
111,971,Copper Mountain College,970,6162 Rotary Way,Joshua Tree,San Bernardino,92252,6100,6162 Rotary Way,Joshua Tree,92252,760.366.3791,www.cmccd.edu,34.141518,-116.212214,CopperMtCollegelogo.jpg,
112,981,Crafton Hills College,980,11711 Sand Canyon Road,Yucaipa,San Bernardino,92399,1799,11711 Sand Canyon Road,Yucaipa,92399,909.794.2161,www.craftonhills.edu,34.041104,-117.107133,CraftonHillCollegeLogo.jpg,
113,982,San Bernardino Valley College,980,701 S. Mt. Vernon Avenue,San Bernardino,San Bernardino,92410,2798,701 S. Mt. Vernon Avenue,San Bernardino,92410,909.384.4400,www.valleycollege.edu,34.088099,-117.313704,SanBernardinologo.jpg,


In [114]:
def fetch_college_data(
        college_name: str,
        latitude: str | None = None,
        longitude: str | None = None,
        distance: int | None = None,
        fields = fields
    ):

    params = {
        "api_key": os.getenv("SCORECARDAPI"),
        "school.name": college_name,
        "school.state": "CA",
        "distance": distance,
        "fields": ",".join(fields),
        "per_page": 10,
    }

    if latitude:
        params["location.lat"] = latitude
    if longitude:
        params["location.lon"] = longitude
    if distance:
        params["distance"] = distance

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        results_list = data.get("results", [])
        results_df = pd.json_normalize(results_list, sep='.')
        return results_df
    except Exception as e:
        print(f"* Error processing {college_name}: {e}")
        return pd.DataFrame()

In [115]:
url = "https://api.data.gov/ed/collegescorecard/v1/schools"

fields = [
    "school.name",
    "ope6_id",
    "ope8_id",
    "fed_sch_cd"
]

# Collect the per-college frames and concatenate once after the loop.
# Re-concatenating inside the loop copies the accumulated frame on every
# iteration and mixes empty frames into the result.
cc_scorecard_frames = []

for _, row in cc_colleges_df.iterrows():
    college_name = row["CollegeName"]
    time.sleep(0.5)  # pause between API calls
    print(f"Processing {college_name}...")
    college_data = fetch_college_data(
        college_name=college_name,
        latitude=row["Latitude"],
        longitude=row["Longitude"],
        distance=1,
        # Pass the list explicitly so this cell's `fields` is what gets
        # requested, whatever default the function captured when defined.
        fields=fields,
    )

    # There are two colleges (Cerritos College and Palo Verde College)
    # that return no results when lat/lon is provided. They both return unique
    # results when queried without lat/lon and distance.
    if college_data.empty:
        time.sleep(0.5)
        print("* College not found. Broadening search...")
        college_data = fetch_college_data(
            college_name=college_name,
            fields=fields,
        )

    if not college_data.empty:
        cc_scorecard_frames.append(college_data)

# pd.concat raises on an empty list, so fall back to an empty frame.
cc_scorecard = (
    pd.concat(cc_scorecard_frames, ignore_index=True)
    if cc_scorecard_frames
    else pd.DataFrame()
)

Processing Cuyamaca College...
Processing Grossmont College...
Processing Imperial Valley College...
Processing MiraCosta College...
Processing Palomar College...
Processing San Diego City College...
Processing San Diego Mesa College...
Processing San Diego Miramar College...
Processing Southwestern College...
Processing Butte College...
Processing Feather River College...
Processing Lassen Community College...
Processing Mendocino College...
Processing College of the Redwoods...
Processing Shasta College...
Processing College of the Siskiyous...
Processing Lake Tahoe Community College...
Processing American River College...
Processing Cosumnes River College...
Processing Sacramento City College...
Processing Folsom Lake College...
Processing Napa Valley College...
Processing Santa Rosa Junior College...
Processing Sierra College...
Processing Solano Community College...
Processing Yuba College...
Processing Woodland Community College...
Processing Contra Costa College...
Processing Di

In [117]:
cc_scorecard.tail(30)

Unnamed: 0,school.name,ope6_id,ope8_id,fed_sch_cd
87,Pasadena City College,1261,126100,1261
88,Santa Monica College,1286,128600,1286
89,Cerritos College,1161,116100,1161
90,Citrus College,1166,116600,1166
91,Coastline Community College,20635,2063500,13536
92,Coastline Beauty College,40923,4092300,40923
93,Golden West College,1206,120600,1206
94,Orange Coast College,1250,125000,1250
95,Long Beach City College,1219,121900,1219
96,Mt San Antonio College,1245,124500,1245


# Explore the Code

In [38]:
alldata = get_scorecards_by_state()

{'page': 1, 'total': 672, 'per_page': 100}
{'page': 2, 'total': 672, 'per_page': 100}
{'page': 3, 'total': 672, 'per_page': 100}
{'page': 4, 'total': 672, 'per_page': 100}
{'page': 5, 'total': 672, 'per_page': 100}
{'page': 6, 'total': 672, 'per_page': 100}
{'page': 7, 'total': 672, 'per_page': 100}


In [101]:
college1 = alldata[176]
college1.get("school").get("name")

'Solano Community College'

In [102]:
college1.keys()

dict_keys(['latest', 'school', 'location', 'id', 'ope6_id', 'ope8_id', 'fed_sch_cd'])

In [103]:
latest: dict = college1.get("latest")
latest.keys()

dict_keys(['school', 'student', 'cost', 'aid', 'earnings', 'completion', 'repayment', 'admissions', 'academics', 'programs'])

In [104]:
latest.get("school")

{'zip': '94534-3197',
 'city': 'Fairfield',
 'name': 'Solano Community College',
 'alias': 'Solano College',
 'state': 'CA',
 'locale': 22,
 'address': '4000 Suisun Valley Rd',
 'dolflag': 1,
 'branches': 1,
 'men_only': 0,
 'operating': 1,
 'ownership': 1,
 'region_id': 8,
 'accreditor': 'Western Association of Schools and Colleges Accrediting Commission for Community and Junior Colleges',
 'school_url': 'welcome.solano.edu/',
 'state_fips': 6,
 'women_only': 0,
 'main_campus': 1,
 'online_only': 0,
 'endowment': {'end': None, 'begin': None},
 'carnegie_basic': 14,
 'faculty_salary': 11108,
 'ownership_peps': 1,
 'peps_ownership': 'Public',
 'accreditor_code': 'WASCJC',
 'ft_faculty_rate': 0.4,
 'sector': {'scorecard': 7},
 'carnegie_undergrad': 5,
 'degree_urbanization': None,
 'under_investigation': 0,
 'price_calculator_url': 'https://misweb.cccco.edu/npc/281/npcalc.htm',
 'carnegie_size_setting': 12,
 'minority_serving': {'annh': 0,
  'nant': 0,
  'aanipi': 1,
  'tribal': 0,
  'hi

In [114]:
latest.get("completion")

{'consumer_rate': 0.25,
 'pooled_yrs_used': 2,
 'pooled_yrs_used_100': None,
 'pooled_yrs_used_200': None,
 'title_iv': {'died_by': {'2yrs': None,
   '3yrs': None,
   '4yrs': None,
   '6yrs': None,
   '8yrs': None},
  'unknown_by': {'2yrs': None,
   '3yrs': 0.01220657277,
   '4yrs': 0.008986928105,
   '6yrs': 0.007589025102,
   '8yrs': 0.219217491369},
  'completed_by': {'2yrs': 0.046560846561,
   '3yrs': 0.063849765258,
   '4yrs': 0.047385620915,
   '6yrs': 0.065966141273,
   '8yrs': 0.052934407365},
  'male': {'died_by': {'2yrs': None,
    '3yrs': None,
    '4yrs': None,
    '6yrs': None,
    '8yrs': None},
   'unknown_by': {'2yrs': None,
    '3yrs': None,
    '4yrs': None,
    '6yrs': None,
    '8yrs': 0.29003021148},
   'completed_by': {'2yrs': 0.045180722892,
    '3yrs': 0.058823529412,
    '4yrs': 0.029213483146,
    '6yrs': 0.045662100457,
    '8yrs': 0.030211480363},
   'withdrawn_by': {'2yrs': 0.539156626506,
    '3yrs': 0.510695187166,
    '4yrs': 0.519101123596,
    '6yrs': 

In [106]:
# OPEID
college1.get("ope8_id")

'00129200'

# Merging

In [3]:
cc = get_ccc_colleges()

In [17]:
cc.head()

Unnamed: 0,CollegeID,CollegeName,DistrictID,StreetAddress,City,County,Zip,ZipPlus4,MailingAddress,MailingCity,MailingZip,Phone,WebsiteURL,Latitude,Longitude,LogoURL,District
0,21,Cuyamaca College,20,900 Rancho San Diego Parkway,El Cajon,San Diego,92019,4304,900 Rancho San Diego Parkway,El Cajon,92019,619.660.4000,www.cuyamaca.edu,32.74489,-116.935229,CuyamacaCollegeLogo.jpg,
1,22,Grossmont College,20,8800 Grossmont College Drive,El Cajon,San Diego,92020,1799,8800 Grossmont College Drive,El Cajon,92020,619.644.7000,www.grossmont.edu,32.817897,-117.00564,GrossmontCollegelogo.jpg,
2,31,Imperial Valley College,30,380 East Aten Road,Imperial,Imperial,92251,9787,380 East Aten Road,Imperial,92251,760.352.8320,www.imperial.edu,32.825859,-115.502999,ImperialValleyCollegeLogocopy.jpg,
3,51,MiraCosta College,50,1 Barnard Drive,Oceanside,San Diego,92056,3899,1 Barnard Drive,Oceanside,92056,760.757.2121,www.miracosta.edu,33.188864,-117.301064,Mira_Costa_College_Logo_4c.png,
4,61,Palomar College,60,1140 West Mission Road,San Marcos,San Diego,92069,1487,1140 West Mission Road,San Marcos,92069,760.744.1150,www.palomar.edu,33.147015,-117.18398,PalomarCollegeLogo.jpg,


In [5]:
# (rows, columns) of the colleges table.
cc.shape

(115, 17)

In [28]:
# Build `cc_scorecard`: the College Scorecard rows found for every college in `cc`.
#
# Each college is looked up by name + city + state first. If that lookup raises,
# it is retried with name + state only. Non-empty results are tagged with the
# originating college name and collected in a list, then concatenated once
# after the loop (instead of re-copying a growing DataFrame on every iteration).
#
# NOTE(review): the name-only retry drops the city filter, so it may return
# same-named schools from other cities; check duplicates per
# source_college_name downstream.
REQUEST_DELAY_SECONDS = 0.25  # pause before each lookup (presumably API rate limiting)

scorecard_frames = []
failed_colleges = []  # source colleges for which both lookups raised

for college_name, college_city in zip(cc["CollegeName"], cc["City"]):
    time.sleep(REQUEST_DELAY_SECONDS)
    print(f"Trying {college_name} in {college_city}")
    try:
        scorecard_data = get_scorecard_by_college(
            college_name=college_name,
            college_city=college_city,
            college_state="CA",
        )
    except Exception as e:
        print(f"  Error with college {college_name} in {college_city}: {e}")
        print("  Retrying with just college name and state.")
        try:
            scorecard_data = get_scorecard_by_college(
                college_name=college_name,
                college_state="CA",
            )
        except Exception as retry_error:
            # Record the failure and keep going so a single bad lookup does not
            # abort the whole (slow, network-bound) collection run.
            print(f"  Retry failed for {college_name}: {retry_error}")
            failed_colleges.append(college_name)
            continue
    if not scorecard_data.empty:
        # Track which CCCCO API college this data came from. `assign` returns a
        # new frame, so the helper's returned object is not modified in place.
        scorecard_frames.append(
            scorecard_data.assign(source_college_name=college_name)
        )

# pd.concat raises on an empty list, so fall back to an empty frame.
cc_scorecard = (
    pd.concat(scorecard_frames, ignore_index=True)
    if scorecard_frames
    else pd.DataFrame()
)

if failed_colleges:
    print(f"No scorecard data retrieved for: {failed_colleges}")
print(f"Shape: {cc_scorecard.shape}")
cc_scorecard.head()

Trying Cuyamaca College in El Cajon
Trying Grossmont College in El Cajon
Trying Imperial Valley College in Imperial
Trying MiraCosta College in Oceanside
Trying Palomar College in San Marcos
Trying San Diego City College in San Diego
Trying San Diego Mesa College in San Diego
Trying San Diego Miramar College in San Diego
Trying Southwestern College in Chula Vista
Trying Butte College in Oroville
Trying Feather River College in Quincy
Trying Lassen Community College in Susanville
Trying Mendocino College in Ukiah
Trying College of the Redwoods in Eureka
Trying Shasta College in Redding
Trying College of the Siskiyous in Weed
Trying Lake Tahoe Community College in So. Lake Tahoe
  Error with college Lake Tahoe Community College in So. Lake Tahoe: "None of [Index(['school_name'], dtype='object')] are in the [columns]"
  Retrying with just college name and state.
Trying American River College in Sacramento
Trying Cosumnes River College in Sacramento
Trying Sacramento City College in Sacram

Unnamed: 0,school_name,latest_student_size,latest_student_enrollment_undergrad_12_month,latest_student_demographics_over_23_at_entry,latest_student_demographics_first_generation,latest_student_demographics_median_hh_income,latest_student_demographics_student_faculty_ratio,latest_student_FAFSA_applications,source_college_name
0,Cuyamaca College,6788.0,12859.0,0.47,0.610113,61734.0,25.0,2901.0,Cuyamaca College
1,Grossmont College,11467.0,18592.0,0.39,0.550389,60424.0,23.0,5463.0,Grossmont College
2,Imperial Valley College,6881.0,10716.0,0.26,0.609207,45460.0,33.0,3798.0,Imperial Valley College
3,MiraCosta College,10474.0,16640.0,0.43,0.509714,68755.0,22.0,4602.0,MiraCosta College
4,Palomar College,16167.0,25877.0,0.44,0.538365,68227.0,24.0,5316.0,Palomar College


In [27]:
# (rows, columns) of the combined scorecard frame. A row count above the number
# of colleges in `cc` means at least one college matched several scorecard entries.
cc_scorecard.shape

(137, 9)

In [30]:
# Number of scorecard rows per originating college name; any count above 1
# marks a college whose lookup returned more than one scorecard entry.
cc_scorecard["source_college_name"].value_counts()

source_college_name
Bakersfield College        4
Santa Ana College          3
Fullerton College          2
Shasta College             2
Ventura College            2
                          ..
Berkeley City College      1
Merritt College            1
Laney College              1
College of Alameda         1
College of the Sequoias    1
Name: count, Length: 115, dtype: int64

In [None]:
# Select every row whose source_college_name appears more than once, i.e. the
# CCCCO colleges that ended up with several scorecard rows. Each row's name is
# mapped to its occurrence count and rows with a count above 1 are kept, in
# their original order.
dups = cc_scorecard.loc[
    lambda frame: frame["source_college_name"]
    .map(frame["source_college_name"].value_counts())
    .gt(1)
]
dups

Unnamed: 0,school_name,latest_student_size,latest_student_enrollment_undergrad_12_month,latest_student_demographics_over_23_at_entry,latest_student_demographics_first_generation,latest_student_demographics_median_hh_income,latest_student_demographics_student_faculty_ratio,latest_student_FAFSA_applications,source_college_name
14,Shasta Bible College and Graduate School,12.0,35.0,0.26,,54363.0,3.0,22.0,Shasta College
15,Shasta College,5536.0,11483.0,0.48,0.526534,46241.0,26.0,3202.0,Shasta College
53,Glendale Career College-North-West College-Bak...,387.0,398.0,0.63,0.551724,58741.0,28.0,934.0,Bakersfield College
54,Bakersfield College,16451.0,39530.0,0.36,0.630508,46274.0,32.0,8128.0,Bakersfield College
55,San Joaquin Valley College-Bakersfield,820.0,1285.0,0.49,0.590503,51804.0,19.0,8837.0,Bakersfield College
56,UEI College-Bakersfield,1385.0,2447.0,0.6,0.640536,46280.0,21.0,3361.0,Bakersfield College
58,Porterville College,2775.0,5624.0,0.35,0.630058,42464.0,19.0,1845.0,Porterville College
59,San Joaquin Valley College-Porterville,98.0,124.0,0.49,0.590503,51804.0,49.0,8837.0,Porterville College
78,Ventura College,9971.0,16174.0,0.36,0.566563,68802.0,27.0,3326.0,Ventura College
79,The Colleges of Law at Ventura,,,,,,,,Ventura College


In [15]:
# List each distinct scorecard school name once, in order of first appearance,
# to eyeball which schools were actually matched.
for school in cc_scorecard["school_name"].drop_duplicates():
    print(school)

Cuyamaca College
Grossmont College
Imperial Valley College
MiraCosta College
MiraCosta College - San Elijo Campus
Palomar College
San Diego City College
San Diego Mesa College
San Diego Miramar College
Southwestern College
Northwestern College
Southwestern Illinois College
Florida SouthWestern State College
Southwestern Community College
Southwestern Michigan College
Southwestern Oregon Community College
Southwestern Christian College
Butte College
Feather River Community College District
Lassen Community College
Mendocino College
College of the Redwoods
Shasta Bible College and Graduate School
Shasta College
College of the Siskiyous
Lake Tahoe Community College
American River College
Cosumnes River College
Sacramento City College
Folsom Lake College
Napa Valley College
Santa Rosa Junior College
Sierra College
Sierra College of Beauty
Solano Community College
Yuba College
Woodland Community College
Contra Costa College
Contra Costa Medical Career College
Diablo Valley College
Los Medan