In [1]:
/*
* Author: Mike Urciuoli
* email: urciuolim@gmail.com
* description: The purpose of this notebook is to show how to clean the Census educational attainment dataset for
*              usage in a county prediction model. The overall goal of the project is to predict some feature
*              of a county based on other input features. The dataset is split into files by year, and each year the
*              overall schema may change, so cleaning each individual year seems to be best. Ultimately we want educational
*              attainment for two age groups, where each attainment level is represented by a percentage.
*/

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.round
// Output column names shared by every yearly cell in this notebook.
// Ages 18-24: population count followed by the attainment-level columns.
val pop_18_24: String = "Pop_18_24"
val per_less_hs_grad_18_24: String = "Less_HS_Grad_18_24"
val per_hs_grad_18_24: String = "HS_Grad_18_24"
val per_some_college_18_24: String = "Some_College_18_24"
val per_bachelor_18_24: String = "Bachelor_18_24"
// Ages 25 and over: population count followed by the attainment-level columns.
val pop_25: String = "Pop_25plus"
val per_less_9th_25: String = "Less_9th_25plus"
val per_9th_12th_25: String = "9_12th_25plus"
val per_hs_25: String = "HS_Grad_25plus"
val per_some_college_25: String = "Some_College_25plus"
val per_assoc_25: String = "Associate_25plus"
val per_bach_25: String = "Bachelor_25plus"
val per_grad_25: String = "Gradplus_25plus"

// Cleaned 2011 educational attainment data: FIPS, Year, then the two population counts and
// their attainment percentages, renamed to the shared output column names.
val eduParsed11 = {
    // (source column, output column name, output type), listed in output order.
    val estimates = Seq(
        ("S1501_C01_001E", pop_18_24, "Int"),
        ("S1501_C01_002E", per_less_hs_grad_18_24, "Double"),
        ("S1501_C01_003E", per_hs_grad_18_24, "Double"),
        ("S1501_C01_004E", per_some_college_18_24, "Double"),
        ("S1501_C01_005E", per_bachelor_18_24, "Double"),
        ("S1501_C01_006E", pop_25, "Int"),
        ("S1501_C01_007E", per_less_9th_25, "Double"),
        ("S1501_C01_008E", per_9th_12th_25, "Double"),
        ("S1501_C01_009E", per_hs_25, "Double"),
        ("S1501_C01_010E", per_some_college_25, "Double"),
        ("S1501_C01_011E", per_assoc_25, "Double"),
        ("S1501_C01_012E", per_bach_25, "Double"),
        ("S1501_C01_013E", per_grad_25, "Double")
    )
    val outputColumns = Seq($"FIPS", $"Year".cast("Int")) ++
        estimates.map { case (code, name, sqlType) => col(code).as(name).cast(sqlType) }
    val rawRows = spark.read.option("header", "true").
        csv("s3://agimodeltrainer/DATA/EDU/ACSST5Y2011.csv")
    rawRows.
        filter($"GEO_ID" =!= "id"). // Removes 2nd header line
        withColumn("FIPS", substring($"GEO_ID", 10, 5)). // 5 characters of GEO_ID starting at position 10
        withColumn("Year", lit(2011)).
        select(outputColumns: _*).
        // Rows with null values seem to be where population = 0 (and percentages are NaN),
        // so nulls in the numeric columns are replaced with 0.
        na.fill(0)
}
eduParsed11.show
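// Use these lines to search for rows that contain null values.
// NOTE: `eduRaw11` is not defined in the visible cells; it presumably refers to the 2011 frame
// before `.na.fill(0)` is applied -- TODO confirm before uncommenting.
// eduRaw11.count
// val noNA = eduRaw11.na.drop
// eduRaw11.unionAll(noNA).except(eduRaw11.intersect(noNA)).show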

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
15,application_1598467819421_0016,spark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.round
pop_18_24: String = Pop_18_24
per_less_hs_grad_18_24: String = Less_HS_Grad_18_24
per_hs_grad_18_24: String = HS_Grad_18_24
per_some_college_18_24: String = Some_College_18_24
per_bachelor_18_24: String = Bachelor_18_24
pop_25: String = Pop_25plus
per_less_9th_25: String = Less_9th_25plus
per_9th_12th_25: String = 9_12th_25plus
per_hs_25: String = HS_Grad_25plus
per_some_college_25: String = Some_College_25plus
per_assoc_25: String = Associate_25plus
per_bach_25: String = Bachelor_25plus
per_grad_25: String = Gradplus_25plus
eduParsed11: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_2

In [2]:
// Cleaned 2012 educational attainment data: FIPS, Year, then the two population counts and
// their attainment percentages, renamed to the shared output column names.
val eduParsed12 = {
    // (source column, output column name, output type), listed in output order.
    val estimates = Seq(
        ("S1501_C01_001E", pop_18_24, "Int"),
        ("S1501_C01_002E", per_less_hs_grad_18_24, "Double"),
        ("S1501_C01_003E", per_hs_grad_18_24, "Double"),
        ("S1501_C01_004E", per_some_college_18_24, "Double"),
        ("S1501_C01_005E", per_bachelor_18_24, "Double"),
        ("S1501_C01_006E", pop_25, "Int"),
        ("S1501_C01_007E", per_less_9th_25, "Double"),
        ("S1501_C01_008E", per_9th_12th_25, "Double"),
        ("S1501_C01_009E", per_hs_25, "Double"),
        ("S1501_C01_010E", per_some_college_25, "Double"),
        ("S1501_C01_011E", per_assoc_25, "Double"),
        ("S1501_C01_012E", per_bach_25, "Double"),
        ("S1501_C01_013E", per_grad_25, "Double")
    )
    val outputColumns = Seq($"FIPS", $"Year".cast("Int")) ++
        estimates.map { case (code, name, sqlType) => col(code).as(name).cast(sqlType) }
    val rawRows = spark.read.option("header", "true").
        csv("s3://agimodeltrainer/DATA/EDU/ACSST5Y2012.csv")
    rawRows.
        filter($"GEO_ID" =!= "id"). // Removes 2nd header line
        withColumn("FIPS", substring($"GEO_ID", 10, 5)). // 5 characters of GEO_ID starting at position 10
        withColumn("Year", lit(2012)).
        select(outputColumns: _*).
        // Nulls left by the casts in the numeric columns are replaced with 0.
        na.fill(0)
}
eduParsed12.show

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

eduParsed12: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_24|Pop_25plus|Less_9th_25plus|9_12th_25plus|HS_Grad_25plus|Some_College_25plus|Associate_25plus|Bachelor_25plus|Gradplus_25plus|
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
|05105|2012|      817|              20.9|         38.3|              36.5|           4.3|      7253|            6.7|         11.3|          44.3|               23.1|             4.8|            6.5|            3.2|
|05107|2012|     1953|              37.4|        

In [3]:
// Cleaned 2013 educational attainment data: FIPS, Year, then the two population counts and
// their attainment percentages, renamed to the shared output column names.
val eduParsed13 = {
    // (source column, output column name, output type), listed in output order.
    val estimates = Seq(
        ("S1501_C01_001E", pop_18_24, "Int"),
        ("S1501_C01_002E", per_less_hs_grad_18_24, "Double"),
        ("S1501_C01_003E", per_hs_grad_18_24, "Double"),
        ("S1501_C01_004E", per_some_college_18_24, "Double"),
        ("S1501_C01_005E", per_bachelor_18_24, "Double"),
        ("S1501_C01_006E", pop_25, "Int"),
        ("S1501_C01_007E", per_less_9th_25, "Double"),
        ("S1501_C01_008E", per_9th_12th_25, "Double"),
        ("S1501_C01_009E", per_hs_25, "Double"),
        ("S1501_C01_010E", per_some_college_25, "Double"),
        ("S1501_C01_011E", per_assoc_25, "Double"),
        ("S1501_C01_012E", per_bach_25, "Double"),
        ("S1501_C01_013E", per_grad_25, "Double")
    )
    val outputColumns = Seq($"FIPS", $"Year".cast("Int")) ++
        estimates.map { case (code, name, sqlType) => col(code).as(name).cast(sqlType) }
    val rawRows = spark.read.option("header", "true").
        csv("s3://agimodeltrainer/DATA/EDU/ACSST5Y2013.csv")
    rawRows.
        filter($"GEO_ID" =!= "id"). // Removes 2nd header line
        withColumn("FIPS", substring($"GEO_ID", 10, 5)). // 5 characters of GEO_ID starting at position 10
        withColumn("Year", lit(2013)).
        select(outputColumns: _*).
        // Nulls left by the casts in the numeric columns are replaced with 0.
        na.fill(0)
}
eduParsed13.show

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

eduParsed13: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_24|Pop_25plus|Less_9th_25plus|9_12th_25plus|HS_Grad_25plus|Some_College_25plus|Associate_25plus|Bachelor_25plus|Gradplus_25plus|
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
|13121|2013|   101833|              15.1|         22.7|              48.0|          14.2|    622408|            3.6|          6.1|          18.3|               18.3|             5.5|           29.7|           18.5|
|13123|2013|     2056|              26.6|        

In [4]:
// Cleaned 2014 educational attainment data: FIPS, Year, then the two population counts and
// their attainment percentages, renamed to the shared output column names.
val eduParsed14 = {
    // (source column, output column name, output type), listed in output order.
    val estimates = Seq(
        ("S1501_C01_001E", pop_18_24, "Int"),
        ("S1501_C01_002E", per_less_hs_grad_18_24, "Double"),
        ("S1501_C01_003E", per_hs_grad_18_24, "Double"),
        ("S1501_C01_004E", per_some_college_18_24, "Double"),
        ("S1501_C01_005E", per_bachelor_18_24, "Double"),
        ("S1501_C01_006E", pop_25, "Int"),
        ("S1501_C01_007E", per_less_9th_25, "Double"),
        ("S1501_C01_008E", per_9th_12th_25, "Double"),
        ("S1501_C01_009E", per_hs_25, "Double"),
        ("S1501_C01_010E", per_some_college_25, "Double"),
        ("S1501_C01_011E", per_assoc_25, "Double"),
        ("S1501_C01_012E", per_bach_25, "Double"),
        ("S1501_C01_013E", per_grad_25, "Double")
    )
    val outputColumns = Seq($"FIPS", $"Year".cast("Int")) ++
        estimates.map { case (code, name, sqlType) => col(code).as(name).cast(sqlType) }
    val rawRows = spark.read.option("header", "true").
        csv("s3://agimodeltrainer/DATA/EDU/ACSST5Y2014.csv")
    rawRows.
        filter($"GEO_ID" =!= "id"). // Removes 2nd header line
        withColumn("FIPS", substring($"GEO_ID", 10, 5)). // 5 characters of GEO_ID starting at position 10
        withColumn("Year", lit(2014)).
        select(outputColumns: _*).
        // Nulls left by the casts in the numeric columns are replaced with 0.
        na.fill(0)
}
eduParsed14.show

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

eduParsed14: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_24|Pop_25plus|Less_9th_25plus|9_12th_25plus|HS_Grad_25plus|Some_College_25plus|Associate_25plus|Bachelor_25plus|Gradplus_25plus|
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
|13161|2014|     1290|              41.4|         26.4|              24.3|           7.8|      9602|            8.3|         12.8|          42.8|               18.5|             8.7|            4.9|            4.1|
|13163|2014|     1618|              29.7|        

In [5]:
// The 2015 file lists totals for values that earlier years reported as percentages
// (e.g. column S1501_C01_002E used to be a % of the 18-24 population; here it is a total #).
// Each percentage is therefore derived as total / age-group population * 100, rounded to
// one decimal place; the rest of the cleaning matches the earlier years.
val eduParsed15 = {
    val total18to24 = "S1501_C01_001E"
    val total25plus = "S1501_C01_006E"
    // Population estimate kept as an integer count.
    def population(code: String, name: String) = col(code).as(name).cast("Int")
    // Total in `code` expressed as a percentage of the population in `total`.
    def share(code: String, total: String, name: String) =
        (round((col(code)/col(total))*lit(100), 1)).as(name).cast("Double")
    val outputColumns = Seq(
        $"FIPS",
        $"Year".cast("Int"),
        population(total18to24, pop_18_24),
        share("S1501_C01_002E", total18to24, per_less_hs_grad_18_24),
        share("S1501_C01_003E", total18to24, per_hs_grad_18_24),
        share("S1501_C01_004E", total18to24, per_some_college_18_24),
        share("S1501_C01_005E", total18to24, per_bachelor_18_24),
        population(total25plus, pop_25),
        share("S1501_C01_007E", total25plus, per_less_9th_25),
        share("S1501_C01_008E", total25plus, per_9th_12th_25),
        share("S1501_C01_009E", total25plus, per_hs_25),
        share("S1501_C01_010E", total25plus, per_some_college_25),
        share("S1501_C01_011E", total25plus, per_assoc_25),
        share("S1501_C01_012E", total25plus, per_bach_25),
        share("S1501_C01_013E", total25plus, per_grad_25)
    )
    val rawRows = spark.read.option("header", "true").
        csv("s3://agimodeltrainer/DATA/EDU/ACSST5Y2015.csv")
    rawRows.
        filter($"GEO_ID" =!= "id"). // Removes 2nd header line
        withColumn("FIPS", substring($"GEO_ID", 10, 5)). // 5 characters of GEO_ID starting at position 10
        withColumn("Year", lit(2015)).
        select(outputColumns: _*).
        // Nulls in the numeric columns (presumably including shares of a zero population --
        // TODO confirm) are replaced with 0.
        na.fill(0)
}
eduParsed15.show

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

eduParsed15: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_24|Pop_25plus|Less_9th_25plus|9_12th_25plus|HS_Grad_25plus|Some_College_25plus|Associate_25plus|Bachelor_25plus|Gradplus_25plus|
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
|01025|2015|     2188|              23.5|         55.9|              19.5|           1.1|     17128|            4.7|         14.7|          43.9|               16.7|             7.0|            8.4|            4.5|
|01027|2015|     1027|              18.9|        

In [6]:
// Cleaned 2016 educational attainment data. As in the 2015 cell, the attainment columns are
// treated as totals: each percentage is derived as total / age-group population * 100,
// rounded to one decimal place.
val eduParsed16 = {
    val total18to24 = "S1501_C01_001E"
    val total25plus = "S1501_C01_006E"
    // Population estimate kept as an integer count.
    def population(code: String, name: String) = col(code).as(name).cast("Int")
    // Total in `code` expressed as a percentage of the population in `total`.
    def share(code: String, total: String, name: String) =
        (round((col(code)/col(total))*lit(100), 1)).as(name).cast("Double")
    val outputColumns = Seq(
        $"FIPS",
        $"Year".cast("Int"),
        population(total18to24, pop_18_24),
        share("S1501_C01_002E", total18to24, per_less_hs_grad_18_24),
        share("S1501_C01_003E", total18to24, per_hs_grad_18_24),
        share("S1501_C01_004E", total18to24, per_some_college_18_24),
        share("S1501_C01_005E", total18to24, per_bachelor_18_24),
        population(total25plus, pop_25),
        share("S1501_C01_007E", total25plus, per_less_9th_25),
        share("S1501_C01_008E", total25plus, per_9th_12th_25),
        share("S1501_C01_009E", total25plus, per_hs_25),
        share("S1501_C01_010E", total25plus, per_some_college_25),
        share("S1501_C01_011E", total25plus, per_assoc_25),
        share("S1501_C01_012E", total25plus, per_bach_25),
        share("S1501_C01_013E", total25plus, per_grad_25)
    )
    val rawRows = spark.read.option("header", "true").
        csv("s3://agimodeltrainer/DATA/EDU/ACSST5Y2016.csv")
    rawRows.
        filter($"GEO_ID" =!= "id"). // Removes 2nd header line
        withColumn("FIPS", substring($"GEO_ID", 10, 5)). // 5 characters of GEO_ID starting at position 10
        withColumn("Year", lit(2016)).
        select(outputColumns: _*).
        // Nulls in the numeric columns (presumably including shares of a zero population --
        // TODO confirm) are replaced with 0.
        na.fill(0)
}
eduParsed16.show

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

eduParsed16: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_24|Pop_25plus|Less_9th_25plus|9_12th_25plus|HS_Grad_25plus|Some_College_25plus|Associate_25plus|Bachelor_25plus|Gradplus_25plus|
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
|01131|2016|     1080|              22.3|         44.8|              30.0|           2.9|      7441|            6.3|         13.8|          44.9|               17.4|             6.3|            7.3|            4.0|
|01133|2016|     1699|              23.5|        

In [7]:
// Cleaned 2017 educational attainment data. The population counts are still read from the
// S1501_C01_* columns, while the attainment percentages are read from the S1501_C02_* columns.
val eduParsed17 = {
    // (source column, output column name, output type), listed in output order.
    val estimates = Seq(
        ("S1501_C01_001E", pop_18_24, "Int"),
        ("S1501_C02_002E", per_less_hs_grad_18_24, "Double"),
        ("S1501_C02_003E", per_hs_grad_18_24, "Double"),
        ("S1501_C02_004E", per_some_college_18_24, "Double"),
        ("S1501_C02_005E", per_bachelor_18_24, "Double"),
        ("S1501_C01_006E", pop_25, "Int"),
        ("S1501_C02_007E", per_less_9th_25, "Double"),
        ("S1501_C02_008E", per_9th_12th_25, "Double"),
        ("S1501_C02_009E", per_hs_25, "Double"),
        ("S1501_C02_010E", per_some_college_25, "Double"),
        ("S1501_C02_011E", per_assoc_25, "Double"),
        ("S1501_C02_012E", per_bach_25, "Double"),
        ("S1501_C02_013E", per_grad_25, "Double")
    )
    val outputColumns = Seq($"FIPS", $"Year".cast("Int")) ++
        estimates.map { case (code, name, sqlType) => col(code).as(name).cast(sqlType) }
    val rawRows = spark.read.option("header", "true").
        csv("s3://agimodeltrainer/DATA/EDU/ACSST5Y2017.csv")
    rawRows.
        filter($"GEO_ID" =!= "id"). // Removes 2nd header line
        withColumn("FIPS", substring($"GEO_ID", 10, 5)). // 5 characters of GEO_ID starting at position 10
        withColumn("Year", lit(2017)).
        select(outputColumns: _*).
        // Nulls left by the casts in the numeric columns are replaced with 0.
        na.fill(0)
}
eduParsed17.show

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

eduParsed17: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_24|Pop_25plus|Less_9th_25plus|9_12th_25plus|HS_Grad_25plus|Some_College_25plus|Associate_25plus|Bachelor_25plus|Gradplus_25plus|
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
|01005|2017|     2253|              25.9|         42.3|              28.6|           3.2|     18434|            8.5|         18.4|          35.5|               18.2|             7.3|            7.6|            4.4|
|01007|2017|     1993|              21.7|        

In [8]:
// Stack the yearly frames (2011 first, then 2012-2017) into a single dataset.
// `union` matches columns by position, not by name; that is safe here because every yearly
// cell selects the same columns in the same order.
val toAdd = Array(eduParsed12, eduParsed13, eduParsed14, eduParsed15, eduParsed16, eduParsed17)
// Fold instead of reassigning a `var` in a loop: builds the same left-nested chain of unions.
val eduParsedTotal = toAdd.foldLeft(eduParsed11)((combined, next) => combined.union(next))
// Order by a seeded random column so the preview mixes rows from different years.
eduParsedTotal.orderBy(rand(1234L)).show

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

toAdd: Array[org.apache.spark.sql.DataFrame] = Array([FIPS: string, Year: int ... 13 more fields], [FIPS: string, Year: int ... 13 more fields], [FIPS: string, Year: int ... 13 more fields], [FIPS: string, Year: int ... 13 more fields], [FIPS: string, Year: int ... 13 more fields], [FIPS: string, Year: int ... 13 more fields])
eduParsedTotal: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop_18_24|Less_HS_Grad_18_24|HS_Grad_18_24|Some_College_18_24|Bachelor_18_24|Pop_25plus|Less_9th_25plus|9_12th_25plus|HS_Grad_25plus|Some_College_25plus|Associate_25plus|Bachelor_25plus|Gradplus_25plus|
+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+--

In [9]:
// Summary statistics (count, mean, stddev, ...) for each column of the combined dataset. A few points:
// 1. Every column shown reports the same count, i.e. no nulls remain after na.fill(0).
// 2. FIPS is a string identifier, so its mean and stddev are not meaningful.
// 3. The mean Year is ~2014, the midpoint of 2011-2017, which suggests each year contributes
//    a similar number of rows.
// The percentage means are simple per-row averages; they are not weighted by population.

val cols = eduParsedTotal.columns
// The describe() table is too wide to read in one piece, so it is printed as two tables:
// the first half of the columns, then the remaining ones.
Seq(cols.take(cols.size/2), cols.drop(cols.size/2)).foreach { half =>
    eduParsedTotal.describe(half: _*).show()
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

cols: Array[String] = Array(FIPS, Year, Pop_18_24, Less_HS_Grad_18_24, HS_Grad_18_24, Some_College_18_24, Bachelor_18_24, Pop_25plus, Less_9th_25plus, 9_12th_25plus, HS_Grad_25plus, Some_College_25plus, Associate_25plus, Bachelor_25plus, Gradplus_25plus)
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+
|summary|              FIPS|              Year|         Pop_18_24|Less_HS_Grad_18_24|    HS_Grad_18_24|Some_College_18_24|   Bachelor_18_24|
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+
|  count|             22543|             22543|             22543|             22543|            22543|             22543|            22543|
|   mean|  31396.3048396398| 2013.999733841991|  9761.21146253826|18.841347646719605|35.02992059619393|40.059424211506915|6.060258173268865|
| stddev|16290.446102103326|2.0000665236

In [12]:
// Persist the combined dataset as Parquet. No save mode is set, so the writer's default applies
// (per the Spark DataFrameWriter docs, the write fails if the target path already exists).
eduParsedTotal.write.format("parquet").save("s3://agimodeltrainer/Clean_Data/Educational_Attainment.parquet")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
// Sanity check: read the Parquet output back and inspect it to confirm the write succeeded.
val test = spark.read.format("parquet").load("s3://agimodeltrainer/Clean_Data/Educational_Attainment.parquet")
test.printSchema()
test.show()
// Total number of rows, then the number of distinct FIPS codes and distinct years.
test.count()
test.select("FIPS").distinct().count()
test.select("Year").distinct().count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

test: org.apache.spark.sql.DataFrame = [FIPS: string, Year: int ... 13 more fields]
root
 |-- FIPS: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Pop_18_24: integer (nullable = true)
 |-- Less_HS_Grad_18_24: double (nullable = true)
 |-- HS_Grad_18_24: double (nullable = true)
 |-- Some_College_18_24: double (nullable = true)
 |-- Bachelor_18_24: double (nullable = true)
 |-- Pop_25plus: integer (nullable = true)
 |-- Less_9th_25plus: double (nullable = true)
 |-- 9_12th_25plus: double (nullable = true)
 |-- HS_Grad_25plus: double (nullable = true)
 |-- Some_College_25plus: double (nullable = true)
 |-- Associate_25plus: double (nullable = true)
 |-- Bachelor_25plus: double (nullable = true)
 |-- Gradplus_25plus: double (nullable = true)

+-----+----+---------+------------------+-------------+------------------+--------------+----------+---------------+-------------+--------------+-------------------+----------------+---------------+---------------+
| FIPS|Year|Pop