In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
0% [Waiting for headers] [1 InRelease 0 B/114 kB 0%] [Connected to cloud.r-proj                                                                               Get:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
0% [Waiting for headers] [1 InRelease 0 B/114 kB 0%] [Connected to cloud.r-proj0% [Waiting for headers] [1 InRelease 114 kB/114 kB 100%] [Connected to cloud.r                                                                               Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
                                                                               Get:4 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
                                                                               Get:5 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
                                     

In [2]:
# Create the SparkSession used by every later cell (or reuse one that already exists).
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("demographicsFilter")
spark = session_builder.getOrCreate()

In [3]:
# Load the demographics CSV from S3 into a Spark DataFrame
from pyspark import SparkFiles

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/1/demographics.csv"

# Register the remote file with Spark, then look up the local copy by file name.
spark.sparkContext.addFile(url)
local_csv_path = SparkFiles.get("demographics.csv")

# The first row holds the column names; no schema is supplied or inferred here.
df = spark.read.csv(local_csv_path, header=True, sep=",")

# Preview the loaded rows
df.show()

+---+-----------------+---+--------+---------+--------+--------------------+---------------+------+--------------------+
| id|             name|age|height_m|weight_kg|children|          occupation|academic_degree|salary|            location|
+---+-----------------+---+--------+---------+--------+--------------------+---------------+------+--------------------+
|  1|    Glad Gavrieli| 38|    1.52|       74|       0|Computer Systems ...|       Bachelor|    78|           Louisiana|
|  2|  Henrieta Fittes| 34|    1.72|       39|       4|             Teacher|         Master|    44|            Illinois|
|  3|   Peyton Dulanty| 24|    1.80|       47|       5|Senior Quality En...|            PhD|    44|      North Carolina|
|  4|     Denna Morgen| 48|    1.81|       71|       5|   Account Executive|         Master|    81|          California|
|  5|    Camella Izaks| 34|    1.65|       60|       1|   Director of Sales|            PhD|    76|                Ohio|
|  6|     Shara Esposi| 49|    1

In [4]:
# Print the column names and data types of the DataFrame as loaded from the CSV.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- height_m: string (nullable = true)
 |-- weight_kg: string (nullable = true)
 |-- children: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- academic_degree: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- location: string (nullable = true)



In [5]:
# The numeric columns need explicit type conversion when you use Colab.
from pyspark.sql.types import IntegerType, DoubleType

# Target Spark type for each column to convert, applied in this order.
column_types = {
    "id": IntegerType(),
    "age": IntegerType(),
    "weight_kg": IntegerType(),
    "children": IntegerType(),
    "salary": IntegerType(),
    "height_m": DoubleType(),
}

# withColumn with an existing name replaces that column in place,
# so the column order of the DataFrame is unchanged.
for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, df[column_name].cast(spark_type))


In [6]:
# Print the schema again to check the column types after the casts in the previous cell.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height_m: double (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- children: integer (nullable = true)
 |-- occupation: string (nullable = true)
 |-- academic_degree: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- location: string (nullable = true)



In [7]:
# Which occupation had the highest salary?
# Sort from highest to lowest salary, then keep only the first row.
salary_descending = df.orderBy(df["Salary"].desc())
top_row = salary_descending.select("occupation", "Salary").limit(1)
top_row.show()

+-----------------+------+
|       occupation|Salary|
+-----------------+------+
|Chemical Engineer|   120|
+-----------------+------+



In [8]:
# Which occupation had the lowest salary?
# Sort from lowest to highest salary (the default order), then keep only the first row.
salary_ascending = df.orderBy(df["Salary"])
bottom_row = salary_ascending.select("occupation", "Salary").limit(1)
bottom_row.show()

+----------------+------+
|      occupation|Salary|
+----------------+------+
|Product Engineer|    40|
+----------------+------+



In [9]:
# Compute the mean of the Salary column over the whole dataset
from pyspark.sql.functions import mean

salary_mean = mean("Salary")
df.select(salary_mean).show()

+-----------+
|avg(Salary)|
+-----------+
|     79.475|
+-----------+



In [10]:
# What are the max and min of the Salary column?
# The Spark aggregates are reached through a module alias instead of being
# imported by name: `from pyspark.sql.functions import max, min` would replace
# Python's built-in max() and min() for every later cell in the notebook.
from pyspark.sql import functions as F
df.select(F.max("Salary"), F.min("Salary")).show()

+-----------+-----------+
|max(Salary)|min(Salary)|
+-----------+-----------+
|        120|         40|
+-----------+-----------+



In [11]:
# List the occupation of every row whose salary is above 80k
# (the threshold is 80 here, so Salary is presumably stored in thousands).
from pyspark.sql.functions import count  # imported but not used in this cell

above_80 = df["Salary"] > 80
df.where(above_80).select("occupation").show()

+--------------------+
|          occupation|
+--------------------+
|   Account Executive|
|Sales Representative|
|        Developer II|
|     Design Engineer|
|     Project Manager|
|       Programmer II|
|Sales Representative|
|Physical Therapy ...|
|Payment Adjustmen...|
|Mechanical System...|
|     Media Manager I|
|   Account Executive|
|           Professor|
|Community Outreac...|
| Clinical Specialist|
|Human Resources A...|
|Nuclear Power Eng...|
|      Civil Engineer|
|Human Resources M...|
|Senior Cost Accou...|
+--------------------+
only showing top 20 rows



In [12]:
# BONUS
# Average age and height for each academic degree type.
# HINT: this is solved with a `groupBy` on the degree column.
degree_groups = df.groupBy("academic_degree")

# avg() with no arguments averages every numeric column in each group.
avg_df = degree_groups.avg()

# Keep only the degree label and the two averages the question asks for.
columns_to_show = ["academic_degree", "avg(age)", "avg(height_m)"]
avg_df.select(*columns_to_show).show()

+---------------+------------------+------------------+
|academic_degree|          avg(age)|     avg(height_m)|
+---------------+------------------+------------------+
|            PhD| 42.87818696883853|1.7537393767705372|
|         Master|42.105095541401276|1.7606050955414014|
|       Bachelor| 43.85585585585586|1.7371771771771771|
+---------------+------------------+------------------+

