# DataFrames Basics

## Prerrequisites

Install Spark and Java in VM

In [None]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [None]:
ls -l # check the .tgz is there

total 391016
drwxr-xr-x 1 root root      4096 Jan 10 14:23 [0m[01;34msample_data[0m/
-rw-r--r-- 1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz


In [None]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [None]:
!pip install -q findspark

In [None]:

!pip install py4j

# For maps
!pip install folium
!pip install plotly



Define the environment

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

Start Spark Session

---

In [None]:
import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("DataFrames Basics") \
        .master("local[*]") \
        .getOrCreate()

spark.version

'3.5.0'

In [None]:
spark

In [None]:
# For Pandas conversion optimization
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [None]:
# Import sql functions
from pyspark.sql.functions import *

Download datasets

In [None]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/cars.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/movies.json -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/bank.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/vehicles.csv -P /dataset
!ls /dataset

bank.csv  cars.json  movies.json  vehicles.csv


In [None]:
ls -l /dataset

total 1784
-rw-r--r-- 1 root root  461474 Jan 11 17:50 bank.csv
-rw-r--r-- 1 root root   74910 Jan 11 17:50 cars.json
-rw-r--r-- 1 root root 1274347 Jan 11 17:50 movies.json
-rw-r--r-- 1 root root    4370 Jan 11 17:50 vehicles.csv


In [34]:
drugDeathsDF = spark.read.option('header', 'true').option('delimitter', ',').option('inferSchema', 'true').csv('dataset/drug_deaths.csv')

In [33]:
drugDeathsDF.show()

+----------+-------------+--------------------+--------+----+------+------------+-------------+---------------+--------------+---------+-----------+---------+---------------+--------------------+-----------+----------+------------+-----------+--------------------+---------------+------+-------+--------+-----------------+---------+-----------+-------+-----------+--------------+---------+------+------+------------------+-------------+-----+---------+---------+-------------+-------------+----------------+-------------+
|       _c0|           ID|                Date|DateType| Age|   Sex|        Race|ResidenceCity|ResidenceCounty|ResidenceState|DeathCity|DeathCounty| Location|LocationifOther| DescriptionofInjury|InjuryPlace|InjuryCity|InjuryCounty|InjuryState|                 COD|OtherSignifican|Heroin|Cocaine|Fentanyl|Fentanyl_Analogue|Oxycodone|Oxymorphone|Ethanol|Hydrocodone|Benzodiazepine|Methadone|Amphet|Tramad|Morphine_NotHeroin|Hydromorphone|Other|OpiateNOS|AnyOpioid|MannerofDeath| D

In [37]:
DF1 = drugDeathsDF.groupBy('Race').agg(count('Sex')).orderBy(desc('count(Sex)'))
DF1.show()

+--------------------+----------+
|                Race|count(Sex)|
+--------------------+----------+
|               White|      4002|
|     Hispanic, White|       560|
|               Black|       433|
|     Hispanic, Black|        24|
|             Unknown|        23|
|        Asian, Other|        18|
|        Asian Indian|        14|
|               Other|        11|
|                NULL|        10|
|             Chinese|         2|
|            Hawaiian|         1|
|Native American, ...|         1|
+--------------------+----------+



In [31]:
DF2 = drugDeathsDF.select('Age', 'ResidenceCity')
EdadDF = DF2.groupBy('ResidenceCity').agg(avg('Age').alias('media_edad')).orderBy(desc(avg('Age')))

EdadDF.show()


+-----------------+----------+
|    ResidenceCity|media_edad|
+-----------------+----------+
|ARLINGTON HEIGHTS|      72.0|
|   ALFRED STATION|      65.0|
|        WELLESLEY|      64.0|
|     NORTH WINDAM|      64.0|
|            SAKEM|      63.0|
|          SEBRING|      62.0|
|    OLD GREENWICH|      59.0|
|           NAPLES|      59.0|
|       SOUTH LYME|      59.0|
|          JACKSON|      59.0|
|   EAST WOODSTOCK|      58.0|
|           ROSCOE|      58.0|
|        CHEPACHET|      58.0|
|       WASHINGTON|      57.5|
|        SOUTHPORT|      57.0|
|           NUTLEY|      57.0|
|          CHELSEA|      57.0|
|        BLANDFORD|      57.0|
|         ROCKFALL|      57.0|
|            CHASE|      56.0|
+-----------------+----------+
only showing top 20 rows



In [39]:
personaDF = drugDeathsDF.groupBy('Sex').agg(count('ID').alias('Sex_count')).orderBy(desc(count('ID')))
personaDF.show()

+-------+---------+
|    Sex|Sex_count|
+-------+---------+
|   NULL|    15150|
|   Male|     3773|
| Female|     1325|
|Unknown|        1|
+-------+---------+

