# Columns and Expressions

## Prerequisites

Install Spark and Java in VM

In [None]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

W: Operation was interrupted before it could finish
^C


In [None]:
ls -l # list the working directory to check that the downloaded Spark .tgz is there

In [None]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [None]:
!pip install -q findspark

In [None]:

!pip install py4j

# For maps
!pip install folium
!pip install plotly

Define the environment

In [None]:
import os

# Tell PySpark where the JDK and the extracted Spark distribution live, and
# which arguments to use when it launches the JVM (local master).
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.0-bin-hadoop3",
    "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
})

Start Spark Session

---

In [None]:
import findspark

# Called without arguments, findspark picks up SPARK_HOME from the environment,
# which the "Define the environment" cell sets to an absolute path. Passing the
# relative folder name "spark-3.5.0-bin-hadoop3" only works while the working
# directory is /content and disagrees with that absolute SPARK_HOME.
findspark.init()

from pyspark.sql import SparkSession

# Create the session (getOrCreate reuses an already running one)
spark = (
    SparkSession.builder
    .appName("Columns and Expressions")
    .master("local[*]")
    .getOrCreate()
)

# Show the Spark version as the cell output
spark.version

In [None]:
# Display the SparkSession object as the cell output
spark

In [None]:
# For Pandas conversion optimization: enable Arrow-based columnar transfers.
# "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0; the
# PySpark-specific key below is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
# Import sql functions
from pyspark.sql.functions import *

Download datasets

In [None]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/cars.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/movies.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/more_cars.json -P /dataset
!ls /dataset

Read JSON file

In [None]:
# Load the cars dataset from its JSON file into a DataFrame
carsDF = (
    spark.read
    .option("inferSchema", True)
    .json("/dataset/cars.json")
)

## Examples

Select a column

In [None]:
# Project a single column and print the first 3 rows without truncating values
carsDF.select(col("Name")).show(3, truncate=False)

We can use various methods to refer to a column

In [None]:
# Three ways of referring to a column inside the same select()
carsDF.select(
    carsDF.Name,          # attribute access on the DataFrame
    col("Acceleration"),  # the col() function
    "Weight_in_lbs",      # the plain column name as a string
).show(n=3)

Expressions. We can use SQL-like expressions inside select to perform operations on a column

In [None]:
# Derived columns built two ways: Column arithmetic and a SQL string via expr()
carsWithKgDF = carsDF.select(
    col("Name"),
    col("Weight_in_lbs"),
    # Column arithmetic, result cast to int
    (col("Weight_in_lbs") / 2.2).cast("int").alias("Weight_in_kg_2"),
    # SQL expression string, result cast to string
    # NOTE(review): lbs / 1000 yields thousands of pounds, not metric tonnes —
    # confirm which unit Weight_in_T is meant to hold
    expr("Weight_in_lbs / 1000").cast("string").alias("Weight_in_T"),
)
carsWithKgDF.printSchema()
carsWithKgDF.show(n=3)

In [None]:
# selectExpr() takes SQL expression strings directly, without col()/expr() wrappers
carsWithSelectExprWeightsDF = carsDF.selectExpr("Name", "Weight_in_lbs", "Weight_in_lbs / 2.2")
carsWithSelectExprWeightsDF.show(n=3)

### DF Processing

Add a column

In [None]:
# withColumn() returns a new DataFrame with the extra column appended
carsWithKg3DF = carsDF.withColumn(
    "Weight_in_kg_3",
    col("Weight_in_lbs") / 2.2,
)
carsWithKg3DF.show(n=3)

Rename a column

In [None]:
# Rename Weight_in_lbs; the new name deliberately contains spaces
carsWithColumnRenamed = carsDF.withColumnRenamed(
    existing="Weight_in_lbs",
    new="Weight in pounds",
)
carsWithColumnRenamed.show(n=3)

In [None]:
# Careful with column names that contain spaces: selectExpr() reads its argument
# as a SQL expression, so the unquoted name below is not taken as one column name.
# Left commented out on purpose — presumably it raises an error; TODO confirm.
# carsWithColumnRenamed.selectExpr("Weight in pounds")

In [None]:
# The column name has special characters (spaces), so wrap it in backticks (``)
carsWithColumnRenamed.selectExpr("`Weight in pounds`").show(n=3)

Remove a column

In [None]:
# Print the schema of the renamed DataFrame to see which columns it contains
carsWithColumnRenamed.printSchema()

In [None]:
# drop() accepts several column names at once; print the schema to check the result
dropColsDF = carsWithColumnRenamed.drop(
    "Cylinders",
    "Displacement",
)
dropColsDF.printSchema()


Filtering

In [None]:
# The same predicate applied through filter() and through where()
nonUSCarsDF = carsDF.filter(col("Origin") != "USA")
nonUSCarsDF2 = carsDF.where(col("Origin") != "USA")
nonUSCarsDF.show(n=3)
# Print both row counts side by side for comparison
print("{} == {}".format(nonUSCarsDF.count(), nonUSCarsDF2.count()))

In [None]:
# The predicate can also be written as a SQL expression string
americanCarsDF = carsDF.filter("Origin = 'USA'")
americanCarsDF.show(n=3)

Chain filters

In [None]:
# Three ways of combining two conditions
# 1) chained filter() calls
americanPowerfulCarsDF = (
    carsDF
    .filter(col("Origin") == "USA")
    .filter(col("Horsepower") > 150)
)
# 2) a single Column predicate joined with &
americanPowerfulCarsDF2 = carsDF.filter(
    (col("Origin") == "USA") & (col("Horsepower") > 150)
)
# 3) a SQL expression string
americanPowerfulCarsDF3 = carsDF.filter("Origin = 'USA' and Horsepower > 150")
americanPowerfulCarsDF.show(n=3)

## Exercises
1. Read the movies DF and select 2 columns of your choice
2. Create another column with the total profit of each movie = US_Gross + Worldwide_Gross + US_DVD_Sales. Are you obtaining nulls? How can you solve it?
3. Select all COMEDY movies with IMDB rating above 6
Use as many versions as possible

Exercise 1

In [None]:
# Exercise 1: load the movies dataset, inspect its schema and select two columns
moviesDF = (
    spark.read
    .option("inferSchema", True)
    .json("/dataset/movies.json")
)

moviesDF.printSchema()
moviesDF.select("Director", "Distributor").show(2, truncate=False)

root
 |-- Creative_Type: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- IMDB_Rating: double (nullable = true)
 |-- IMDB_Votes: long (nullable = true)
 |-- MPAA_Rating: string (nullable = true)
 |-- Major_Genre: string (nullable = true)
 |-- Production_Budget: long (nullable = true)
 |-- Release_Date: string (nullable = true)
 |-- Rotten_Tomatoes_Rating: long (nullable = true)
 |-- Running_Time_min: long (nullable = true)
 |-- Source: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- US_DVD_Sales: long (nullable = true)
 |-- US_Gross: long (nullable = true)
 |-- Worldwide_Gross: long (nullable = true)

+--------+-----------+
|Director|Distributor|
+--------+-----------+
|NULL    |Gramercy   |
|NULL    |Strand     |
+--------+-----------+
only showing top 2 rows



Exercise 2

In [None]:
# Exercise 2: total profit = US_Gross + Worldwide_Gross + US_DVD_Sales.
# Adding a null to a number gives null, so one missing figure would blank out
# the whole total. coalesce() returns its first non-null argument, so wrapping
# the finished sum in a single-argument coalesce() changes nothing; instead each
# term gets its own fallback and a missing value counts as 0.
moviesDF = moviesDF.withColumn(
    "Total_Profit",
    coalesce(col("US_Gross"), lit(0))
    + coalesce(col("Worldwide_Gross"), lit(0))
    + coalesce(col("US_DVD_Sales"), lit(0))
)
moviesDF.show(5, False)

+--------------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------+--------------------------+------------+--------+---------------+------------+
|Creative_Type       |Director|Distributor|IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source             |Title                     |US_DVD_Sales|US_Gross|Worldwide_Gross|Total_Profit|
+--------------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------+--------------------------+------------+--------+---------------+------------+
|NULL                |NULL    |Gramercy   |6.1        |1071      |R          |NULL       |8000000          |12-Jun-98   |NULL                  |NULL            |NULL               |The Land Girls            |NULL        

Exercise 3

In [None]:
# Exercise 3 (SQL version): register the DataFrame as a temporary view so it
# can be queried by name from spark.sql()
moviesDF.createOrReplaceTempView("movies_table")

# Keep only the comedies whose IMDB rating is above 6
resultDF = spark.sql("""
    SELECT *
    FROM movies_table
    WHERE Major_Genre = 'Comedy' AND IMDB_Rating > 6
""")

# Print the first 5 matching rows without truncating values
resultDF.show(5, truncate=False)

+--------------------+----------------+-------------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------+--------------------------+------------+--------+---------------+
|Creative_Type       |Director        |Distributor  |IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source             |Title                     |US_DVD_Sales|US_Gross|Worldwide_Gross|
+--------------------+----------------+-------------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------+--------------------------+------------+--------+---------------+
|NULL                |NULL            |Lionsgate    |6.8        |865       |NULL       |Comedy     |250000           |28-Aug-98   |NULL                  |NULL            |NULL               |I Married a Strange Person|NULL       