# Module 2 - Part 2

### Working with Structured operations in PySpark

1. Columns and Expressions
1. Filter and Where Conditions
1. Distinct, Drop Duplicates, Order By
1. Rows and Union
1. Adding, Renaming, Dropping Columns
1. Working with missing and "bad" data
1. User-defined functions

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, DateType, BooleanType
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the persons dataset; every field is declared nullable.
per_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("fav_movies", ArrayType(StringType()), True)
    .add("salary", FloatType(), True)
    .add("image_url", StringType(), True)
    .add("date_of_birth", DateType(), True)
    .add("active", BooleanType(), True)
)

# Read the multi-line JSON file using the schema above rather than inferring one.
json_file_path = "dbfs:/FileStore/module2/persons.json"
persons_df = (
    spark.read
    .schema(per_schema)
    .option("multiLine", "True")
    .json(json_file_path)
)


In [0]:
persons_df.printSchema()


In [0]:
persons_df.show(7)
persons_df.show(5, truncate=False)

##### 1. Columns and Expressions

In [0]:
from pyspark.sql.functions import col, expr

In [0]:
persons_df.select(col("first_name"), col("last_name"), col("date_of_birth")).show(5)

In [0]:
persons_df.select(expr("first_name"), expr("last_name"), expr("date_of_birth")).show(5)

In [0]:
from pyspark.sql.functions import concat_ws

In [0]:
# Build a full name and a salary raised by 10%.
# The alias must be attached to the salary expression itself (hence the extra
# parentheses around it): calling .alias() after select(...) aliases the whole
# DataFrame and leaves the computed column with an auto-generated name.
(persons_df.select(concat_ws(' ', col("first_name"), col("last_name")).alias("full_name"),
                 col("salary"),
                 (col("salary") * 0.10 + col("salary")).alias("salary_increase"))).show(10)

In [0]:
# Same projection, with the 10% raise written as a SQL expression string.
persons_df.select(
    concat_ws(' ', "first_name", "last_name").alias("full_name"),
    "salary",
    expr("salary * 0.10 + salary").alias("salary_increase"),
).show(10)

##### 2. Filter and Where Conditions

In [0]:
#built-in functions: select, order by, group by, where, filter

persons_df.filter("salary <= 3000").show(10)

In [0]:
persons_df.where("salary <= 3000").show(10)

In [0]:
# Rows with a salary of at most 3000 whose boolean "active" flag is true.
persons_df.where(
    (col("salary") <= 3000) & col("active")
).show(10)

In [0]:
from pyspark.sql.functions import year

In [0]:
# People born in either 2000 or 1989.
persons_df.filter(
    year("date_of_birth").isin(2000, 1989)
).show()

In [0]:
#nested arrays
from pyspark.sql.functions import array_contains

In [0]:
persons_df.where(array_contains(persons_df.fav_movies, "Land of the Lost")).show()

##### 3. Distinct, Drop Duplicates, Order By

In [0]:
from pyspark.sql.functions import count, desc

In [0]:
persons_df.select("active").show(10)

In [0]:
# get unique values
persons_df.select("active").distinct().show()

In [0]:
# Project first name, birth year (aliased "year") and the active flag,
# then sort by year and, within a year, by first name.
persons_df.select(
    "first_name",
    year("date_of_birth").alias("year"),
    "active",
).orderBy("year", "first_name").show(10)

In [0]:
# Keep a single row per (year, active) combination, then sort the result.
# NOTE(review): which first_name survives for a given pair is presumably not
# guaranteed by dropDuplicates on a column subset — confirm before relying on it.
dropped_df = (
    persons_df
    .select("first_name", year("date_of_birth").alias("year"), "active")
    .dropDuplicates(["year", "active"])
    .orderBy("year", "first_name")
)

In [0]:
dropped_df.show()

In [0]:
# Order by year in DESCENDING order (ascending = False), i.e. latest birth years first.
# Select first name, birth year (aliased "year") and the active flag.
(persons_df.select(col("first_name"),
                  year(col("date_of_birth")).alias("year"),
                  col("active")).orderBy("year", ascending = False)).show(10)

##### 4. Rows and union

In [0]:
from pyspark.sql import Row

In [0]:
person_row = Row(101, "Robert","Golob",["Lord of The Rings 1", "Lord of The Rings 2"], 10000.51, "http://gibanje-svoboda.si","1967-02-22", True)

In [0]:
# List of positional Rows (no field names). The values follow the column order
# id, first_name, last_name, fav_movies, salary, image_url, date_of_birth, active
# that is supplied when the DataFrame is created; date_of_birth is a plain string here.
person_rows_list = [Row(102, "Mike","McCready",["Lord of The Rings 1", "Lord of The Rings 2"], 10000.51, "http://PearlJam.com","1967-12-22", True),
                    Row(103, "Eddie","Vedder",["Lord of The Rings 1", "Lord of The Rings 2"], 10000.51, "http://PearLJam.com","1967-06-22", True)]

In [0]:
person_rows_list.append(person_row)

In [0]:
print(person_rows_list)

In [0]:
person_row[2]

In [0]:
# Build a DataFrame from the Row list: only the column names are given,
# so Spark infers the column types from the Row values.
new_persons_df = spark.createDataFrame(person_rows_list, ["id","first_name", "last_name", "fav_movies", "salary", "image_url", "date_of_birth","active"])

In [0]:
# Append the new rows to the existing DataFrame.
# union() resolves columns by position, and the schema inferred for
# new_persons_df differs from persons_df (e.g. date_of_birth holds strings
# there, a DateType column here). Cast every new column to the matching
# persons_df type first so the combined DataFrame keeps the original schema.
add_persons_df = persons_df.union(
    new_persons_df.select(
        [col(field.name).cast(field.dataType) for field in persons_df.schema.fields]
    )
)
add_persons_df.sort(desc("id")).show(10)

#### 5. Adding, Renaming, Dropping Columns

In [0]:
from pyspark.sql.functions import round

In [0]:
#add new column "Salary increase"
aug_persons_df1 = persons_df.withColumn("salary_increase", expr("salary * 0.10 + salary"))
aug_persons_df1.show(10)

In [0]:
#check if this column is "really" in DataFrame
aug_persons_df1.columns

In [0]:
# Derive a second augmented DataFrame from aug_persons_df1:
#   - add "birth_year", extracted from date_of_birth
#   - rename "fav_movies" to "movies"
#   - add "salary_x10": salary_increase rounded to 2 decimals
#     (despite the name it is the salary plus 10%, not the salary times ten)
#   - drop the unrounded "salary_increase" column
# NOTE: round here is pyspark.sql.functions.round, which shadows the Python builtin.
aug_persons_df2 = (aug_persons_df1
                  .withColumn("birth_year", year("date_of_birth"))
                  .withColumnRenamed("fav_movies", "movies")
                  .withColumn("salary_x10", round(col("salary_increase"),2))
                  .drop("salary_increase"))

In [0]:
aug_persons_df2.show(10)

#### 6. Cleaning and working with missing \ bad data

In [0]:
bad_movies_list = [Row(None, None, None),
                   Row(None, None, 2022),
                   Row("John McTyre", "Another Great Movie", None),
                   Row(None, "Amazing Movie", 2021),
                   Row("May Field", None, 2011),
                   Row("Vlad Stopin", "We don't want war", 2022)
                  ]

In [0]:
bad_movies_list

In [0]:
bad_movies_columns = ["actor_name", "movie_title", "produced_year"]

In [0]:
bad_movies_df = spark.createDataFrame(bad_movies_list, schema=bad_movies_columns)

In [0]:

bad_movies_df.show()

In [0]:
#we see null values or rows with null values 
bad_movies_df.na.drop().show()

In [0]:
bad_movies_df.na.drop("any").show()

In [0]:
#drop just rows that are all empty
bad_movies_df.na.drop("all").show()

In [0]:
# Keep only the rows where actor_name is not null.
# isNotNull() states the intent directly instead of comparing isNull() with True.
bad_movies_df.filter(col("actor_name").isNotNull()).show()

In [0]:
# Keep only the rows where actor_name is null.
# isNull() already yields a boolean column, so no comparison with False is needed.
bad_movies_df.filter(col("actor_name").isNull()).show()

In [0]:
#describe the values
bad_movies_df.describe("produced_year").show()

#### 7. Create user-defined function 
A plain Python function cannot be applied to DataFrame columns directly; wrap it with `udf` from `pyspark.sql.functions` to turn it into a Spark SQL function first.

In [0]:
from pyspark.sql.functions import udf

In [0]:
students_list = [("John", 85),
                ("Mark", 90),
                ("Mike", 109),
                ("Marry", 79)
                ]

In [0]:
students_columns = ["name", "score"]

In [0]:
students_df = spark.createDataFrame(students_list, schema = students_columns)

In [0]:
students_df.show()

In [0]:
# create python function
def LetterGrade(score: int):
    """Map a numeric score to a letter grade.

    Args:
        score: Numeric score. ``None`` is accepted so that a null value
            passed in by a Spark UDF call does not raise.

    Returns:
        "Uau" for a score above 100, "A" for at least 90, "B" for at least 80,
        "C" for at least 70, "F" otherwise; ``None`` when ``score`` is ``None``.
    """
    # Comparing None with a number raises TypeError, so pass nulls through.
    if score is None:
        return None
    if score > 100:
        return "Uau"
    if score >= 90:
        return "A"
    if score >= 80:
        return "B"
    if score >= 70:
        return "C"
    return "F"

In [0]:
#before converting to spark function, lets test the function
print(LetterGrade(86))

In [0]:
# Wrap the Python function as a Spark UDF; the string return type is spelled out
# (it is also what udf() uses when no return type is given).
letterGradeUDF = udf(LetterGrade, returnType=StringType())

In [0]:
students_df.select("name", "score", letterGradeUDF("score").alias("GradeLetter")).show(10)

In [0]:
# Convert a custom-formatted date-time string to a timestamp and to Unix time
# (seconds since the epoch) with Spark's built-in functions; no UDF is needed.
from pyspark.sql import Row
from pyspark.sql.functions import unix_timestamp, to_timestamp


from pyspark.sql import SparkSession
# getOrCreate() hands back the already-running session when there is one.
spark = (SparkSession.builder.appName("TimeToUnix").getOrCreate())

# Pattern of the input string: underscores in place of the usual '-' and ':'.
dt_format = "yyyy_MM_dd HH_mm_ss"

# The single-row DataFrame is built with createDataFrame on the session:
# SparkSession has no parallelize() method, and the row class is Row.
(spark.createDataFrame([Row(dt='2022_03_08 11_44_12')])
 .withColumn("parsed", to_timestamp("dt", dt_format))
 .withColumn("unix_ts", unix_timestamp("dt", dt_format))
 .show(1, False))