In [0]:
import os
from getpass import getpass

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [0]:
# Create the SparkSession with the PostgreSQL JDBC driver jar registered.
# The jar location can be overridden with POSTGRES_JDBC_JAR; the default is
# the path this notebook has always used.
jdbc_jar = os.environ.get("POSTGRES_JDBC_JAR", "/usr/local/postgresql-42.2.5.jar")
spark = (
    SparkSession.builder
    .appName("Postgres connection")
    .config("spark.jars", jdbc_jar)
    .getOrCreate()
)

# Set connection parameters.
# URL and user fall back to the notebook's original values. The password is
# deliberately NOT stored in the notebook: it is taken from POSTGRES_PASSWORD,
# or prompted for interactively when that variable is unset or empty.
url = os.environ.get("POSTGRES_URL", "jdbc:postgresql://13.213.34.35:5432/postgres")
table = "film"
table2 = "inventory"
table3 = "category"
properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD") or getpass("Postgres password: "),
    "driver": "org.postgresql.Driver",
}

# Read tables into DataFrames.
df = spark.read.jdbc(url=url, table=table, properties=properties)    # table "film"
df2 = spark.read.jdbc(url=url, table=table2, properties=properties)  # table "inventory"

In [0]:
# Preview the first two rows of the film DataFrame.
df.show(n=2)


+-------+----------------+--------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+--------------------+--------------------+--------------------+
|film_id|           title|         description|release_year|language_id|original_language_id|rental_duration|rental_rate|length|replacement_cost|rating|         last_update|    special_features|            fulltext|
+-------+----------------+--------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+--------------------+--------------------+--------------------+
|      1|ACADEMY DINOSAUR|A Epic Drama of a...|        2006|          1|                null|              6|       0.99|    86|           20.99|    PG|2007-09-10 17:46:...|[Deleted Scenes, ...|'academi':1 'batt...|
|      2|  ACE GOLDFINGER|A Astounding Epis...|        2006|          1|                null|              3|       4.99|    48|        

In [0]:
# Preview the first five rows of the inventory DataFrame.
df2.show(n=5)

+------------+-------+--------+-------------------+
|inventory_id|film_id|store_id|        last_update|
+------------+-------+--------+-------------------+
|           1|      1|       1|2006-02-15 10:09:17|
|           2|      1|       1|2006-02-15 10:09:17|
|           3|      1|       1|2006-02-15 10:09:17|
|           4|      1|       1|2006-02-15 10:09:17|
|           5|      1|       2|2006-02-15 10:09:17|
+------------+-------+--------+-------------------+
only showing top 5 rows



In [0]:
# Left join: keep every row of film (df) and attach matching inventory (df2)
# rows on film_id.
from pyspark.sql.functions import col,struct,when
# film_id is taken from the LEFT side (df). In a left join the right-side
# columns are null for films that have no inventory row, so selecting
# df2.film_id would display a null id for exactly those films.
df.join(df2, df2.film_id == df.film_id, "left")\
    .select(df.film_id, df.title, df.release_year, df.language_id,
            df2.inventory_id, df2.store_id, df2.last_update)\
    .show(10)
    


+-------+----------------+------------+-----------+------------+--------+-------------------+
|film_id|           title|release_year|language_id|inventory_id|store_id|        last_update|
+-------+----------------+------------+-----------+------------+--------+-------------------+
|      1|ACADEMY DINOSAUR|        2006|          1|           1|       1|2006-02-15 10:09:17|
|      1|ACADEMY DINOSAUR|        2006|          1|           2|       1|2006-02-15 10:09:17|
|      1|ACADEMY DINOSAUR|        2006|          1|           3|       1|2006-02-15 10:09:17|
|      1|ACADEMY DINOSAUR|        2006|          1|           4|       1|2006-02-15 10:09:17|
|      1|ACADEMY DINOSAUR|        2006|          1|           5|       2|2006-02-15 10:09:17|
|      1|ACADEMY DINOSAUR|        2006|          1|           6|       2|2006-02-15 10:09:17|
|      1|ACADEMY DINOSAUR|        2006|          1|           7|       2|2006-02-15 10:09:17|
|      1|ACADEMY DINOSAUR|        2006|          1|         

In [0]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType


In [0]:
# Load the category table in this cell so it runs on a fresh kernel:
# df3 is otherwise first assigned in a later cell, which makes this line
# raise NameError under Restart & Run All.
df3 = spark.read.jdbc(url=url, table=table3, properties=properties)
df3.printSchema()

root
 |-- category_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- last_update: timestamp (nullable = true)



In [0]:
from pyspark.sql.functions import col, struct, when

# Load the table named by `table3` over JDBC and preview its first five rows.
df3 = spark.read.jdbc(url, table3, properties=properties)
df3.show(n=5)

+-----------+---------+-------------------+
|category_id|     name|        last_update|
+-----------+---------+-------------------+
|          1|   Action|2006-02-15 09:46:27|
|          2|Animation|2006-02-15 09:46:27|
|          3| Children|2006-02-15 09:46:27|
|          4| Classics|2006-02-15 09:46:27|
|          5|   Comedy|2006-02-15 09:46:27|
+-----------+---------+-------------------+
only showing top 5 rows



In [0]:
# Derive a label from category_id with a when()/otherwise() chain:
# id 1 -> "Dewasa", id 2 -> "Anak", any other id -> "Semua Umur".
category_id = df3["category_id"]
audience_label = (
    when(category_id == 1, "Dewasa")
    .when(category_id == 2, "Anak")
    .otherwise("Semua Umur")
)
category_initial = df3.withColumn("category_code", audience_label)
category_initial.show(n=5)

+-----------+---------+-------------------+-------------+
|category_id|     name|        last_update|category_code|
+-----------+---------+-------------------+-------------+
|          1|   Action|2006-02-15 09:46:27|       Dewasa|
|          2|Animation|2006-02-15 09:46:27|         Anak|
|          3| Children|2006-02-15 09:46:27|   Semua Umur|
|          4| Classics|2006-02-15 09:46:27|   Semua Umur|
|          5|   Comedy|2006-02-15 09:46:27|   Semua Umur|
+-----------+---------+-------------------+-------------+
only showing top 5 rows



In [0]:
# Map each category name to a two-letter code. The when() chain is built from
# the lookup table below in insertion order; names without an entry receive
# "unknown".
category_codes = {
    "Action": "AC",
    "Animation": "AN",
    "Children": "CH",
    "Classics": "CS",
    "Comedy": "CD",
    "Documentary": "DC",
    "Drama": "DR",
    "Family": "FM",
    "Foreign": "FR",
    "Games": "GM",
    "Horror": "HR",
}
code_column = None
for category_name, short_code in category_codes.items():
    matches = df3["name"] == category_name
    if code_column is None:
        # First entry starts the chain with the when() function.
        code_column = when(matches, short_code)
    else:
        # Later entries extend it with Column.when().
        code_column = code_column.when(matches, short_code)

# Overwrites df3 with the same frame plus the "category_code" column.
df3 = df3.withColumn("category_code", code_column.otherwise("unknown"))
df3.show(n=5)

+-----------+---------+-------------------+-------------+
|category_id|     name|        last_update|category_code|
+-----------+---------+-------------------+-------------+
|          1|   Action|2006-02-15 09:46:27|           AC|
|          2|Animation|2006-02-15 09:46:27|           AN|
|          3| Children|2006-02-15 09:46:27|           CH|
|          4| Classics|2006-02-15 09:46:27|           CS|
|          5|   Comedy|2006-02-15 09:46:27|           CD|
+-----------+---------+-------------------+-------------+
only showing top 5 rows

