In [1]:
# Import findspark and initialize. 
import findspark
findspark.init()

In [2]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/08 11:32:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/2/better_netflix_titles.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("better_netflix_titles.csv"), header=True, inferSchema=True)

# Show DataFrame
df.show()

+---+-------+-------+------+--------------------+----------+------------+------+---------+
| id|show_id|   type| title|             country|date_added|release_year|rating| duration|
+---+-------+-------+------+--------------------+----------+------------+------+---------+
|  0|     s1|TV Show|    3%|              Brazil| 14-Aug-20|        2020| TV-MA|4 Seasons|
|  1|     s2|  Movie|  7:19|              Mexico| 23-Dec-16|        2016| TV-MA|   93 min|
|  2|     s3|  Movie| 23:59|           Singapore| 20-Dec-18|        2011|     R|   78 min|
|  3|     s4|  Movie|     9|       United States| 16-Nov-17|        2009| PG-13|   80 min|
|  4|     s5|  Movie|    21|       United States|  1-Jan-20|        2008| PG-13|  123 min|
|  5|     s6|TV Show|    46|              Turkey|  1-Jul-17|        2016| TV-MA| 1 Season|
|  6|     s7|  Movie|   122|               Egypt|  1-Jun-20|        2019| TV-MA|   95 min|
|  7|     s8|  Movie|   187|       United States|  1-Nov-19|        1997|     R|  119 min|

In [4]:
# Create our temporary view
df.createOrReplaceTempView('movies')

In [5]:
# We can perform most any SQL action at this point
# here we are converting the date to a more workable date object
#NOTE: since we are not assigning this to a dataframe the change is not saved.
spark.sql("""SELECT show_id, 
   type, 
   title, 
   country, 
   TO_DATE(date_added, 'MMMM d, yyyy') 
   AS date_added, 
   release_year, 
   rating, 
   duration 
   FROM movies 
   WHERE date_added IS NOT null AND type='Movie'""").show(10)

+-------+-----+-----+-------------+----------+------------+------+--------+
|show_id| type|title|      country|date_added|release_year|rating|duration|
+-------+-----+-----+-------------+----------+------------+------+--------+
|     s2|Movie| 7:19|       Mexico|      null|        2016| TV-MA|  93 min|
|     s3|Movie|23:59|    Singapore|      null|        2011|     R|  78 min|
|     s4|Movie|    9|United States|      null|        2009| PG-13|  80 min|
|     s5|Movie|   21|United States|      null|        2008| PG-13| 123 min|
|     s7|Movie|  122|        Egypt|      null|        2019| TV-MA|  95 min|
|     s8|Movie|  187|United States|      null|        1997|     R| 119 min|
|     s9|Movie|  706|        India|      null|        2019| TV-14| 118 min|
|    s10|Movie| 1920|        India|      null|        2008| TV-MA| 143 min|
|    s11|Movie| 1922|United States|      null|        2017| TV-MA| 103 min|
|    s14|Movie|2,215|     Thailand|      null|        2018| TV-MA|  89 min|
+-------+---

In [6]:
# All of the SQL you learned in Unit 6 is available to you in Spark SQL
# Here we are listing out the counts by rating
# NOTE: it is almost NEVER a good idea to "order by" when using Spark with large datasets (more on this in 8.2)
spark.sql("""
  SELECT
    rating,
    count(*) AS number_of_ratings
  FROM movies
  GROUP BY rating
  ORDER BY 2 DESC
  """).show()

+--------+-----------------+
|  rating|number_of_ratings|
+--------+-----------------+
|   TV-MA|             2863|
|   TV-14|             1931|
|   TV-PG|              805|
|       R|              665|
|   PG-13|              386|
|    TV-Y|              280|
|   TV-Y7|              271|
|      PG|              247|
|    TV-G|              194|
|      NR|               84|
|       G|               39|
|    null|                9|
|TV-Y7-FV|                6|
|      UR|                5|
|   NC-17|                3|
+--------+-----------------+



In [7]:
# Let's output a file with just listing for children
# first we will use our spark sql to write to a dataframe

out_df= spark.sql("""
  SELECT 
  title,
  rating,
  date_added,
  duration
  FROM Movies
  WHERE rating IN ('G','PG', 'PG-13')""")

# Make sure we got what we wanted
out_df.show()

+--------------------+------+----------+--------+
|               title|rating|date_added|duration|
+--------------------+------+----------+--------+
|                   9| PG-13| 16-Nov-17|  80 min|
|                  21| PG-13|  1-Jan-20| 123 min|
|            Æon Flux| PG-13|  1-Feb-18|  93 min|
|         10,000 B.C.| PG-13|  1-Jun-19| 109 min|
|           16 Blocks| PG-13|  1-Nov-19| 102 min|
|            17 Again| PG-13|  1-Jan-21| 102 min|
|20 Feet From Stardom| PG-13| 22-Sep-18|  91 min|
|             28 Days| PG-13| 30-Sep-20| 104 min|
|      3 Days to Kill| PG-13|  1-Dec-20| 117 min|
|       3 Generations| PG-13| 28-Aug-17|  92 min|
|            3 Idiots| PG-13|  1-Aug-19| 164 min|
|        5 Flights Up| PG-13| 17-Mar-19|  92 min|
|      50 First Dates| PG-13|  1-Dec-20|  99 min|
|        A 2nd Chance|    PG|  1-Jul-17|  95 min|
|     A Boy Called Po|    PG| 15-Jan-18|  94 min|
|    A Bridge Too Far|    PG|  1-Jul-20| 176 min|
|A California Chri...| PG-13| 14-Dec-20| 107 min|


In [8]:
#  As Spark stores the data in partitions, it will also write data in partitions.
#  These partitions will always be stored in a folder with the same name as the file, and that folder may often contain many subfolders or files.
#  Within the partition folder, there will be a file or files that starts with `part-`, these are CSV files. 
# However, they are often not optimal for friendly reading, but can be downloaded to your computer.

out_df.write.csv('movies_out_spark.csv')

In [9]:
# The easiest work around of the part file output is to take the data to Pandas and write out a CSV.
# This forces the data to the master node and is not recommended unless you have filtered and/or aggregated your data to a reasonable size.

out_df.toPandas().to_csv('movies_out_pandas.csv')