http://localhost:8888/notebooks/Mastering-Big-Data-Analytics-with-PySpark/Section%203%20-%20Preparing%20Data%20using%20SparkSQL/3.3/hands-on-3.3.ipynb

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Obtain the session for this notebook: getOrCreate() hands back an already
# running SparkSession when there is one, otherwise it builds a new one
# registered under the given application name.
spark = (
    SparkSession.builder
    .appName("MyFirstCSVLoad")
    .getOrCreate()
)

In [2]:
# Root directory that holds the course data sets.
# The location can be overridden per machine through the DATASET_HOME
# environment variable; when it is not set, the original author's path is used,
# so existing setups keep working unchanged.
DATASET_HOME = os.environ.get(
    "DATASET_HOME",
    "/home/wengong/projects/bigdata/Mastering-Big-Data-Analytics-with-PySpark/data-sets",
)
# File name (inside the ml-latest-small folder) that the next load cell reads.
FILE_CSV = "ratings.csv"

In [3]:
# Load the ratings CSV with an explicit schema (no schema inference pass).
# The timestamp column is read as an integer and then turned into a proper
# timestamp column: from_unixtime() renders the epoch value as a datetime
# string and to_timestamp() parses that string back into a timestamp.
ratings = (
    spark.read
    .schema("userId INT, movieId INT, rating DOUBLE, timestamp INT")
    .csv(
        f"{DATASET_HOME}/ml-latest-small/{FILE_CSV}",
        sep=",",
        quote='"',
        header=True,
    )
    .withColumn(
        "timestamp",
        f.to_timestamp(f.from_unixtime(f.col("timestamp"))),
    )
)

In [4]:
# Sanity-check the load: first five rows without column truncation,
# followed by the resulting column types.
ratings.show(n=5, truncate=False)
ratings.printSchema()

+------+-------+------+-------------------+
|userId|movieId|rating|timestamp          |
+------+-------+------+-------------------+
|1     |1      |4.0   |2000-07-30 14:45:03|
|1     |3      |4.0   |2000-07-30 14:20:47|
|1     |6      |4.0   |2000-07-30 14:37:04|
|1     |47     |5.0   |2000-07-30 15:03:35|
|1     |50     |5.0   |2000-07-30 14:48:51|
+------+-------+------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [5]:
# Rebind FILE_CSV to the movies file; the load cell that follows reads this name.
# NOTE(review): because the name is reused, re-running the earlier ratings load
# after this cell would make it read movies.csv instead of ratings.csv.
FILE_CSV = "movies.csv"

In [6]:
# Load the movies CSV with an explicit three-column schema, then preview the
# first five rows (untruncated) and print the resulting column types.
movies = spark.read.csv(
    f"{DATASET_HOME}/ml-latest-small/{FILE_CSV}",
    schema="movieId INT, title STRING, genres STRING",
    sep=",",
    quote='"',
    header=True,
)
movies.show(n=5, truncate=False)
movies.printSchema()

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [7]:
# The same exact-match filter written two ways: first as a Column expression,
# then as a SQL predicate string. Both keep only rows whose genres value is
# exactly "Action".
movies.where(f.col("genres") == "Action").show(5, truncate=False)
movies.where("genres == 'Action'").show(5, truncate=False)

+-------+-----------------------------------------------------------+------+
|movieId|title                                                      |genres|
+-------+-----------------------------------------------------------+------+
|9      |Sudden Death (1995)                                        |Action|
|71     |Fair Game (1995)                                           |Action|
|204    |Under Siege 2: Dark Territory (1995)                       |Action|
|251    |Hunted, The (1995)                                         |Action|
|667    |Bloodsport 2 (a.k.a. Bloodsport II: The Next Kumite) (1996)|Action|
+-------+-----------------------------------------------------------+------+
only showing top 5 rows

+-------+-----------------------------------------------------------+------+
|movieId|title                                                      |genres|
+-------+-----------------------------------------------------------+------+
|9      |Sudden Death (1995)                       

#### split and explode

In [8]:
# Turn the pipe-delimited genres string into one row per (movie, genre).
movie_genre = (
    movies
    # split() takes a regular expression, where a bare "|" means alternation,
    # so the pipe must be escaped. A raw string keeps the backslash literal
    # without relying on Python tolerating the invalid "\|" escape sequence
    # (which triggers a DeprecationWarning / SyntaxWarning).
    .withColumn("genres_array", f.split("genres", r"\|"))
    # explode() emits one output row for each element of the array.
    .withColumn("genre", f.explode("genres_array"))
    # Keep only the identifying columns plus the single exploded genre.
    .select("movieId", "title", "genre")
)

In [9]:
# Preview the first ten exploded rows without truncating long titles.
movie_genre.show(n=10, truncate=False)

+-------+-----------------------+---------+
|movieId|title                  |genre    |
+-------+-----------------------+---------+
|1      |Toy Story (1995)       |Adventure|
|1      |Toy Story (1995)       |Animation|
|1      |Toy Story (1995)       |Children |
|1      |Toy Story (1995)       |Comedy   |
|1      |Toy Story (1995)       |Fantasy  |
|2      |Jumanji (1995)         |Adventure|
|2      |Jumanji (1995)         |Children |
|2      |Jumanji (1995)         |Fantasy  |
|3      |Grumpier Old Men (1995)|Comedy   |
|3      |Grumpier Old Men (1995)|Romance  |
+-------+-----------------------+---------+
only showing top 10 rows



In [10]:
# Collapse the exploded rows down to the set of unique genre labels.
available_genres = (
    movie_genre
    .select("genre")
    .distinct()
)

In [11]:
# Print the distinct genre labels computed in the previous cell.
available_genres.show()

+------------------+
|             genre|
+------------------+
|             Crime|
|           Romance|
|          Thriller|
|         Adventure|
|             Drama|
|               War|
|       Documentary|
|           Fantasy|
|           Mystery|
|           Musical|
|         Animation|
|         Film-Noir|
|(no genres listed)|
|              IMAX|
|            Horror|
|           Western|
|            Comedy|
|          Children|
|            Action|
|            Sci-Fi|
+------------------+



In [12]:
# Movies whose genres field is the dataset's placeholder text rather than a
# real genre list.
movies_without_genre = movies.where(
    f.col("genres") == "(no genres listed)"
)

In [15]:
# Report how many movies carry the placeholder genre, then list them with
# full (untruncated) titles.
no_genre_total = movies_without_genre.count()
print(no_genre_total)
movies_without_genre.show(truncate=False)

34
+-------+-------------------------------------------------------+------------------+
|movieId|title                                                  |genres            |
+-------+-------------------------------------------------------+------------------+
|114335 |La cravate (1957)                                      |(no genres listed)|
|122888 |Ben-hur (2016)                                         |(no genres listed)|
|122896 |Pirates of the Caribbean: Dead Men Tell No Tales (2017)|(no genres listed)|
|129250 |Superfast! (2015)                                      |(no genres listed)|
|132084 |Let It Be Me (1995)                                    |(no genres listed)|
|134861 |Trevor Noah: African American (2013)                   |(no genres listed)|
|141131 |Guardians (2016)                                       |(no genres listed)|
|141866 |Green Room (2015)                                      |(no genres listed)|
|142456 |The Brand New Testament (2015)                       