# Step 1: Extracting Data from Multiple Sources

### Background & Setup
This notebook walks through extracting data from multiple sources with PySpark: a CSV file, a JSON file, a Parquet file, and a CSV of subscription updates that is filtered down to an incremental slice.

**Required Libraries:** `pyspark` (provides `SparkSession`) and Python's built-in `logging` module.

**Import the necessary libraries and start the Spark session**

In [1]:
# Import necessary libraries
import logging

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("VideoStreamingETL")
    .getOrCreate()
)

# Keep cell output readable: let Spark itself report only ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

# Apply the same ERROR threshold to the Python-side "py4j" logger.
logging.getLogger("py4j").setLevel(logging.ERROR)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/10 23:48:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


**Define dataset paths**

In [2]:
# Directory that holds every input dataset. Change this single value to point
# the notebook at a different copy of the data.
DATA_DIR = "video_streaming_data"

# One path per source; the file extension indicates the format each is read as.
viewing_history_path = f"{DATA_DIR}/viewing_history.csv"
users_path = f"{DATA_DIR}/users.json"
videos_catalog_path = f"{DATA_DIR}/videos_catalog.parquet"
subscription_updates_path = f"{DATA_DIR}/subscription_updates.csv"

## Task 1: Load CSV Data

In [4]:
# Read the viewing-history CSV. The first row supplies the column names, and
# Spark infers each column's type from the data rather than loading everything
# as strings (inference costs an extra pass over the file).
df = spark.read.csv(viewing_history_path, header=True, inferSchema=True)

In [5]:
# Print each column's name, inferred type, and nullability for the CSV DataFrame
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- video_id: integer (nullable = true)
 |-- watched_at: timestamp (nullable = true)
 |-- device_type: string (nullable = true)
 |-- account_status: string (nullable = true)



In [7]:
# Display the first 20 rows (show()'s default limit) with full, untruncated
# cell values.
df.show(n=20, truncate=False)

+-------+--------+-------------------+-----------+--------------+
|user_id|video_id|watched_at         |device_type|account_status|
+-------+--------+-------------------+-----------+--------------+
|21558  |10308   |2024-01-01 00:00:00|iPhoneX    |inactive      |
|190681 |6220    |2024-01-01 00:00:01|iPhoneX    |inactive      |
|190802 |3119    |2024-01-01 00:00:02|iPhoneX    |active        |
|40874  |14184   |2024-01-01 00:00:03|iPhoneX    |active        |
|92704  |13010   |2024-01-01 00:00:04|iPhoneX    |NULL          |
|134289 |4249    |2024-01-01 00:00:05|Android    |inactive      |
|82958  |14470   |2024-01-01 00:00:06|Android    |active        |
|25131  |3244    |2024-01-01 00:00:07|NULL       |NULL          |
|144215 |19549   |2024-01-01 00:00:08|NULL       |inactive      |
|146331 |6001    |2024-01-01 00:00:09|Windows    |active        |
|174960 |12087   |2024-01-01 00:00:10|Windows    |NULL          |
|143062 |6082    |2024-01-01 00:00:11|iPhoneX    |active        |
|173784 |1

## Task 2: Load JSON Data

In [13]:
# Read the users JSON file. multiLine lets a single JSON record span several
# lines instead of requiring one complete record per line.
df_json = spark.read.json(users_path, multiLine=True)

                                                                                

In [14]:
# Print each column's name, inferred type, and nullability for the JSON DataFrame
df_json.printSchema()

root
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- preferred_language: string (nullable = true)
 |-- subscription_date: string (nullable = true)
 |-- user_id: long (nullable = true)



In [15]:
# Display the first 20 rows; long string values are cut short (show()'s
# default truncation).
df_json.show(n=20, truncate=True)

+--------------------+-------------------+------------------+-----------------+-------+
|               email|               name|preferred_language|subscription_date|user_id|
+--------------------+-------------------+------------------+-----------------+-------+
|jenniferblack@exa...|        John Duncan|           Spanish|       2022-12-01| 180981|
|johnbarker@exampl...|     Tammy Thompson|           English|             NULL| 244816|
|esherman@example.com|        Andrew Vega|            French|       2023-06-15| 619605|
|melissarivera@exa...|       Melissa Sims|            French|             NULL| 342762|
|gcopeland@example...|    Yvonne Anderson|           Spanish|       2023-06-15| 236364|
|griffinmichelle@e...|   Gregory Williams|            French|       2023-06-15| 257955|
|wilsonsandra@exam...|     Kimberly White|           English|             NULL| 845355|
|robertmedina@exam...|  Justin Cunningham|           English|       2023-06-15| 899823|
|xschwartz@example...|       Jul

## Task 3: Load Parquet Data

In [16]:
# Load the video catalog from Parquet. No header or inference options are
# passed here, unlike the CSV reads.
df_parquet = spark.read.format("parquet").load(videos_catalog_path)

In [17]:
# Print each column's name, type, and nullability for the Parquet DataFrame
df_parquet.printSchema()

root
 |-- video_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- category_id: string (nullable = true)



In [18]:
# Display the first 20 rows of the video catalog (show()'s default limit and
# truncation).
df_parquet.show(n=20, truncate=True)

+--------+----------+-----------+
|video_id|     title|category_id|
+--------+----------+-----------+
|    1000|Video_1000|          2|
|    1001|Video_1001|          2|
|    1002|Video_1002|          3|
|    1003|Video_1003|          1|
|    1004|Video_1004|          1|
|    1005|Video_1005|          1|
|    1006|Video_1006|          2|
|    1007|Video_1007|          1|
|    1008|Video_1008|    unknown|
|    1009|Video_1009|          1|
|    1010|Video_1010|          1|
|    1011|Video_1011|          2|
|    1012|Video_1012|          1|
|    1013|Video_1013|    unknown|
|    1014|Video_1014|          3|
|    1015|Video_1015|          2|
|    1016|Video_1016|    unknown|
|    1017|Video_1017|          1|
|    1018|Video_1018|          3|
|    1019|Video_1019|          2|
+--------+----------+-----------+
only showing top 20 rows



## Task 4: Extract Incremental Updates

In [29]:
# Subscription changes on or after this date form the incremental slice.
INCREMENTAL_CUTOFF_DATE = "2024-02-07"

# Read the full subscription-updates CSV: the first row is the header and the
# column types are inferred. The unfiltered frame keeps its own name so it
# remains available after the filter below.
df_sub_raw = spark.read.options(inferSchema="true", header="true").csv(subscription_updates_path)
df_sub_raw.show()

# Keep only the incremental slice. The comparison is inclusive (>=), so rows
# stamped exactly at the start of the cutoff date are retained.
df_sub = df_sub_raw.filter(col("change_date") >= INCREMENTAL_CUTOFF_DATE)


+-------+-------------------+-------------------+-------------------+
|user_id|subscription_status|        change_date|cancellation_reason|
+-------+-------------------+-------------------+-------------------+
| 130320|             active|2024-02-01 00:00:00|            content|
| 159457|             active|2024-02-01 00:01:00|            content|
| 158158|             active|2024-02-01 00:02:00|              price|
|  61946|            renewed|2024-02-01 00:03:00|              price|
| 170368|            renewed|2024-02-01 00:04:00|            content|
|  66434|             active|2024-02-01 00:05:00|               NULL|
|  96212|          cancelled|2024-02-01 00:06:00|            content|
| 103617|          cancelled|2024-02-01 00:07:00|               NULL|
| 132397|            renewed|2024-02-01 00:08:00|            content|
|  32356|            renewed|2024-02-01 00:09:00|              other|
|  39990|             active|2024-02-01 00:10:00|              price|
| 113447|          c

In [30]:
# Display the first 20 rows of the filtered subscription updates (show()'s
# default limit and truncation).
df_sub.show(n=20, truncate=True)

+-------+-------------------+-------------------+-------------------+
|user_id|subscription_status|        change_date|cancellation_reason|
+-------+-------------------+-------------------+-------------------+
| 160619|          cancelled|2024-02-07 00:00:00|              price|
|  14529|            renewed|2024-02-07 00:01:00|              other|
| 105150|            renewed|2024-02-07 00:02:00|              other|
|  86291|             active|2024-02-07 00:03:00|              price|
| 162671|             active|2024-02-07 00:04:00|            content|
|  40590|             active|2024-02-07 00:05:00|               NULL|
|  96689|             active|2024-02-07 00:06:00|              price|
|  63004|            renewed|2024-02-07 00:07:00|               NULL|
|  16426|             active|2024-02-07 00:08:00|              price|
| 110566|             active|2024-02-07 00:09:00|              price|
| 148138|             active|2024-02-07 00:10:00|              price|
| 133143|           