In [1]:
# Put the parent directory on the import path so the `src` package imported
# in the next cell can be resolved when running from the notebook folder.
import sys
sys.path.insert(0, '..')

# Let findspark locate the Spark installation and expose pyspark to this kernel.
import findspark
findspark.init()

In [2]:
import json
import configparser
from os import environ, listdir, path, getcwd
from pathlib import Path

from pyspark import SparkConf
from pyspark import SparkFiles
from pyspark.sql import SparkSession

from src.commons import utils
from src.user_item_graph import etl, job, udf

In [3]:
# Debug aid: check that '..' and the Spark python directories are on sys.path.
# NOTE(review): the saved output lists absolute local paths (home directory,
# virtualenv) — consider clearing it before sharing the notebook.
print(sys.path)

['/opt/spark/python', '/opt/spark/python/lib/py4j-0.10.9-src.zip', '..', '/home/zdjohn/dev/spark-setup-workshop/notebook', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/zdjohn/dev/spark-setup-workshop/.tox/dev/lib/python3.8/site-packages', '/home/zdjohn/dev/spark-setup-workshop/.tox/dev/lib/python3.8/site-packages/IPython/extensions', '/home/zdjohn/.ipython']


In [4]:
# Environment for the Spark launch; set before the session is created below.
# The worker interpreter path is resolved relative to the parent of the
# current working directory.
project_root = Path(getcwd()).parent

environ.update({
    # Extra JVM packages: AWS SDK + hadoop-aws (the data below is read via s3a:// URIs).
    'PYSPARK_SUBMIT_ARGS': "--packages=com.amazonaws:aws-java-sdk:1.11.900,org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell",
    # NOTE(review): presumably read by utils.start_spark to pick a debug/local setup — confirm.
    'DEBUG': "1",
    # Python executable PySpark should use, taken from the project's tox "dev" env.
    'PYSPARK_PYTHON': f'{project_root}/.tox/dev/bin/python',
})

In [5]:
# Start Spark through the project helper; it returns a pair that is unpacked
# into the session used for all reads below and a logger.
session, logger = utils.start_spark()

In [6]:
# Domain pair for this run; every dataset name below is derived from it.
source_domain = 'video'
target_domain = 'music'

# Common bucket prefix for all input datasets.
bucket_uri = 's3a://pyspark3-sample'

# Review datasets: "<source><target>" for the source side, "<target><source>" for the target side.
source_reviews = f'{bucket_uri}/{source_domain}{target_domain}_reviews'
target_reviews = f'{bucket_uri}/{target_domain}{source_domain}_reviews'

# NOTE(review): this is the only name with an underscore between the two
# domains — confirm it matches the dataset name as stored in the bucket.
customers_indexed_ids = f'{bucket_uri}/{source_domain}_{target_domain}_customer_ids'

# Product-id index datasets, named with the same pattern as the review datasets.
source_product_indexed_ids = f'{bucket_uri}/{source_domain}{target_domain}_product_ids'
target_product_indexed_ids = f'{bucket_uri}/{target_domain}{source_domain}_product_ids'

In [7]:
def _read_parquet(uri):
    """Load one dataset at `uri` with the project's parquet reader and the active session."""
    return utils.extract_parquet_data(session, uri)


# Reviews for each domain.
video_reviews_df = _read_parquet(source_reviews)
music_reviews_df = _read_parquet(target_reviews)

# Index lookup tables for customers and for each domain's products.
customers_indexed_ids_df = _read_parquet(customers_indexed_ids)
video_product_indexed_ids_df = _read_parquet(source_product_indexed_ids)
music_product_indexed_ids_df = _read_parquet(target_product_indexed_ids)

In [8]:
# Inspect the product lookup table's columns; the saved output shows
# product_id_index (integer) and product_id (string).
video_product_indexed_ids_df.printSchema()

root
 |-- product_id_index: integer (nullable = true)
 |-- product_id: string (nullable = true)



In [9]:
# Combine the video reviews with the product index table via the project ETL
# step. Per the printed schema, the result has `customer_id` and `products`
# (an array of integers).
video_customer_products = etl.to_products_grouped_by_customer(
    video_reviews_df,
    video_product_indexed_ids_df,
)
video_customer_products.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: integer (containsNull = false)



In [10]:
# Plain-UDF variant: derive `products_edges` from each customer's `products`
# array. Per the printed schema, the result is an array of strings.
products_edges_col = udf.udf_combination('products').alias("products_edges")
video_edges_by_customer = video_customer_products.select('customer_id', products_edges_col)
video_edges_by_customer.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- products_edges: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [18]:
%%time
# Time the plain-UDF variant. show() is the first action on this DataFrame,
# so the measurement covers whatever Spark must compute to render 100 rows.
# NOTE(review): this cell's execution count (18) is out of sequence with its
# neighbours — re-run the notebook top to bottom before comparing timings.
video_edges_by_customer.show(100)

+-----------+--------------------+
|customer_id|      products_edges|
+-----------+--------------------+
|   10008274|[8520-12432, 8520...|
|   10010722|[16327-36992, 163...|
|   10013137|       [19298-16311]|
|   10014336|[23285-37745, 232...|
|   10014651|[27452-26481, 274...|
|   10019143|[33415-27619, 334...|
|   10024365|[28531-36241, 285...|
|   10031733|[34882-8335, 3488...|
|   10037241|[28531-11141, 285...|
|   10043494|       [38726-39566]|
|   10057510|[31666-8771, 3166...|
|   10058862|[4970-25613, 4970...|
|   10059990|[33416-28873, 334...|
|   10066276|[5329-17764, 5329...|
|   10068905|[15659-41110, 156...|
|   10069493|[8559-23372, 8559...|
|   10070500|[25445-16270, 254...|
|   10072546|[31880-7690, 3188...|
|   10079769|[17779-31282, 177...|
|    1008035|       [34379-39922]|
|   10083340|[35735-36168, 357...|
|   10084386|[12403-12666, 124...|
|   10087837|[18737-12956, 187...|
|   10090130|[25106-12453, 251...|
|   10096787|[16407-34523, 164...|
|   10098024|[5060-3

In [11]:
# Pandas-UDF variant of the same edge derivation, built from the same input
# DataFrame so the two implementations can be compared. The printed schema
# again shows `products_edges` as an array of strings.
products_edges_pandas_col = udf.pandas_udf_combination('products').alias("products_edges")
video_edges_by_customer_p = video_customer_products.select('customer_id', products_edges_pandas_col)
video_edges_by_customer_p.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- products_edges: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:
%%time
# Time the pandas-UDF variant with the same show(100) action used for the
# plain-UDF variant, so the two wall-clock numbers are comparable.
# NOTE(review): this cell's execution count (19) is out of sequence with its
# neighbours — re-run the notebook top to bottom before comparing timings.
video_edges_by_customer_p.show(100)

+-----------+--------------------+
|customer_id|      products_edges|
+-----------+--------------------+
|   10008274|[8520-12432, 8520...|
|   10010722|[16327-36992, 163...|
|   10013137|       [19298-16311]|
|   10014336|[23285-37745, 232...|
|   10014651|[27452-26481, 274...|
|   10019143|[33415-27619, 334...|
|   10024365|[28531-36241, 285...|
|   10031733|[34882-8335, 3488...|
|   10037241|[28531-11141, 285...|
|   10043494|       [38726-39566]|
|   10057510|[31666-8771, 3166...|
|   10058862|[4970-25613, 4970...|
|   10059990|[33416-28873, 334...|
|   10066276|[5329-17764, 5329...|
|   10068905|[15659-41110, 156...|
|   10069493|[8559-23372, 8559...|
|   10070500|[25445-16270, 254...|
|   10072546|[31880-7690, 3188...|
|   10079769|[17779-31282, 177...|
|    1008035|       [34379-39922]|
|   10083340|[35735-36168, 357...|
|   10084386|[12403-12666, 124...|
|   10087837|[18737-12956, 187...|
|   10090130|[25106-12453, 251...|
|   10096787|[16407-34523, 164...|
|   10098024|[5060-3