In [4]:
from pyspark.sql.types import *
from pyspark.sql.window import Window

import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate() 

In [6]:
spark.read.text("hdfs://namenode:8020/user/datapath/datasets/customers/").show()

                                                                                

+--------------------+
|               value|
+--------------------+
|Caguas,XXXXXXXXX,...|
|Moreno Valley,XXX...|
|Metairie,XXXXXXXX...|
|Caguas,XXXXXXXXX,...|
|Clementon,XXXXXXX...|
|Caguas,XXXXXXXXX,...|
|Los Angeles,XXXXX...|
|Tampa,XXXXXXXXX,M...|
|Caguas,XXXXXXXXX,...|
|Spring Valley,XXX...|
|Chicago,XXXXXXXXX...|
|Canoga Park,XXXXX...|
|Caguas,XXXXXXXXX,...|
|Victorville,XXXXX...|
|Chicago,XXXXXXXXX...|
|Caguas,XXXXXXXXX,...|
|Salina,XXXXXXXXX,...|
|Caguas,XXXXXXXXX,...|
|Aurora,XXXXXXXXX,...|
|Caguas,XXXXXXXXX,...|
+--------------------+
only showing top 20 rows



In [7]:
CUSTOMERS_DATA =   'hdfs://namenode:8020/user/datapath/datasets/customers'
DEPARTMENTS_DATA = 'hdfs://namenode:8020/user/datapath/datasets/departments'
CATEGORIES_DATA =  'hdfs://namenode:8020/user/datapath/datasets/categories'
PRODUCTS_DATA =    'hdfs://namenode:8020/user/datapath/datasets/products'
ORDERS_DATA =      'hdfs://namenode:8020/user/datapath/datasets/orders'
ORDER_ITEMS_DATA = 'hdfs://namenode:8020/user/datapath/datasets/order_items'

In [8]:
# define the schema, corresponding to a line in the csv data file for Customer
customers_schema = StructType([
    StructField('customer_id',       IntegerType(), nullable=True),
    StructField('customer_fname',    StringType(), nullable=True),
    StructField('customer_lname',    StringType(), nullable=True),
    StructField('customer_email',    StringType(), nullable=True),
    StructField('customer_password', StringType(), nullable=True),
    StructField('customer_street',   StringType(), nullable=True),
    StructField('customer_city',     StringType(), nullable=True),
    StructField('customer_state',    StringType(), nullable=True),
    StructField('customer_zipcode',  StringType(), nullable=True)])

In [9]:
departments_schema = StructType([
    StructField('department_id',   IntegerType(), nullable=True),
    StructField('department_name', StringType(), nullable=True)])

In [10]:
categories_schema = StructType([
    StructField('category_id',            IntegerType(), nullable=True),
    StructField('category_department_id', IntegerType(), nullable=True),
    StructField('category_name',          StringType(), nullable=True)])

In [11]:
products_schema = StructType([
    StructField('product_id',          IntegerType(), nullable=True),
    StructField('product_category_id', IntegerType(), nullable=True),
    StructField('product_name',        StringType(), nullable=True),
    StructField('product_description', StringType(), nullable=True),
    StructField('product_price',       FloatType(), nullable=True),
    StructField('product_image',       StringType(), nullable=True)])

In [12]:
orders_schema = StructType([
    StructField('order_id',          IntegerType(), nullable=True),
    StructField('order_date',        StringType(), nullable=True),
    StructField('order_customer_id', IntegerType(), nullable=True),
    StructField('order_status',      StringType(), nullable=True)])

In [13]:
order_items_schema = StructType([
    StructField('order_item_id',            IntegerType(), nullable=True),
    StructField('order_item_order_id',      IntegerType(), nullable=True),
    StructField('order_item_product_id',    IntegerType(), nullable=True),
    StructField('order_item_quantity',      IntegerType(), nullable=True),
    StructField('order_item_subtotal',      FloatType(), nullable=True),
    StructField('order_item_product_price', FloatType(), nullable=True)])

### Cargamos los datos

In [17]:

customers_df = spark.read.csv(path=CUSTOMERS_DATA, schema=customers_schema)
customers_df.cache()

departments_df = spark.read.csv(path=DEPARTMENTS_DATA, schema=departments_schema)
departments_df.cache()

categories_df = spark.read.csv(path=CATEGORIES_DATA, schema=categories_schema)
categories_df.cache()

products_df = spark.read.csv(path=PRODUCTS_DATA, schema=products_schema)
products_df.cache()

orders_df = spark.read.csv(path=ORDERS_DATA, schema=orders_schema)
orders_df.cache()

order_items_df = spark.read.csv(path=ORDER_ITEMS_DATA, schema=order_items_schema)
order_items_df.cache()

DataFrame[order_item_id: int, order_item_order_id: int, order_item_product_id: int, order_item_quantity: int, order_item_subtotal: float, order_item_product_price: float]

In [18]:
customers_df.show()

[Stage 4:>                                                          (0 + 2) / 2]

+-----------+--------------+--------------+--------------+-----------------+---------------+-------------+--------------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|customer_street|customer_city|      customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+---------------+-------------+--------------------+----------------+
|       NULL|     XXXXXXXXX|         Donna|          9327|            Smith|      XXXXXXXXX|           PR|4114 Clear Nectar...|           00725|
|       NULL|     XXXXXXXXX|          Mary|          9328|            Perez|      XXXXXXXXX|           CA|  376 Golden Orchard|           92553|
|       NULL|     XXXXXXXXX|        Eugene|          9329|           Powell|      XXXXXXXXX|           LA|   2161 Burning Maze|           70003|
|       NULL|     XXXXXXXXX|          Mary|          9330|           Conley|      XXXXXXXXX|           PR| 3046 Broad Sky Dale|   

                                                                                

### Creamos vistas temporales para trabajar con spark.sql

In [19]:
customers_df.createOrReplaceTempView("customers")
customers_df.show(5)

+-----------+--------------+--------------+--------------+-----------------+---------------+-------------+--------------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|customer_street|customer_city|      customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+---------------+-------------+--------------------+----------------+
|       NULL|     XXXXXXXXX|         Donna|          9327|            Smith|      XXXXXXXXX|           PR|4114 Clear Nectar...|           00725|
|       NULL|     XXXXXXXXX|          Mary|          9328|            Perez|      XXXXXXXXX|           CA|  376 Golden Orchard|           92553|
|       NULL|     XXXXXXXXX|        Eugene|          9329|           Powell|      XXXXXXXXX|           LA|   2161 Burning Maze|           70003|
|       NULL|     XXXXXXXXX|          Mary|          9330|           Conley|      XXXXXXXXX|           PR| 3046 Broad Sky Dale|   

In [20]:
departments_df.createOrReplaceTempView("departments")
departments_df.show(5)

+-------------+---------------+
|department_id|department_name|
+-------------+---------------+
|            6|       Outdoors|
|            7|       Fan Shop|
|            2|        Fitness|
|            3|       Footwear|
|            4|        Apparel|
+-------------+---------------+
only showing top 5 rows



In [21]:
orders_df.createOrReplaceTempView("orders")
orders_df.show(5)

[Stage 9:>                                                          (0 + 2) / 2]

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|    2373|2013-11-09 00:00:...|            17222|       COMPLETE|
|   12091|2013-11-09 00:00:...|            17223|PENDING_PAYMENT|
|     871|2013-11-09 00:00:...|            17224|        PENDING|
|    6381|2013-11-09 00:00:...|            17225|PENDING_PAYMENT|
|    4456|2013-11-09 00:00:...|            17226|        PENDING|
+--------+--------------------+-----------------+---------------+
only showing top 5 rows



                                                                                

In [22]:
order_items_df.createOrReplaceTempView("order_items")
order_items_df.show(5)



+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|       129149|              51686|                  403|               NULL|                1.0|                  129.99|
|       129150|              51687|                  403|               NULL|                1.0|                  129.99|
|       129151|              51687|                  403|               NULL|                1.0|                  129.99|
|       129152|              51687|                 1014|               NULL|                5.0|                   249.9|
|       129153|              51687|                  191|               NULL|                4.0|                  399.96|
+-------------+-

                                                                                

In [23]:
products_df.createOrReplaceTempView("products")
products_df.show(5)

+----------+-------------------+------------+--------------------+-------------+-------------+
|product_id|product_category_id|product_name| product_description|product_price|product_image|
+----------+-------------------+------------+--------------------+-------------+-------------+
|        45|               NULL|        1009|http://images.acm...|         NULL|       599.99|
|        46|               NULL|        1010|http://images.acm...|         NULL|        19.98|
|        46|               NULL|        1011|http://images.acm...|         NULL|       499.99|
|        46|               NULL|        1012|http://images.acm...|         NULL|       299.99|
|        46|               NULL|        1013|http://images.acm...|         NULL|       349.99|
+----------+-------------------+------------+--------------------+-------------+-------------+
only showing top 5 rows



In [24]:
categories_df.createOrReplaceTempView("categories")
categories_df.show(5)

+-----------+----------------------+-------------------+
|category_id|category_department_id|      category_name|
+-----------+----------------------+-------------------+
|          2|                     1|           Football|
|          2|                     2|             Soccer|
|          2|                     3|Baseball & Softball|
|          2|                     4|         Basketball|
|          2|                     5|           Lacrosse|
+-----------+----------------------+-------------------+
only showing top 5 rows



In [26]:
spark.sql("select * from categories").show()

+-----------+----------------------+-------------------+
|category_id|category_department_id|      category_name|
+-----------+----------------------+-------------------+
|          2|                     1|           Football|
|          2|                     2|             Soccer|
|          2|                     3|Baseball & Softball|
|          2|                     4|         Basketball|
|          2|                     5|           Lacrosse|
|          2|                     6|   Tennis & Racquet|
|          2|                     7|             Hockey|
|          2|                     8|        More Sports|
|          3|                     9|   Cardio Equipment|
|          3|                    10|  Strength Training|
|          3|                    11|Fitness Accessories|
|          3|                    12|       Boxing & MMA|
|          3|                    13|        Electronics|
|          3|                    14|     Yoga & Pilates|
|          3|                  