In [1]:
from pyspark.sql.types import *
from pyspark.sql.window import Window

import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() hands back the session
# that is already active, or builds a new one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/20 23:27:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Peek at the customers data as unparsed text lines (a single `value` column)
# to check the delimiter and field order before a schema is applied.
(
    spark.read
    .text("hdfs://namenode:8020/user/datapath/datasets/customers/")
    .show()
)

                                                                                

+--------------------+
|               value|
+--------------------+
|9327,Donna,Smith,...|
|9328,Mary,Perez,X...|
|9329,Eugene,Powel...|
|9330,Mary,Conley,...|
|9331,Donna,Smith,...|
|9332,Mary,Jordan,...|
|9333,Angela,Mills...|
|9334,Mary,Johnsto...|
|9335,Joseph,Smith...|
|9336,Janice,Guzma...|
|9337,Mary,Smith,X...|
|9338,James,Davis,...|
|9339,Ann,Moyer,XX...|
|9340,Mary,Smith,X...|
|9341,Karen,Collin...|
|9342,Teresa,Grant...|
|9343,Mary,Knapp,X...|
|9344,Kelly,Smith,...|
|9345,Mary,Branch,...|
|9346,Jack,Smith,X...|
+--------------------+
only showing top 20 rows



In [15]:
# Common HDFS location under which every dataset of this notebook lives.
# Change it here to point all six dataset paths at another cluster or folder.
HDFS_DATASETS_ROOT = 'hdfs://namenode:8020/user/datapath/datasets'

# One path per dataset, derived from the shared root so the URI is written once.
CUSTOMERS_DATA =   HDFS_DATASETS_ROOT + '/customers'
DEPARTMENTS_DATA = HDFS_DATASETS_ROOT + '/departments'
CATEGORIES_DATA =  HDFS_DATASETS_ROOT + '/categories'
PRODUCTS_DATA =    HDFS_DATASETS_ROOT + '/products'
ORDERS_DATA =      HDFS_DATASETS_ROOT + '/orders'
ORDER_ITEMS_DATA = HDFS_DATASETS_ROOT + '/order_items'

In [16]:
# Schema for one line of the customers CSV data: nine nullable columns, in the
# same order as the fields appear in the file.
customers_schema = (
    StructType()
    .add('customer_id', IntegerType(), nullable=True)
    .add('customer_fname', StringType(), nullable=True)
    .add('customer_lname', StringType(), nullable=True)
    .add('customer_email', StringType(), nullable=True)
    .add('customer_password', StringType(), nullable=True)
    .add('customer_street', StringType(), nullable=True)
    .add('customer_city', StringType(), nullable=True)
    .add('customer_state', StringType(), nullable=True)
    # Kept as a string so zip codes with leading zeros (e.g. 00725) survive.
    .add('customer_zipcode', StringType(), nullable=True)
)

In [17]:
# Schema for the departments CSV data: an integer id followed by the name.
departments_schema = (
    StructType()
    .add('department_id', IntegerType(), nullable=True)
    .add('department_name', StringType(), nullable=True)
)

In [18]:
# Schema for the categories CSV data: category id, the id of the department
# it is attached to, and the category name.
categories_schema = (
    StructType()
    .add('category_id', IntegerType(), nullable=True)
    .add('category_department_id', IntegerType(), nullable=True)
    .add('category_name', StringType(), nullable=True)
)

In [19]:
# Schema for the products CSV data, columns listed in file order.
products_schema = (
    StructType()
    .add('product_id', IntegerType(), nullable=True)
    .add('product_category_id', IntegerType(), nullable=True)
    .add('product_name', StringType(), nullable=True)
    .add('product_description', StringType(), nullable=True)
    # NOTE(review): FloatType is single precision; DoubleType or DecimalType
    # may suit prices better -- confirm downstream usage before changing.
    .add('product_price', FloatType(), nullable=True)
    .add('product_image', StringType(), nullable=True)
)

In [20]:
# Schema for the orders CSV data, columns listed in file order.
orders_schema = (
    StructType()
    .add('order_id', IntegerType(), nullable=True)
    # Read as a plain string; no date/timestamp parsing is applied at load time.
    .add('order_date', StringType(), nullable=True)
    .add('order_customer_id', IntegerType(), nullable=True)
    .add('order_status', StringType(), nullable=True)
)

In [21]:
# Schema for the order_items CSV data, columns listed in file order.
# NOTE(review): the two monetary columns use single-precision FloatType;
# DoubleType or DecimalType may be safer for aggregations -- confirm first.
order_items_schema = (
    StructType()
    .add('order_item_id', IntegerType(), nullable=True)
    .add('order_item_order_id', IntegerType(), nullable=True)
    .add('order_item_product_id', IntegerType(), nullable=True)
    .add('order_item_quantity', IntegerType(), nullable=True)
    .add('order_item_subtotal', FloatType(), nullable=True)
    .add('order_item_product_price', FloatType(), nullable=True)
)

### Cargamos los datos

In [22]:

# Read each CSV dataset with its explicit schema and mark the resulting
# DataFrame for caching. cache() returns the DataFrame it was called on, so
# the read and the cache request are chained into a single assignment.
customers_df = spark.read.csv(CUSTOMERS_DATA, schema=customers_schema).cache()
departments_df = spark.read.csv(DEPARTMENTS_DATA, schema=departments_schema).cache()
categories_df = spark.read.csv(CATEGORIES_DATA, schema=categories_schema).cache()
products_df = spark.read.csv(PRODUCTS_DATA, schema=products_schema).cache()
orders_df = spark.read.csv(ORDERS_DATA, schema=orders_schema).cache()
order_items_df = spark.read.csv(ORDER_ITEMS_DATA, schema=order_items_schema).cache()

# Last expression of the cell: echoes the order_items DataFrame summary.
order_items_df

DataFrame[order_item_id: int, order_item_order_id: int, order_item_product_id: int, order_item_quantity: int, order_item_subtotal: float, order_item_product_price: float]

In [23]:
# Sanity-check the parsed customers data: print the first 20 rows with long
# cell values truncated (the defaults of show(), spelled out).
# NOTE(review): the rendered output contains customer names and street
# addresses -- clear it before sharing if this is not synthetic data.
customers_df.show(n=20, truncate=True)

[Stage 1:>                                                          (0 + 2) / 2]

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+
|       9327|         Donna|         Smith|     XXXXXXXXX|        XXXXXXXXX|4114 Clear Nectar...|       Caguas|            PR|           00725|
|       9328|          Mary|         Perez|     XXXXXXXXX|        XXXXXXXXX|  376 Golden Orchard|Moreno Valley|            CA|           92553|
|       9329|        Eugene|        Powell|     XXXXXXXXX|        XXXXXXXXX|   2161 Burning Maze|     Metairie|            LA|           70003|
|       9330|          Mary|        Conley|     XXXXXXXXX|        XXXXXXXXX| 3046 Broad Sky Dale|       Caguas|            PR|          

                                                                                

### Creamos vistas temporales para trabajar con spark.sql

In [None]:
# Register the customers DataFrame as the temporary view "customers" so it can
# be queried through spark.sql, then preview its first five rows.
customers_df.createOrReplaceTempView("customers")
customers_df.show(n=5)

In [None]:
# Register the departments DataFrame as the temporary view "departments" so it
# can be queried through spark.sql, then preview its first five rows.
departments_df.createOrReplaceTempView("departments")
departments_df.show(n=5)

In [None]:
# Register the orders DataFrame as the temporary view "orders" so it can be
# queried through spark.sql, then preview its first five rows.
orders_df.createOrReplaceTempView("orders")
orders_df.show(n=5)

In [None]:
# Register the order_items DataFrame as the temporary view "order_items" so it
# can be queried through spark.sql, then preview its first five rows.
order_items_df.createOrReplaceTempView("order_items")
order_items_df.show(n=5)

In [None]:
# Register the products DataFrame as the temporary view "products" so it can
# be queried through spark.sql, then preview its first five rows.
products_df.createOrReplaceTempView("products")
products_df.show(n=5)

In [None]:
# Register the categories DataFrame as the temporary view "categories" so it
# can be queried through spark.sql, then preview its first five rows.
categories_df.createOrReplaceTempView("categories")
categories_df.show(n=5)