In [49]:
from pyspark.sql import SparkSession

import getpass

# The current OS user name decides where the warehouse directory lives.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

# Bare expression so the notebook renders the session object.
spark

In [6]:
# Peek at the first lines of the raw order_items file before loading it with Spark.
# NOTE(review): the trailing "cat: Unable to write to output stream." message presumably
# comes from `head` closing the pipe once it has its lines — confirm it is harmless.
!hdfs dfs -cat /public/trendytech/retail_db/order_items/part-00000 |head

1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99
6,4,365,5,299.95,59.99
7,4,502,3,150.0,50.0
8,4,1014,4,199.92,49.98
9,5,957,1,299.98,299.98
10,5,365,5,299.95,59.99
cat: Unable to write to output stream.


In [7]:
# Load the order_items CSV. The file has no header row, so the columns come
# back with positional names (_c0 .. _c5); inferSchema picks their types.
raw_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .load("/public/trendytech/retail_db/order_items/part-00000")
)

# Preview the first rows.
raw_df.show()

+---+---+----+---+------+------+
|_c0|_c1| _c2|_c3|   _c4|   _c5|
+---+---+----+---+------+------+
|  1|  1| 957|  1|299.98|299.98|
|  2|  2|1073|  1|199.99|199.99|
|  3|  2| 502|  5| 250.0|  50.0|
|  4|  2| 403|  1|129.99|129.99|
|  5|  4| 897|  2| 49.98| 24.99|
|  6|  4| 365|  5|299.95| 59.99|
|  7|  4| 502|  3| 150.0|  50.0|
|  8|  4|1014|  4|199.92| 49.98|
|  9|  5| 957|  1|299.98|299.98|
| 10|  5| 365|  5|299.95| 59.99|
| 11|  5|1014|  2| 99.96| 49.98|
| 12|  5| 957|  1|299.98|299.98|
| 13|  5| 403|  1|129.99|129.99|
| 14|  7|1073|  1|199.99|199.99|
| 15|  7| 957|  1|299.98|299.98|
| 16|  7| 926|  5| 79.95| 15.99|
| 17|  8| 365|  3|179.97| 59.99|
| 18|  8| 365|  5|299.95| 59.99|
| 19|  8|1014|  4|199.92| 49.98|
| 20|  8| 502|  1|  50.0|  50.0|
+---+---+----+---+------+------+
only showing top 20 rows



In [8]:
# Check the column types that schema inference assigned to the positional columns.
raw_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)



In [9]:
# Give the positional columns meaningful names; toDF assigns them in order.
refined_df = raw_df.toDF(
    "order_item_id",
    "order_id",
    "product_id",
    "quantity",
    "subtotal",
    "product_price",
)

In [10]:
# Preview the order items now that the columns carry their new names.
refined_df.show()

+-------------+--------+----------+--------+--------+-------------+
|order_item_id|order_id|product_id|quantity|subtotal|product_price|
+-------------+--------+----------+--------+--------+-------------+
|            1|       1|       957|       1|  299.98|       299.98|
|            2|       2|      1073|       1|  199.99|       199.99|
|            3|       2|       502|       5|   250.0|         50.0|
|            4|       2|       403|       1|  129.99|       129.99|
|            5|       4|       897|       2|   49.98|        24.99|
|            6|       4|       365|       5|  299.95|        59.99|
|            7|       4|       502|       3|   150.0|         50.0|
|            8|       4|      1014|       4|  199.92|        49.98|
|            9|       5|       957|       1|  299.98|       299.98|
|           10|       5|       365|       5|  299.95|        59.99|
|           11|       5|      1014|       2|   99.96|        49.98|
|           12|       5|       957|       1|  29

In [12]:
# Remove the stored subtotal column; the following cells derive it again
# from product_price and quantity.
# NOTE: the name `df1` is rebound to the products DataFrame later in the notebook.
df1 = refined_df.drop("subtotal")
df1.show()

+-------------+--------+----------+--------+-------------+
|order_item_id|order_id|product_id|quantity|product_price|
+-------------+--------+----------+--------+-------------+
|            1|       1|       957|       1|       299.98|
|            2|       2|      1073|       1|       199.99|
|            3|       2|       502|       5|         50.0|
|            4|       2|       403|       1|       129.99|
|            5|       4|       897|       2|        24.99|
|            6|       4|       365|       5|        59.99|
|            7|       4|       502|       3|         50.0|
|            8|       4|      1014|       4|        49.98|
|            9|       5|       957|       1|       299.98|
|           10|       5|       365|       5|        59.99|
|           11|       5|      1014|       2|        49.98|
|           12|       5|       957|       1|       299.98|
|           13|       5|       403|       1|       129.99|
|           14|       7|      1073|       1|       199.9

In [23]:
# Import only the helper this notebook uses. A wildcard import of
# pyspark.sql.functions would shadow Python built-ins such as sum, min,
# max and round in the notebook namespace.
from pyspark.sql.functions import expr

# Keep every existing column and add subtotal, computed as price * quantity
# by a SQL expression.
df1.select("*", expr("product_price * quantity as subtotal")).show()

+-------------+--------+----------+--------+-------------+--------+
|order_item_id|order_id|product_id|quantity|product_price|subtotal|
+-------------+--------+----------+--------+-------------+--------+
|            1|       1|       957|       1|       299.98|  299.98|
|            2|       2|      1073|       1|       199.99|  199.99|
|            3|       2|       502|       5|         50.0|   250.0|
|            4|       2|       403|       1|       129.99|  129.99|
|            5|       4|       897|       2|        24.99|   49.98|
|            6|       4|       365|       5|        59.99|  299.95|
|            7|       4|       502|       3|         50.0|   150.0|
|            8|       4|      1014|       4|        49.98|  199.92|
|            9|       5|       957|       1|       299.98|  299.98|
|           10|       5|       365|       5|        59.99|  299.95|
|           11|       5|      1014|       2|        49.98|   99.96|
|           12|       5|       957|       1|    

In [24]:
# selectExpr accepts SQL expression strings directly: keep all columns and
# add subtotal = product_price * quantity.
(
    df1
    .selectExpr("*", "product_price * quantity as subtotal")
    .show()
)

+-------------+--------+----------+--------+-------------+--------+
|order_item_id|order_id|product_id|quantity|product_price|subtotal|
+-------------+--------+----------+--------+-------------+--------+
|            1|       1|       957|       1|       299.98|  299.98|
|            2|       2|      1073|       1|       199.99|  199.99|
|            3|       2|       502|       5|         50.0|   250.0|
|            4|       2|       403|       1|       129.99|  129.99|
|            5|       4|       897|       2|        24.99|   49.98|
|            6|       4|       365|       5|        59.99|  299.95|
|            7|       4|       502|       3|         50.0|   150.0|
|            8|       4|      1014|       4|        49.98|  199.92|
|            9|       5|       957|       1|       299.98|  299.98|
|           10|       5|       365|       5|        59.99|  299.95|
|           11|       5|      1014|       2|        49.98|   99.96|
|           12|       5|       957|       1|    

In [25]:
# Load the products CSV with inferred column types; the columns come back
# with positional names (_c0 .. _c5).
products_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .load("/public/trendytech/retail_db/products/part-00000")
)

# Preview the first rows.
products_df.show()

+---+---+--------------------+----+------+--------------------+
|_c0|_c1|                 _c2| _c3|   _c4|                 _c5|
+---+---+--------------------+----+------+--------------------+
|  1|  2|Quest Q64 10 FT. ...|null| 59.98|http://images.acm...|
|  2|  2|Under Armour Men'...|null|129.99|http://images.acm...|
|  3|  2|Under Armour Men'...|null| 89.99|http://images.acm...|
|  4|  2|Under Armour Men'...|null| 89.99|http://images.acm...|
|  5|  2|Riddell Youth Rev...|null|199.99|http://images.acm...|
|  6|  2|Jordan Men's VI R...|null|134.99|http://images.acm...|
|  7|  2|Schutt Youth Recr...|null| 99.99|http://images.acm...|
|  8|  2|Nike Men's Vapor ...|null|129.99|http://images.acm...|
|  9|  2|Nike Adult Vapor ...|null|  50.0|http://images.acm...|
| 10|  2|Under Armour Men'...|null|129.99|http://images.acm...|
| 11|  2|Fitness Gear 300 ...|null|209.99|http://images.acm...|
| 12|  2|Under Armour Men'...|null|139.99|http://images.acm...|
| 13|  2|Under Armour Men'...|null| 89.9

In [26]:
# Name the product columns; toDF assigns the names in order.
# NOTE: this rebinds `df1`, which held the order-items DataFrame until now.
df1 = products_df.toDF(
    "product_id",
    "product_category_id",
    "product_name",
    "product_description",
    "product_price",
    "product_image",
)

In [27]:
# Preview the products DataFrame with its named columns.
df1.show()

+----------+-------------------+--------------------+-------------------+-------------+--------------------+
|product_id|product_category_id|        product_name|product_description|product_price|       product_image|
+----------+-------------------+--------------------+-------------------+-------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|               null|        59.98|http://images.acm...|
|         2|                  2|Under Armour Men'...|               null|       129.99|http://images.acm...|
|         3|                  2|Under Armour Men'...|               null|        89.99|http://images.acm...|
|         4|                  2|Under Armour Men'...|               null|        89.99|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|               null|       199.99|http://images.acm...|
|         6|                  2|Jordan Men's VI R...|               null|       134.99|http://images.acm...|
|         7|       

In [28]:
# Check the inferred column types of the products DataFrame.
df1.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_category_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_description: string (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_image: string (nullable = true)



In [31]:
# Keep every column and add new_product_price: the original price plus 20%.
df2 = df1.selectExpr(
    "*",
    "product_price+(0.2*product_price) as new_product_price",
)
df2.show(5)

+----------+-------------------+--------------------+-------------------+-------------+--------------------+-----------------+
|product_id|product_category_id|        product_name|product_description|product_price|       product_image|new_product_price|
+----------+-------------------+--------------------+-------------------+-------------+--------------------+-----------------+
|         1|                  2|Quest Q64 10 FT. ...|               null|        59.98|http://images.acm...|           71.976|
|         2|                  2|Under Armour Men'...|               null|       129.99|http://images.acm...|          155.988|
|         3|                  2|Under Armour Men'...|               null|        89.99|http://images.acm...|          107.988|
|         4|                  2|Under Armour Men'...|               null|        89.99|http://images.acm...|          107.988|
|         5|                  2|Riddell Youth Rev...|               null|       199.99|http://images.acm...|   

In [34]:
# Passing an existing column name to withColumn replaces that column, so
# product_price itself now holds the price scaled by 1.2.
# The column is a double, so results can carry floating-point artefacts
# (e.g. 107.98799999999999).
df2 = df1.withColumn(
    "product_price",
    expr("product_price*1.2"),
)
df2.show(5)

+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|product_id|product_category_id|        product_name|product_description|     product_price|       product_image|
+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|               null|            71.976|http://images.acm...|
|         2|                  2|Under Armour Men'...|               null|           155.988|http://images.acm...|
|         3|                  2|Under Armour Men'...|               null|107.98799999999999|http://images.acm...|
|         4|                  2|Under Armour Men'...|               null|107.98799999999999|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|               null|           239.988|http://images.acm...|
+----------+-------------------+--------------------+-------------------+---------------

In [43]:
# Conditional repricing by brand, matched case-insensitively on product_name
# (the name is upper-cased before the LIKE test):
#   - names containing NIKE   -> price * 1.2
#   - names containing ARMOUR -> price * 1.1
#   - everything else         -> price unchanged
# The first matching WHEN wins, so a name containing both gets the NIKE rate.
df3 = df1.withColumn("product_price", expr("""
    CASE 
        WHEN UPPER(product_name) LIKE '%NIKE%' THEN product_price * 1.2
        WHEN UPPER(product_name) LIKE '%ARMOUR%' THEN product_price * 1.1
        ELSE product_price 
    END
"""))

In [45]:
# Show the first five rows to check the brand-based repricing.
df3.show(5)

+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|product_id|product_category_id|        product_name|product_description|     product_price|       product_image|
+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|               null|             59.98|http://images.acm...|
|         2|                  2|Under Armour Men'...|               null|142.98900000000003|http://images.acm...|
|         3|                  2|Under Armour Men'...|               null|            98.989|http://images.acm...|
|         4|                  2|Under Armour Men'...|               null|            98.989|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|               null|            199.99|http://images.acm...|
+----------+-------------------+--------------------+-------------------+---------------

In [50]:
# Sample (id, name, age) rows with deliberate repeats, used by the
# de-duplication examples that follow.
my_list = [
    (1, "Kapil", 34),
    (1, "Kapil", 34),   # exact copy of the row above
    (1, "Satish", 26),  # same id as Kapil, different name and age
    (2, "Satish", 26),  # same name and age as the row above, different id
]

In [51]:
# Turn the sample tuples into a DataFrame, then name its three columns.
df = (
    spark.createDataFrame(my_list)
    .toDF("id", "name", "age")
)
df.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  1| Kapil| 34|
|  1|Satish| 26|
|  2|Satish| 26|
+---+------+---+



In [55]:
# Remove rows that are identical across every column.
# The result gets its own name instead of reusing `df1`: `df1` holds the
# products DataFrame that earlier cells read, and overwriting it here would
# break those cells if they were re-run.
people_distinct_df = df.distinct()
people_distinct_df.show()

# distinct() on a single-column projection gives the unique ids; as the
# cell's last expression, the result is rendered by the notebook.
df.select("id").distinct()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  2|Satish| 26|
|  1|Satish| 26|
+---+------+---+



id
1
2


In [57]:
# De-duplicate on a subset of columns: keep a single row for each
# (name, age) combination, regardless of id.
df2 = df.dropDuplicates(["name", "age"])
df2.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  1|Satish| 26|
+---+------+---+



In [58]:
# De-duplicate on id alone: keep a single row for each id value.
df3 = df.dropDuplicates(["id"])
df3.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  2|Satish| 26|
+---+------+---+

