#### Ecommerce Data
This is a data set from UK-based and registered non-store online retail company. This data set contains all the transactions occurring between 01/12/2010 and 09/12/2011.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers."

In [0]:
%%capture
!pip install --upgrade pip
!pip install wget

In [0]:
import wget
from zipfile import ZipFile
from pyspark.sql.functions import col, instr

In [0]:
wget.download('https://archive.org/download/data.csv_202205/data.csv.zip','/tmp/data.csv.zip')


Out[15]: '/tmp/data.csv (1).zip'

In [0]:
with ZipFile('/tmp/data.csv.zip', 'r') as zip_ref:
    zip_ref.extractall('/tmp/')

In [0]:
rm /tmp/data.csv.zip

In [0]:
dbutils.fs.mv("file:/tmp/data.csv", "dbfs:/retail/data.csv")

Out[29]: True

In [0]:
df = spark.read.csv('dbfs:/retail',inferSchema=True,header=True)

In [0]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
display(df
        .filter(col('InvoiceNo') != 536365)
        .select(col('InvoiceNo'),col('Description'))
)

InvoiceNo,Description
536366,HAND WARMER UNION JACK
536366,HAND WARMER RED POLKA DOT
536367,ASSORTED COLOUR BIRD ORNAMENT
536367,POPPY'S PLAYHOUSE BEDROOM
536367,POPPY'S PLAYHOUSE KITCHEN
536367,FELTCRAFT PRINCESS CHARLOTTE DOLL
536367,IVORY KNITTED MUG COSY
536367,BOX OF 6 ASSORTED COLOUR TEASPOONS
536367,BOX OF VINTAGE JIGSAW BLOCKS
536367,BOX OF VINTAGE ALPHABET BLOCKS


In [0]:
df.createOrReplaceTempView('retail')

In [0]:
%sql
select * from retail limit 10

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom
536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850,United Kingdom
536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850,United Kingdom
536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850,United Kingdom
536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850,United Kingdom
536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047,United Kingdom


In [0]:
pricefilter = col('UnitPrice') > 600
descriptFilter = instr(col('Description'), "POSTAGE") >= 1 #Contem POSTAGE, mas não no inicio da Description

In [0]:
display(df
        .filter(
        (col('StockCode').isin('DOT')) &
        (pricefilter | descriptFilter)
        )
)

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
536544,DOT,DOTCOM POSTAGE,1,12/1/2010 14:32,569.77,,United Kingdom
536592,DOT,DOTCOM POSTAGE,1,12/1/2010 17:06,607.49,,United Kingdom
536862,DOT,DOTCOM POSTAGE,1,12/3/2010 11:13,254.43,,United Kingdom
536864,DOT,DOTCOM POSTAGE,1,12/3/2010 11:27,121.06,,United Kingdom
536865,DOT,DOTCOM POSTAGE,1,12/3/2010 11:28,498.47,,United Kingdom
536876,DOT,DOTCOM POSTAGE,1,12/3/2010 11:36,887.52,,United Kingdom
537237,DOT,DOTCOM POSTAGE,1,12/6/2010 9:58,863.74,,United Kingdom
537240,DOT,DOTCOM POSTAGE,1,12/6/2010 10:08,940.87,,United Kingdom
537434,DOT,DOTCOM POSTAGE,1,12/6/2010 16:57,950.99,,United Kingdom
537638,DOT,DOTCOM POSTAGE,1,12/7/2010 15:28,836.14,,United Kingdom


In [0]:
df = df.withColumn('isExpensive', col('UnitPrice')>600)

In [0]:
display(df
        .filter(
        (col('StockCode') == 'DOT') &
        (descriptFilter) &
        (col('isExpensive') == True)
        )
)

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,isExpensive
536592,DOT,DOTCOM POSTAGE,1,12/1/2010 17:06,607.49,,United Kingdom,True
536876,DOT,DOTCOM POSTAGE,1,12/3/2010 11:36,887.52,,United Kingdom,True
537237,DOT,DOTCOM POSTAGE,1,12/6/2010 9:58,863.74,,United Kingdom,True
537240,DOT,DOTCOM POSTAGE,1,12/6/2010 10:08,940.87,,United Kingdom,True
537434,DOT,DOTCOM POSTAGE,1,12/6/2010 16:57,950.99,,United Kingdom,True
537638,DOT,DOTCOM POSTAGE,1,12/7/2010 15:28,836.14,,United Kingdom,True
537645,DOT,DOTCOM POSTAGE,1,12/7/2010 15:34,607.96,,United Kingdom,True
537666,DOT,DOTCOM POSTAGE,1,12/7/2010 18:36,701.95,,United Kingdom,True
537823,DOT,DOTCOM POSTAGE,1,12/8/2010 14:25,729.42,,United Kingdom,True
538071,DOT,DOTCOM POSTAGE,1,12/9/2010 14:09,885.94,,United Kingdom,True
