In [2]:
df = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("./data/retail-data/by-day/2010-12-01.csv")

In [3]:
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [4]:
# 스파크 데이터 타입으로 변환

from pyspark.sql.functions import lit

df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

In [5]:
# 1. 불리언

from pyspark.sql.functions import col

df.where(col("InvoiceNo")!= 536365).select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [6]:
df.where("InvoiceNo = 536365").show(5, false)
df.where("InvoiceNo <> 536365").show(5, false)

NameError: name 'false' is not defined

In [None]:
# and는 차례대로 필터를 적용
# or는 반드시 동일 구문
from pyspark.sql.functions import instr

priceFilter = col('UnitPrice')> 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

In [None]:
# 2. 수치형 데이터

from pyspark.sql.functions import expr, pow

fabricatedQuantity = pow(col('Quantity')* col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias('realQuantity')).show(2)

In [None]:
df.describe().show()

In [None]:
olName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05

df.stat.approxQuantile("UnitPrice", quantileProbs, relError)

In [None]:
df.stat.crosstab("StockCode", "Quantity").show()

In [None]:
# 3. 문자열 

from pyspark.sql.functions import initcap

df.select(initcap(col("Description"))).show()

In [None]:
from pyspark.sql.functions import lower, upper

df.select(col("Description"),lower(col("Description"))).show(2)

In [None]:
from pyspark.sql.functions import reqexp_replace

regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(reqexp_replace("Description")).show(2)

In [None]:
# 날짜 timestamp

from pyspark.sql.functions import current_date,  current_timestamp

dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

In [None]:
from pyspark.sql.functions import date_add, date_sub

dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)

In [None]:
# null 다루기

# coalesce: 인수로 지정한 여러 컬럼 중 null이 아닌 첫번째 값 반환
from pyspark.sql.functions import coalesce

df.select(coalesce(col("Description"), col("CustomerId"))).show()

In [None]:
# 구조체
from pyspark.sql.functions import struct

complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [None]:
complexDF.select("complex.Description")

In [None]:
# 배열

from pyspark.sql.functions import split

df.select(split(col("Description"), " ")).show(2)

In [None]:
# JSON

jsonDF = spark.range(1).selectExpr("""
    '{"myJSONKey": {"myJSONValue" : [1,2,3]}}' as jsonString
""")

In [None]:
jsonDF

In [None]:
udfExampleDF = spark.raange(5).toDF("num")

def power3(double_value):
    return double_value ** 3
power3(2.0)

In [None]:
from pyspark.sql.functions import udf

power3udf = udf(power3)

udfExampleDF.selectExpr("power3(num)").show(2)

In [None]:
from pyspark.sql.functions import to_timestamp

cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

In [17]:
dateFormat = "yyyy-MM-dd"
spark.range(1).select(to_timestamp(lit("2021-12-28"), dateFormat)).show()

+------------------------------------+
|to_timestamp(2021-12-28, yyyy-MM-dd)|
+------------------------------------+
|                 2021-12-28 00:00:00|
+------------------------------------+

