In [1]:
# Sample data: seaborn's built-in "tips" dataset.
import seaborn as sns
from pyspark.sql import SparkSession

data_tips = sns.load_dataset('tips')

# Get (or start) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning_Spark")
    .getOrCreate()
)

# Build the Spark DataFrame used by the following cells from the loaded data.
df = spark.createDataFrame(data_tips)

In [2]:
# List the column names of the Spark DataFrame.
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [3]:
# Print the schema as a tree: each column's name, type and nullability.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



## 컬럼 삭제 (drop)

In [12]:
# Drop columns by passing each name as its own string argument (not as a list).
df_dropped = df.drop('smoker', 'day')

# Filtering

조건식은 Column 표현식(예: `df.tip > 3`) 또는 SQL 문자열(예: `"tip > 3"`)로 전달할 수 있습니다. `where()`는 `filter()`의 별칭(alias)입니다.

In [16]:
# Comparing a column yields a Column expression, not a boolean result.
df.tip > 3

Column<b'(tip > 3)'>

In [17]:
cond = df.tip > 3 # the condition can be stored in a variable

In [18]:
# Show the top 3 rows whose tip is greater than 3.
df.filter(df.tip > 3).show(3)
# Equivalent form using the stored condition: df.filter(cond).show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [19]:
# Show the top 3 rows where `smoker` equals the string "No".
df.filter(df.smoker == "No").show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



참고: `col()` 함수를 사용하면 DataFrame 변수 없이 컬럼 이름만으로 컬럼을 참조할 수 있습니다.

In [4]:
from pyspark.sql.functions import col

In [15]:
# Same kind of filter, but the column is referenced by name through col().
df.filter(col('tip') > 5).show(3)

+----------+----+----+------+---+------+----+
|total_bill| tip| sex|smoker|day|  time|size|
+----------+----+----+------+---+------+----+
|     39.42|7.58|Male|    No|Sat|Dinner|   4|
|      30.4| 5.6|Male|    No|Sun|Dinner|   4|
|      32.4| 6.0|Male|    No|Sun|Dinner|   4|
+----------+----+----+------+---+------+----+
only showing top 3 rows



## Multiple Conditions

In [20]:
# OR: keep rows where at least one of `smoker` / `time` is non-null.
cond1 = (
    df.smoker.isNotNull()
    | df.time.isNotNull()
)

In [22]:
# Show the top 5 rows that satisfy cond1.
df.filter(cond1).show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [23]:
# AND: tip above 3 and time equal to 'Dinner'.
# Each comparison keeps its own parentheses because `&` binds tighter than `>` and `==`.
cond2 = (
    (df.tip > 3)
    & (df.time == 'Dinner')
)

In [24]:
# Show the top 5 rows that satisfy cond2.
df.filter(cond2).show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows

