In [1]:
# Sample data: the "tips" example dataset, loaded through seaborn.
# NOTE(review): load_dataset presumably downloads from seaborn's online data
# repository when no local cache exists — confirm network access is available.
import seaborn as sns
data_tips = sns.load_dataset('tips')

## Spark configuration options

In [44]:
import pyspark

In [53]:
# Build a SparkConf and list the key/value pairs it holds.
# NOTE(review): this cell was executed as In [53], after the session cell
# (In [3]); the output below contains the app name and master set there.
# On a fresh top-to-bottom run this cell comes first, so the result may
# differ — confirm, or move this cell below the session setup.
spark_conf = pyspark.SparkConf()
spark_conf.getAll()

[('spark.app.name', 'Learning_Spark'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

## 실행 설정

In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named "Learning_Spark" with a local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning_Spark")
    .getOrCreate()
)

In [4]:
# Build a Spark DataFrame from the tips data loaded with seaborn.
# No schema is passed, so the column types are inferred from the data.
df = spark.createDataFrame(data_tips)

# 데이터프레임 정보

In [5]:
# Column names of the Spark DataFrame, returned as a Python list.
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [6]:
# Print every column's name, data type, and nullability as a tree.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [7]:
# Shape of the DataFrame as (row count, column count).
n_rows = df.count()
n_cols = len(df.columns)
(n_rows, n_cols)

(244, 7)

In [8]:
# Summary statistics (count, mean, stddev, min, max) for two numeric columns.
summary_cols = ['total_bill', 'size']
df.describe(*summary_cols).show()

+-------+------------------+------------------+
|summary|        total_bill|              size|
+-------+------------------+------------------+
|  count|               244|               244|
|   mean|19.785942622950813| 2.569672131147541|
| stddev| 8.902411954856856|0.9510998047322344|
|    min|              3.07|                 1|
|    max|             50.81|                 6|
+-------+------------------+------------------+



In [9]:
# Display the first 5 rows as a formatted text table.
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [64]:
df.take(3) # Returns a list of Row objects (a Row's fields can be read by name, much like a dict)

[Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=10.34, tip=1.66, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=21.01, tip=3.5, sex='Male', smoker='No', day='Sun', time='Dinner', size=3)]

In [10]:
# Keep three columns and show the first 5 rows without truncating cell values.
# select() also accepts the column names as a single list.
tip_day_size = df.select("tip", "day", "size")
tip_day_size.show(5, truncate=False)

+----+---+----+
|tip |day|size|
+----+---+----+
|1.01|Sun|2   |
|1.66|Sun|3   |
|3.5 |Sun|3   |
|3.31|Sun|2   |
|3.61|Sun|4   |
+----+---+----+
only showing top 5 rows



In [11]:
# Count the rows for each `sex` value, then list the largest group first.
sex_counts = df.groupBy('sex').count()
sex_counts.orderBy('count', ascending=False).show()

+------+-----+
|   sex|count|
+------+-----+
|  Male|  157|
|Female|   87|
+------+-----+



# Type Casting

In [12]:
# Schema before casting: `size` is currently reported as long.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [13]:
from pyspark.sql.types import IntegerType
# FloatType, DoubleType, ...

In [14]:
# Cast `size` to a 32-bit integer column.
# Caution: withColumn replaces a column only when the name matches an existing
# one; a misspelled name would add a new column instead.
size_as_int = df['size'].cast(IntegerType())
df2 = df.withColumn('size', size_as_int)

In [15]:
# Verify the cast: `size` should now be reported as integer instead of long.
df2.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)

