In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("spark-dataframe").getOrCreate()

파일이나 다른 데이터 소스로 부터 스파크 데이터 프레임을 만드는 방법
* `spark.read.xxx(DataSource 경로)`

In [2]:
!pwd

/home/lab26/SparkCourse


In [3]:
directory = "/home/lab26/SparkCourse/data"
filename = "titanic_train.csv"

Pandas로 csv 데이터 불러오기

In [4]:
import pandas as pd

titanic_pdf = pd.read_csv(f"{directory}/{filename}", header='infer')
titanic_pdf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Spark로 csv 데이터 불러오기

In [5]:
# header=True : csv에 기록된 컬럼 정보 포함하기
# inferSchema=True : 데이터 타입 자동 유추
titanic_sdf = spark.read.csv(f"file:///{directory}/{filename}", header=True, inferSchema=True)
titanic_sdf

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

In [6]:
titanic_sdf.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

# Pandas DataFrame 과 Spark DataFrame의 주요차이

## Spark DataFrame은 SQL 연산과 비슷한 연산자를 제공
- `spark_dataframe.select('컬럼명')`
- `spark_dataframe.select('컬럼명').filter(...)`
    - `filter` : where절에 해당!
- `spark_dataframe.groupBy('컬럼명').count()`
- `spark_dataframe.withColumns('컬럼명', ...)`

## Spark Dataframe의 연산의 특징
- Spark DataFrame의 연산은 대부분 새로운 DataFrame 객체를 반환하는 형태로 구성
- 특히 DataFrame 객체에 직접 수정을 허용하지 않는다.
    - Spark DataFrame도 RDD의 Immutable 특징을 그대로 가져간다.
- **pandas의 경우**
    - `pandas_dataframe.drop('컬럼명', axis=1 ,inplace=True)` 호출하면 `pandas_dataframe` 객체 자체에서 `'컬럼명'`을 `drop` 시킨다.
- **Spark Dataframe의 경우**
    - `spark_dataframe_new = spark_dataframe.drop('컬럼명')`과 같이 `inplace`인자가 아예 없음
    

## Spark Dataframe은 `[ ]` 연산자 활용이 제한적이다.
- **Pandas의 경우**
    - 특정 컬럼값을 가져오거나, 새로운 컬럼을 만들기 위해서 사용
    - `pandas_dataframe['new_column'] = pandas_dataframe['column'] * 10`
- **Spark DataFrame의 경우**
    - `withColumns()` 메소드를 활용해야 한다.
        - `update` 효과가 있다.
    - `spark_dataframe.withColumns('new_column', col('column') * 10 )`
    - `withColumns()`, `filters()` 메소드에서 컬럼을 지정하기 위해서만 사용

In [7]:
print(titanic_pdf.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

In [8]:
titanic_sdf.head(10)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S'),
 Row(PassengerId=6, Survived=0, Pclass=3, Name='Moran, Mr. James', Sex='male',

In [9]:
# Not null 카운트 확인하기
from pyspark.sql.functions import count, isnan, when, col

titanic_sdf.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in titanic_sdf.columns]).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [10]:
dict_01 = {'Name': ['민석', '민호','성현','현주', '상기'],
           'Year': [2011, 2016, 2015, 2015, 2011],
           'Gender': ['Male', 'Male', 'Male', 'Female', 'Male']
          }

In [12]:
data_pdf = pd.DataFrame(dict_01)
data_sdf = spark.createDataFrame(data_pdf) # 판다스 데이터 프레임을 스파크 데이터 프레임으로 만들기

In [21]:
from pyspark.sql.functions import col 

# col() 함수를 이용하여 명시적으로 컬럼명을 지정할 수 있음.
data_sdf.select(col("Name"), col("Year")).show()

+----+----+
|Name|Year|
+----+----+
|민석|2011|
|민호|2016|
|성현|2015|
|현주|2015|
|상기|2011|
+----+----+



In [14]:
from pyspark.sql.functions import upper, lower, col

# select()에서 컬럼 데이터를 가공 후 생성 가능.
data_sdf.select("*", upper(col("Gender"))).show() # select *, upper(Gender) from data_sdf
data_sdf.select("*", upper(col("Gender")).alias("CAP_GENDER")).show() # select *, upper(Gender) as CAP_GENDER from data_sdf

+----+----+------+-------------+
|Name|Year|Gender|upper(Gender)|
+----+----+------+-------------+
|민석|2011|  Male|         MALE|
|민호|2016|  Male|         MALE|
|성현|2015|  Male|         MALE|
|현주|2015|Female|       FEMALE|
|상기|2011|  Male|         MALE|
+----+----+------+-------------+

+----+----+------+----------+
|Name|Year|Gender|CAP_GENDER|
+----+----+------+----------+
|민석|2011|  Male|      MALE|
|민호|2016|  Male|      MALE|
|성현|2015|  Male|      MALE|
|현주|2015|Female|    FEMALE|
|상기|2011|  Male|      MALE|
+----+----+------+----------+



## Spark DataFrame 의 filter 메소드


In [16]:
dict_01 = {'Name': ['민석', '민호','성현','현주', '상기'],
           'Year': [2011, 2016, 2015, 2015, 2011],
           'Gender': ['Male', 'Male', 'Male', 'Female', 'Male']
          }

# 딕셔너리를 판다스 데이터프레임으로 변환
data_pdf = pd.DataFrame(dict_01)

# 판다스 데이터프레임을 스파크 데이터프레임으로 바꾸기
data_sdf = spark.createDataFrame(data_pdf)

In [None]:
data_sdf.filter('Name'=='민호') # Error!!!

In [18]:
# SQL의 where절 처럼 사용하면된다.
data_sdf.filter("Name = '민호'").show()

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민호|2016|  Male|
+----+----+------+



In [19]:
data_sdf.filter(data_sdf['Name']=='민호').show()

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민호|2016|  Male|
+----+----+------+



In [25]:
data_sdf.filter(col('Name')=='민호').show() # 이 방법이 비교적 많이 사용됨.

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민호|2016|  Male|
+----+----+------+



   **복합조건 사용하기 (`and(&)` , `or(|)`)**

In [23]:
data_sdf.filter( (data_sdf["Gender"] == "Male") & (col("Year") > 2011)).show() #괄호를 사용해 조건들을 한번 더 묶어주는게 좋다

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민호|2016|  Male|
|성현|2015|  Male|
+----+----+------+



In [26]:
data_sdf.filter( (data_sdf["Gender"] == "Male") | (col("Year") > 2011)).show() # and와 or를 같이 쓰는경우 or 부분을 괄호로 묶어준다. and의 우선순위가 더 높기때문

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민석|2011|  Male|
|민호|2016|  Male|
|성현|2015|  Male|
|현주|2015|Female|
|상기|2011|  Male|
+----+----+------+



In [33]:
# 문자열 컬럼의 like 조건 수행
data_sdf.filter(col('Name').like("민%")).show()

data_sdf.filter("Name like '민%'").show()
data_sdf.filter("upper(Gender) like '%A%'").show()

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민석|2011|  Male|
|민호|2016|  Male|
+----+----+------+

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민석|2011|  Male|
|민호|2016|  Male|
+----+----+------+

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민석|2011|  Male|
|민호|2016|  Male|
|성현|2015|  Male|
|현주|2015|Female|
|상기|2011|  Male|
+----+----+------+



In [34]:
from pyspark.sql.functions import upper
data_sdf.filter(upper(data_sdf["Gender"]).like("%A%")).show()

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민석|2011|  Male|
|민호|2016|  Male|
|성현|2015|  Male|
|현주|2015|Female|
|상기|2011|  Male|
+----+----+------+



In [35]:
# 필터링 후에 특정 컬럼 select
# 무조건 필터링 후에 컬럼선택하는게 베스트임.
data_sdf.filter(upper(data_sdf["Gender"]).like("%A%")).select("Name" , "Year").show()

+----+----+
|Name|Year|
+----+----+
|민석|2011|
|민호|2016|
|성현|2015|
|현주|2015|
|상기|2011|
+----+----+



# order by 

In [37]:
# 판다스 데이터 프레임에서는 sort_values
titanic_pdf_sorted_01 = titanic_pdf.sort_values(by=["Name"], ascending =True)

titanic_pdf_sorted_01

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.5500,,S
746,747,0,3,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.2500,,S
279,280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.2500,,S
308,309,0,2,"Abelson, Mr. Samuel",male,30.0,1,0,P/PP 3381,24.0000,,C
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
286,287,1,3,"de Mulder, Mr. Theodore",male,30.0,0,0,345774,9.5000,,S
282,283,0,3,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5000,,S
361,362,0,2,"del Carlo, Mr. Sebastiano",male,29.0,1,0,SC/PARIS 2167,27.7208,,C
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5000,,S


In [39]:
# 기준을 여러개 두기
titanic_pdf_sorted_02 = titanic_pdf.sort_values(by=["Pclass","Name"], ascending =False)
titanic_pdf_sorted_02

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.50,0,2,A/5. 851,14.5000,,S
282,283,0,3,"de Pelsmaeker, Mr. Alfons",male,16.00,0,0,345778,9.5000,,S
286,287,1,3,"de Mulder, Mr. Theodore",male,30.00,0,0,345774,9.5000,,S
559,560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36.00,1,0,345572,17.4000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
460,461,1,1,"Anderson, Mr. Harry",male,48.00,0,0,19952,26.5500,E12,S
498,499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.00,1,2,113781,151.5500,C22 C26,S
297,298,0,1,"Allison, Miss. Helen Loraine",female,2.00,1,2,113781,151.5500,C22 C26,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.5500,C22 C26,S


In [40]:
# 정렬의 형식도 여러개 지정이 가능!
titanic_pdf_sorted_03 = titanic_pdf.sort_values(by=["Pclass","Name"], ascending =[True, False])
titanic_pdf_sorted_03

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
325,326,1,1,"Young, Miss. Marie Grice",female,36.0,0,0,PC 17760,135.6333,C32,C
555,556,0,1,"Wright, Mr. George",male,62.0,0,0,113807,26.5500,,S
55,56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5000,C52,S
351,352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35.0000,C128,S
155,156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
401,402,0,3,"Adams, Mr. John",male,26.0,0,0,341826,8.0500,,S
365,366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30.0,0,0,C 7076,7.2500,,S
279,280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.2500,,S
746,747,0,3,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.2500,,S


In [43]:
from pyspark.sql.functions import col

titanic_sdf.orderBy("Name", ascending=False).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+--------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|    Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+--------+-----+--------+
|        869|       0|     3|van Melkebeke, Mr...|  male|null|    0|    0|          345777|     9.5| null|       S|
|        154|       0|     3|van Billiard, Mr....|  male|40.5|    0|    2|        A/5. 851|    14.5| null|       S|
|        362|       0|     2|del Carlo, Mr. Se...|  male|29.0|    1|    0|   SC/PARIS 2167| 27.7208| null|       C|
|        283|       0|     3|de Pelsmaeker, Mr...|  male|16.0|    0|    0|          345778|     9.5| null|       S|
|        287|       1|     3|de Mulder, Mr. Th...|  male|30.0|    0|    0|          345774|     9.5| null|       S|
|        560|       1|     3|de Messemaeker, M...|female|36.0|    1|    

In [44]:
# 직접 컬럼을 선택해서 내림차순 정렬
# 직접 컬럼을 선택해서 내림 차순 정렬
titanic_sdf.orderBy(titanic_sdf["Name"], ascending=True).show()
titanic_sdf.orderBy(titanic_sdf.Name, ascending=True).show()
titanic_sdf.orderBy(col("Name"), ascending=True).show() # col을 활용하는 방식이 괜찮은 방식임.

+-----------+--------+------+--------------------+------+----+-----+-----+----------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|    Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------+-------+-----+--------+
|        147|       1|     3|"Andersson, Mr. A...|  male|27.0|    0|    0|    350043| 7.7958| null|       S|
|        519|       1|     2|"Angle, Mrs. Will...|female|36.0|    1|    0|    226875|   26.0| null|       S|
|        291|       1|     1|"Barber, Miss. El...|female|26.0|    0|    0|     19877|  78.85| null|       S|
|        625|       0|     3|"Bowen, Mr. David...|  male|21.0|    0|    0|     54636|   16.1| null|       S|
|        508|       1|     1|"Bradley, Mr. Geo...|  male|null|    0|    0|    111427|  26.55| null|       S|
|        346|       1|     2|"Brown, Miss. Ame...|female|24.0|    0|    0|    248733|   13.0|  F33|       S|
|        209|      

In [45]:
titanic_sdf.orderBy("Pclass", "Name", ascending=False).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|        869|       0|     3|van Melkebeke, Mr...|  male|null|    0|    0|          345777|    9.5| null|       S|
|        154|       0|     3|van Billiard, Mr....|  male|40.5|    0|    2|        A/5. 851|   14.5| null|       S|
|        283|       0|     3|de Pelsmaeker, Mr...|  male|16.0|    0|    0|          345778|    9.5| null|       S|
|        287|       1|     3|de Mulder, Mr. Th...|  male|30.0|    0|    0|          345774|    9.5| null|       S|
|        560|       1|     3|de Messemaeker, M...|female|36.0|    1|    0|          345572|   17.4| null|       S|
|        423|       0|     3|  Zimmerman, Mr. Leo|  male|29.0|    0|    0|      

In [47]:
# 데이터를 추출하고 (select) 정렬
titanic_sdf.select(col("Pclass"), col("Name")).orderBy(col("Pclass").asc(), col("Name").desc()).show()

# 정렬 먼저하고 추출
titanic_sdf.orderBy(col("Pclass").asc(), col("Name").desc()).select(col("Pclass"), col("Name")).show()

+------+--------------------+
|Pclass|                Name|
+------+--------------------+
|     1|Young, Miss. Mari...|
|     1|  Wright, Mr. George|
|     1|   Woolner, Mr. Hugh|
|     1|Williams-Lambert,...|
|     1|Williams, Mr. Cha...|
|     1|Widener, Mr. Harr...|
|     1|Wick, Mrs. George...|
|     1|Wick, Miss. Mary ...|
|     1|White, Mr. Richar...|
|     1|White, Mr. Perciv...|
|     1|     Weir, Col. John|
|     1|Warren, Mrs. Fran...|
|     1|    Ward, Miss. Anna|
|     1|Walker, Mr. Willi...|
|     1|Van der hoef, Mr....|
|     1|Uruchurtu, Don. M...|
|     1|Thorne, Mrs. Gert...|
|     1|Thayer, Mrs. John...|
|     1|Thayer, Mr. John ...|
|     1|Thayer, Mr. John ...|
+------+--------------------+
only showing top 20 rows

+------+--------------------+
|Pclass|                Name|
+------+--------------------+
|     1|Young, Miss. Mari...|
|     1|  Wright, Mr. George|
|     1|   Woolner, Mr. Hugh|
|     1|Williams-Lambert,...|
|     1|Williams, Mr. Cha...|
|     1|Widene

## Spark DataFrame의 aggregation 메소드 적용


In [48]:
from pyspark.sql.functions import max,sum,min # 반드시 컬럼지정이 필요하다. select절에서!!

titanic_sdf_max = titanic_sdf.select(max("Age"))
titanic_sdf_max.show()

+--------+
|max(Age)|
+--------+
|    80.0|
+--------+



## Spark DataFrame의 groupBy()

In [49]:
titanic_pdf_groupby = titanic_pdf.groupby(by="Pclass")

# agg 함수를 활용해서 각 컬럼에 대한 집계를 따로따로 수행
agg_format = {
    "Age": "max",
    "SibSp": "sum",
    "Fare": "mean"
}

titanic_pdf_groupby.agg(agg_format)

Unnamed: 0_level_0,Age,SibSp,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.0,90,84.154687
2,70.0,74,20.662183
3,74.0,302,13.67555


In [50]:
# values_counts : Series에 적용시 해당 시리즈 값 별로 건수를 구한다
titanic_pdf["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [51]:
titanic_sdf.groupBy("Pclass").count().show()

+------+-----+
|Pclass|count|
+------+-----+
|     1|  216|
|     3|  491|
|     2|  184|
+------+-----+



In [59]:
# Pclass 카운트 내림차순 정렬
titanic_sdf.groupBy("Pclass").count().orderBy("count" , ascending=False).show()

+------+-----+
|Pclass|count|
+------+-----+
|     3|  491|
|     1|  216|
|     2|  184|
+------+-----+



In [60]:
titanic_sdf.groupBy("Pclass").max().show()

+------+----------------+-------------+-----------+--------+----------+----------+---------+
|Pclass|max(PassengerId)|max(Survived)|max(Pclass)|max(Age)|max(SibSp)|max(Parch)|max(Fare)|
+------+----------------+-------------+-----------+--------+----------+----------+---------+
|     1|             890|            1|          1|    80.0|         3|         4| 512.3292|
|     3|             891|            1|          3|    74.0|         8|         6|    69.55|
|     2|             887|            1|          2|    70.0|         3|         3|     73.5|
+------+----------------+-------------+-----------+--------+----------+----------+---------+



In [64]:
# 쿼리로 할땐 원래는 groupBy에 대한 조건이니 having을 써야하는데 스파크에는 해빙이 없다.!!!
from pyspark.sql.functions import max,sum,min,avg
titanic_sdf.filter("Age > 70 ").groupBy("Pclass").agg(max("Age"), min("Age"),sum("Age"),avg("Age")).show() # 내가 한 버전
 
titanic_sdf.groupBy("Pclass").agg(
    max("Age").alias("max_age"),
    min("Age").alias("min_age"),
    sum("Age").alias("sum_age"),
    avg("Age").alias("avg_age")
).filter(col("max_age") > 70).show()  # 강사님 버전

+------+--------+--------+--------+--------+
|Pclass|max(Age)|min(Age)|sum(Age)|avg(Age)|
+------+--------+--------+--------+--------+
|     1|    80.0|    71.0|   222.0|    74.0|
|     3|    74.0|    70.5|   144.5|   72.25|
+------+--------+--------+--------+--------+

+------+-------+-------+-------+------------------+
|Pclass|max_age|min_age|sum_age|           avg_age|
+------+-------+-------+-------+------------------+
|     1|   80.0|   0.92|7111.42|38.233440860215055|
|     3|   74.0|   0.42|8924.92| 25.14061971830986|
+------+-------+-------+-------+------------------+



In [65]:
spark.stop()