In [1]:
import pandas
import numpy 

import seaborn

# Load the seaborn example dataset named "planets" into a DataFrame.
planets = seaborn.load_dataset("planets")
# Show the (rows, columns) shape of the loaded frame.
planets.shape

(1035, 6)

In [2]:
# Preview the first rows of the planets frame to see its columns and value types.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [3]:
# Fixed seed so the five random values below are the same on every run.
randomer = numpy.random.RandomState(seed=42)
series = pandas.Series(data=randomer.rand(5))
series

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [4]:
# Simple aggregation: reduce the whole Series to the sum of its values.
series.sum()

2.811925491708157

In [5]:
# Simple aggregation: reduce the whole Series to its arithmetic mean.
series.mean()

0.5623850983416314

In [6]:
# Columns "A" then "B", five draws each, taken from the generator created in an
# earlier cell (so the values depend on how many draws were made before this one).
dataFrame = pandas.DataFrame(
    {column: randomer.rand(5) for column in ("A", "B")}
)
dataFrame

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [8]:
# Column-wise mean: axis 0 reduces over the rows, giving one value per column.
dataFrame.mean(axis = 0)

A    0.477888
B    0.443420
dtype: float64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
planets.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [11]:
"""GroupBy: Split, Apply, Combine"""

"""Split: phân tách và nhóm một DataFrame dựa trên giá trị specified key"""
"""Apply: tính toán các function - thường là aggregate, transformation, filtering, trên từng nhóm"""
"""Combine: gộp kết quả của các operations ấy thành một output array"""

# Small demo frame for split-apply-combine: the keys A, B, C each appear twice,
# paired with the integers 0..5.
dataFrame = pandas.DataFrame(
    {"key": ["A", "B", "C"] * 2, "data": range(6)},
    columns=["key", "data"],
)

dataFrame

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [12]:
# Split step only: groupby() returns a DataFrameGroupBy object, not a result table.
dataFrame.groupby("key")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6216a686d8>

In [13]:
# Apply + combine: sum the "data" column within each "key" group.
dataFrame.groupby("key").sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [14]:
"""DataFrameGroupBy object"""
"""Có thể xem nó như một tập hợp DataFrames"""

"""Column indexing"""
# The DataFrameGroupBy object can be thought of as a collection of DataFrames,
# here one per value of "method".
planets.groupby("method")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6216a7cba8>

In [15]:
# Column indexing on the GroupBy selects a single column and yields a SeriesGroupBy.
planets.groupby("method")["orbital_period"]



<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f6216a723c8>

In [16]:
"""Một điều quan trọng, chả có tính toán nào được thực hiện cho tới khi chúng ta gọi một vài phương thức trên các GroupBy object này"""
# Nothing is computed until a method is called on the GroupBy object;
# median() triggers the per-method computation here.
planets.groupby("method")["orbital_period"].median()



method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [17]:
"""Iteration over groups"""
# Iterating over a GroupBy yields (group label, sub-frame) pairs.
for method_name, members in planets.groupby("method"):
    group_shape = members.shape
    print("{0:30s} shape = {1}".format(method_name, group_shape))



Astrometry                     shape = (2, 6)
Eclipse Timing Variations      shape = (9, 6)
Imaging                        shape = (38, 6)
Microlensing                   shape = (23, 6)
Orbital Brightness Modulation  shape = (3, 6)
Pulsar Timing                  shape = (5, 6)
Pulsation Timing Variations    shape = (1, 6)
Radial Velocity                shape = (553, 6)
Transit                        shape = (397, 6)
Transit Timing Variations      shape = (4, 6)


In [18]:
"""Dispatch methods"""
"""Các phương thức không được triển khai minh bạch bởi GroupBy object sẽ được truyền và gọi trên các groups - tức là các DataFrames hoặc S"""
# Methods not implemented explicitly by the GroupBy object are passed through and
# called on each group. describe() summarises "year" per method; unstack() then
# reshapes that table into one long Series indexed by (statistic, method).
planets.groupby("method")["year"].describe().unstack()


       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

In [2]:
"""Aggregate, filter, transform, apply"""

# Seeded generator so the random "data2" column is reproducible.
randomer = numpy.random.RandomState(seed=0)
dataFrame = pandas.DataFrame(
    {
        "key": ["A", "B", "C"] * 2,
        "data1": range(6),
        "data2": randomer.randint(low=0, high=10, size=6),
    },
    columns=["key", "data1", "data2"],
)

dataFrame

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [3]:
"""Aggregation """
"""aggregate() method cho phép chúng ta aggregate linh hoạt hơn nhiều"""
# aggregate() accepts a list of reductions and computes each one for every column.
# "median" is given by name, like "min" and "max": passing the numpy.median callable
# is deprecated in recent pandas releases and emits a FutureWarning there.
dataFrame.groupby("key").aggregate(["min", "median", "max"])


Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [4]:
"""Ngoài truyền mảng các phép tập hợp chúng ta cũng có thể truyền dictionary để xác định các operations ta muốn apply cho từng cột"""
# A dictionary maps each column to the reduction applied to it:
# minimum of "data1" and maximum of "data2" per key.
dataFrame.groupby("key").agg({"data1": "min", "data2": "max"})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [6]:
"""Filtering"""
"""Cho phép drop các data dựa trên group properties"""
def filterFunc(group):
    """Tell whether a group should be kept by ``groupby().filter``.

    Parameters
    ----------
    group : DataFrame
        Rows of one group; must contain a "data2" column.

    Returns
    -------
    bool-like
        True when the standard deviation of "data2" is greater than 4.
    """
    spread = group["data2"].std()
    return spread > 4

# Keep only the rows of the groups for which filterFunc returns True.
# NOTE(review): the saved output shows three "<class 'pandas.core.frame.DataFrame'>"
# lines that nothing in this cell prints — presumably left over from an earlier
# version of filterFunc; re-run the cell to refresh the output.
dataFrame.groupby("key").filter(filterFunc)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


In [7]:

"""Transformation"""

# Centre the data group by group: subtract each key's mean from its members.
# This is not the last expression of the cell, so a bare expression would be
# evaluated and silently discarded; display() (available by default in Jupyter
# kernels) makes the transformed frame actually appear in the output.
display(dataFrame.groupby("key").transform(lambda group: group - group.mean()))

"""apply() method"""
"""Cho phép apply một function tùy ý tới các groups"""
"""Bạn có thể trả về hoặc là Pandas object(DataFrame, Series) hoặc một scalar"""
def normByData2(group):
    """Normalise "data1" by the sum of "data2" within one group.

    Parameters
    ----------
    group : DataFrame
        Rows of one group; must contain "data1" and "data2" columns.

    Returns
    -------
    DataFrame
        A new frame with the same columns in the same order, where "data1" has
        been divided by the total of "data2". The input frame is not modified,
        because functions handed to ``groupby().apply`` should not mutate the
        object they receive.
    """
    data2_total = group["data2"].sum()
    return group.assign(data1=group["data1"] / data2_total)
# Run normByData2 on every "key" group and combine the returned frames.
# NOTE(review): newer pandas releases appear to warn about, or exclude, the grouping
# column when apply() operates on groups — confirm against the installed version.
dataFrame.groupby("key").apply(normByData2)

Unnamed: 0,key,data1,data2
0,A,0.0,5
1,B,0.142857,0
2,C,0.166667,3
3,A,0.375,3
4,B,0.571429,7
5,C,0.416667,9


In [8]:
"""Xác định split key"""

"""List, array, series thể hiện group cho mỗi hàng"""
"""Miễn là độ dài khớp với DataFrame đó"""
# Any list whose length matches the number of rows can serve as the key:
# row i is assigned to group groupingKeys[i].
groupingKeys = [0, 1, 0, 1, 2, 0]
# numeric_only=True keeps the result limited to data1/data2 on every pandas
# version; newer releases would otherwise also concatenate the string "key" column.
dataFrame.groupby(groupingKeys).sum(numeric_only=True)

Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [9]:
# A Series can be the key too; this is the verbose form of grouping by the "key" column.
dataFrame.groupby(dataFrame["key"]).sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,8
B,5,7
C,7,12


In [10]:
"""MỘt dictionary hoặc series để ánh xạ index value sang group"""
# Move "key" into the index so that index values can be mapped to groups.
dataFrame2 = dataFrame.set_index("key")
# Dictionary mapping each index value to the label of the group it belongs to.
mapping = {"A": "vowel", "B": "consonant", "C": "consonant"}
# Group rows by the mapped label and sum each group.
dataFrame2.groupby(mapping).sum()

Unnamed: 0,data1,data2
consonant,12,19
vowel,3,8


In [11]:
"""Any python function nào nhận đầu vào là index value và đầu ra là group"""

# Any function that takes an index value and returns a group label can be the key;
# here the index values are lower-cased.
dataFrame2.groupby(str.lower).mean()

Unnamed: 0,data1,data2
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


In [12]:
"""Hoặc một list các cái thằng phía trên kết hợp lại"""
# Several of the key kinds above can be combined in a list, which groups on a two-level index.
dataFrame2.groupby([str.lower, mapping]).mean()

Unnamed: 0,Unnamed: 1,data1,data2
a,vowel,1.5,4.0
b,consonant,2.5,3.5
c,consonant,3.5,6.0


In [13]:
# Label every row with the decade of its "year", e.g. 2006 -> "2000s".
decade = (planets["year"] // 10 * 10).astype(str) + "s"
decade.name = "decade"
print(decade)

# Total of "number" per (method, decade), pivoted so that decades become columns;
# combinations with no rows are shown as 0.
(
    planets
    .groupby(["method", decade])["number"]
    .sum()
    .unstack()
    .fillna(0)
)

0       2000s
1       2000s
2       2010s
3       2000s
4       2000s
        ...  
1030    2000s
1031    2000s
1032    2000s
1033    2000s
1034    2000s
Name: decade, Length: 1035, dtype: object


decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0
