# Data Wrangling  :: Join, combine และ reshape/rearrange ข้อมูล

In [5]:
import pandas as pd
import numpy as np

### เริ่มจากข้อมูลที่ซับซ้อนขึ้นใน Series ของ Pandas

In [7]:
# Build a 9-element Series of standard-normal samples whose index has two
# levels: an outer letter key and an inner integer key.
outer_keys = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_keys = [1, 2, 3, 1, 3, 1, 2, 2, 3]
samples = np.random.randn(9)
data = pd.Series(samples, index=[outer_keys, inner_keys])
data

a  1   -2.255926
   2   -0.102143
   3    0.153486
b  1   -0.818347
   3    1.189137
c  1   -0.025854
   2   -2.664422
d  2   -1.907950
   3   -0.857129
dtype: float64

In [8]:
# Inspect the index object: a MultiIndex holding the two key levels.
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [9]:
# Select by an outer-level label; the result is indexed by the inner level only.
data['a']

1   -2.255926
2   -0.102143
3    0.153486
dtype: float64

In [12]:
# Slice on the outer level; both endpoints 'b' and 'd' are included.
data['b':'d']

b  1   -0.818347
   3    1.189137
c  1   -0.025854
   2   -2.664422
d  2   -1.907950
   3   -0.857129
dtype: float64

In [13]:
# Pick several outer-level labels at once by passing a list to .loc.
data.loc[['b','d']]

b  1   -0.818347
   3    1.189137
d  2   -1.907950
   3   -0.857129
dtype: float64

In [15]:
# Select on the inner level: every row whose second key is 2, indexed by the
# outer key ('b' has no inner key 2, so it does not appear).
data.loc[:,2]

a   -0.102143
c   -2.664422
d   -1.907950
dtype: float64

### เราสามารถแปลงโครงสร้างข้อมูลในรูปแบบลำดับชั้นของ index (Pivot table) ให้อยู่ในรูปแบบ DataFrame ได้ดังนี้

In [22]:
# Pivot the outer index level (the letters) into columns, leaving the inner
# keys 1-3 as rows; this is the orientation of the table displayed below.
# The default unstack() pivots the inner level instead, which would put the
# letters on the rows.
data = data.unstack(level=0)
type(data)

pandas.core.frame.DataFrame

In [23]:
# Display the unstacked frame; key combinations absent from the original
# Series show up as empty (missing) cells.
data

Unnamed: 0,a,b,c,d
1,-2.255926,-0.818347,-0.025854,
2,-0.102143,,-2.664422,-1.90795
3,0.153486,1.189137,,-0.857129


In [24]:
# Exercise: put your code here to turn the DataFrame back into the hierarchically indexed Series shown below.

1  a   -2.255926
   b   -0.818347
   c   -0.025854
2  a   -0.102143
   c   -2.664422
   d   -1.907950
3  a    0.153486
   b    1.189137
   d   -0.857129
dtype: float64

### มาดูข้อมูลซับซ้อนใน DataFrame กันบ้าง

In [66]:
# 4x4 table holding 0..15 with two-level labels on both axes:
# rows are (group, number) and columns are (city, colour).
row_keys = [['group 1', 'group 1', 'group 2', 'group 2'], [1, 2, 1, 2]]
column_keys = [['Bangkok', 'Bangkok', 'CNX', 'CNX'],
               ['Green', 'Red', 'Green', 'Red']]
cell_values = np.arange(16).reshape(4, 4)
frame = pd.DataFrame(cell_values, index=row_keys, columns=column_keys)
data = frame  # both names refer to the same DataFrame object
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green,Red
group 1,1,0,1,2,3
group 1,2,4,5,6,7
group 2,1,8,9,10,11
group 2,2,12,13,14,15


In [67]:
# Row labels of the frame: a two-level MultiIndex (group, number).
data.index

MultiIndex(levels=[['group 1', 'group 2'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [68]:
# Column labels of the frame: also a two-level MultiIndex (city, colour).
data.columns

MultiIndex(levels=[['Bangkok', 'CNX'], ['Green', 'Red']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [69]:
# Name the two row-index levels so they can be referred to by name.
data.index.names = [ 'Key 1', 'Key 2' ]
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green,Red
Key 1,Key 2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
group 1,1,0,1,2,3
group 1,2,4,5,6,7
group 2,1,8,9,10,11
group 2,2,12,13,14,15


In [70]:
# Name the two column levels.
# NOTE(review): 'Provice name' looks like a typo for 'Province name'; it is
# kept as-is because the recorded outputs use this spelling.
data.columns.names = ['Provice name', 'Color']
data

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 1,Key 2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
group 1,1,0,1,2,3
group 1,2,4,5,6,7
group 2,1,8,9,10,11
group 2,2,12,13,14,15


In [71]:
# Access data in DataFrame: attribute access on a top-level column label
# returns the sub-frame made of its inner ('Color') columns.
data.Bangkok

Unnamed: 0_level_0,Color,Green,Red
Key 1,Key 2,Unnamed: 2_level_1,Unnamed: 3_level_1
group 1,1,0,1
group 1,2,4,5
group 2,1,8,9
group 2,2,12,13


In [72]:
# Bracket indexing with a top-level column label selects the same sub-frame
# of inner ('Color') columns.
data['Bangkok']

Unnamed: 0_level_0,Color,Green,Red
Key 1,Key 2,Unnamed: 2_level_1,Unnamed: 3_level_1
group 1,1,0,1
group 1,2,4,5
group 2,1,8,9
group 2,2,12,13


### Reorder and sort data in DataFrame

In [76]:
# Swap the two row-index levels, referring to them by name. The rows keep
# their current order and `data` itself is not modified.
data.swaplevel('Key 1', 'Key 2')

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 2,Key 1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,group 1,0,1,2,3
2,group 1,4,5,6,7
1,group 2,8,9,10,11
2,group 2,12,13,14,15


In [85]:
# Sort the rows by the first index level ('Key 1').
data.sort_index(level=0)

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 1,Key 2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
group 1,1,0,1,2,3
group 1,2,4,5,6,7
group 2,1,8,9,10,11
group 2,2,12,13,14,15


In [86]:
# Sort the rows by the second index level ('Key 2').
data.sort_index(level=1)


Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 1,Key 2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
group 1,1,0,1,2,3
group 2,1,8,9,10,11
group 1,2,4,5,6,7
group 2,2,12,13,14,15


In [88]:
# Exercise: write code that sorts `data` into the order shown below ('Key 2' descending, then 'Key 1' descending).

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 1,Key 2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
group 2,2,12,13,14,15
group 1,2,4,5,6,7
group 2,1,8,9,10,11
group 1,1,0,1,2,3


In [93]:
# Swap the two row-index levels, this time referring to them by position.
data.swaplevel(0,1)

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 2,Key 1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,group 1,0,1,2,3
2,group 1,4,5,6,7
1,group 2,8,9,10,11
2,group 2,12,13,14,15


In [95]:
# Swap the levels, then sort by the new first level ('Key 2').
data.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 2,Key 1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,group 1,0,1,2,3
1,group 2,8,9,10,11
2,group 1,4,5,6,7
2,group 2,12,13,14,15


In [98]:
# Swap the levels, then sort by the new first level ('Key 2') in descending order.
data.swaplevel(0,1).sort_index(level=0, ascending=False)

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 2,Key 1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,group 2,12,13,14,15
2,group 1,4,5,6,7
1,group 2,8,9,10,11
1,group 1,0,1,2,3


### Summary data

In [99]:
# Show the frame again before aggregating it.
data

Unnamed: 0_level_0,Provice name,Bangkok,Bangkok,CNX,CNX
Unnamed: 0_level_1,Color,Green,Red,Green,Red
Key 1,Key 2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
group 1,1,0,1,2,3
group 1,2,4,5,6,7
group 2,1,8,9,10,11
group 2,2,12,13,14,15


In [105]:
# Sum by level = Key 1
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level and summing is the supported equivalent.
data.groupby(level='Key 1').sum()

Provice name,Bangkok,Bangkok,CNX,CNX
Color,Green,Red,Green,Red
Key 1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
group 1,4,6,8,10
group 2,20,22,24,26


In [104]:
# Sum by level = Key 2
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level and summing is the supported equivalent.
data.groupby(level='Key 2').sum()

Provice name,Bangkok,Bangkok,CNX,CNX
Color,Green,Red,Green,Red
Key 2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,8,10,12,14
2,16,18,20,22


In [102]:
# Sum by level = Color
# DataFrame.sum(level=..., axis=1) was removed in pandas 2.0 and
# groupby(axis=1) is deprecated as well, so group the transposed frame on the
# 'Color' column level, sum, and transpose the result back.
data.T.groupby(level='Color').sum().T

Unnamed: 0_level_0,Color,Green,Red
Key 1,Key 2,Unnamed: 2_level_1,Unnamed: 3_level_1
group 1,1,2,4
group 1,2,10,12
group 2,1,18,20
group 2,2,26,28


### จัดการ Row index/ Column ของ DataFrame ตามความต้องการ
* การสร้าง index
* การ reset index

In [107]:
# Small example frame: 'a' counts up 0..6, 'b' counts down 7..1, 'c' is a
# group label and 'd' a running number within each group.
raw_columns = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [*range(3), *range(4)],
}
data = pd.DataFrame(raw_columns)
data

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


ทำการรวม column c และ d เข้าด้วยกันเป็น index ของ DataFrame

In [111]:
# Move columns 'c' and 'd' into a two-level row index. The result is stored
# in data_new; `data` itself keeps its original columns.
data_new = data.set_index(['c', 'd'])
data_new

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


ผลการสร้าง index จาก column c และ d คือ column c และ d หายไปจากข้อมูล

แต่ถ้าเราไม่ต้องการให้ column ทั้งสองหายไป สามารถทำได้ด้วยการกำหนดค่า drop=False ดังนี้

In [117]:
# drop=False keeps 'c' and 'd' as ordinary columns in addition to using
# them as the row index.
data_new = data.set_index(['c', 'd'], drop=False)
data_new

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


### ถ้าต้องการ reset index สามารถทำได้เช่นกัน

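แต่ชื่อของ index ต้องไม่ซ้ำกับชื่อ column ที่มีอยู่แล้ว !!! (เช่น ถ้าสร้าง index ด้วย drop=False ไว้ จะเรียก reset_index() ไม่ได้ เพราะ column c และ d มีอยู่แล้ว)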

In [120]:
# Rebuild data_new without drop=False so 'c' and 'd' exist only in the index,
# then turn the index levels back into ordinary columns; the rows get a
# fresh 0..6 integer index.
data_new = data.set_index(['c', 'd'])
data_new.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
