# Pandas for DataFrame manipulation

In [26]:
import numpy as np 
import pandas as pd

In [27]:
# Raw employee records stored column-wise: every key becomes a DataFrame column.
dict_salary = {
    'name': ['Kim Jisoo', 'Robert Downey', 'Johny Depp', 'Harry Maguire', 'Ana De Armas'],
    'gender': ['F', 'M', 'M', 'M', 'F'],
    'hire_date': ['2018-05-01', '2017-08-01', '2018-11-01', '2019-02-01', '2017-03-01'],
    'salary': [7000, 9000, 8000, 4000, 9000],
}

# Employee numbers 1001..1005 are used as the row labels
# (the stop value passed to np.arange is exclusive).
emp_number = np.arange(1001, 1006)

# Build the DataFrame and keep it in the variable `df`;
# the bare `df` on the last line displays it.
df = pd.DataFrame(data=dict_salary, index=emp_number)
df

Unnamed: 0,name,gender,hire_date,salary
1001,Kim Jisoo,F,2018-05-01,7000
1002,Robert Downey,M,2017-08-01,9000
1003,Johny Depp,M,2018-11-01,8000
1004,Harry Maguire,M,2019-02-01,4000
1005,Ana De Armas,F,2017-03-01,9000


## Adding New Data

### Add a New Row

In [28]:
# Add a new employee by assigning to a row label that does not exist yet (1006).
# The list values fill the columns in order: name, gender, hire_date, salary.
df.loc[1006] = ['Kim Namjoon', 'M', '2019-01-01', 10000]
df

Unnamed: 0,name,gender,hire_date,salary
1001,Kim Jisoo,F,2018-05-01,7000
1002,Robert Downey,M,2017-08-01,9000
1003,Johny Depp,M,2018-11-01,8000
1004,Harry Maguire,M,2019-02-01,4000
1005,Ana De Armas,F,2017-03-01,9000
1006,Kim Namjoon,M,2019-01-01,10000


### Add a New Column

In [29]:
# Derive a new column from an existing one: take-home pay is 80% of the salary.
# Assigning to a column label that does not exist yet creates that column.
df["take_home_pay"] = df["salary"].mul(0.8)
df

Unnamed: 0,name,gender,hire_date,salary,take_home_pay
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1006,Kim Namjoon,M,2019-01-01,10000,8000.0


In [30]:
# Employee ids 1001..1006, one per row of `df` after row 1006 was added.
# The stop value of np.arange is exclusive, hence 1006 + 1.
emp_number = np.arange(1001, 1006 + 1)
emp_number

array([1001, 1002, 1003, 1004, 1005, 1006])

In [31]:
# Insert `employee_id` as the first column (position 0).
# DataFrame.insert modifies `df` in place and raises ValueError when the column
# already exists, so the guard keeps this cell safe to run more than once.
if "employee_id" not in df.columns:
    df.insert(0, "employee_id", emp_number)
df

Unnamed: 0,employee_id,name,gender,hire_date,salary,take_home_pay
1001,1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,1005,Ana De Armas,F,2017-03-01,9000,7200.0
1006,1006,Kim Namjoon,M,2019-01-01,10000,8000.0


In [None]:
# Add a new column at a specific position.
# NOTE(review): this cell has no execution count and none of the outputs shown
# further down contain an 'umur' column, so it apparently was not run in the
# recorded session; running it changes what every later display of `df` shows.
list_umur = [23, 24, 25, 26, 27, 28]  # one age per row (6 rows)

# df.insert(loc, column, value) modifies the DataFrame in place:
# no need to store the result in a variable or to pass inplace=True.
df.insert(loc=3, column='umur', value=list_umur)

## Deleting Data

### Delete Column

In [32]:
# Drop a column. Without inplace=True, drop returns a new DataFrame
# and `df` itself keeps the column.
df.drop(columns="employee_id")

Unnamed: 0,name,gender,hire_date,salary,take_home_pay
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1006,Kim Namjoon,M,2019-01-01,10000,8000.0


In [33]:
# `df` still contains employee_id: the previous drop was not applied in place.
df

Unnamed: 0,employee_id,name,gender,hire_date,salary,take_home_pay
1001,1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,1005,Ana De Armas,F,2017-03-01,9000,7200.0
1006,1006,Kim Namjoon,M,2019-01-01,10000,8000.0


In [34]:
# Drop the column for real: inplace=True modifies `df` itself and returns None,
# so this cell displays nothing.
df.drop(columns="employee_id", inplace=True)

In [35]:
# The employee_id column is now gone from `df`.
df

Unnamed: 0,name,gender,hire_date,salary,take_home_pay
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1006,Kim Namjoon,M,2019-01-01,10000,8000.0


### Delete Row

In [36]:
# Drop a row by its index label. The result is a new DataFrame;
# `df` itself still contains row 1006.
df.drop(index=1006)

Unnamed: 0,name,gender,hire_date,salary,take_home_pay
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0


## Index

In [37]:
# Show the row index labels.
df.index 

Index([1001, 1002, 1003, 1004, 1005, 1006], dtype='int64')

In [38]:
# Show the column names.
df.columns

Index(['name', 'gender', 'hire_date', 'salary', 'take_home_pay'], dtype='object')

In [None]:
# Converting a DataFrame to a list yields its column names.
list(df)

['name', 'gender', 'hire_date', 'salary', 'take_home_pay']

In [23]:
# Get the first column name.
df.columns[0]

'name'

In [24]:
# Same result, taken from the list of column names instead.
list(df)[0] 

'name'

In [39]:
# Reset the row labels to a default integer index starting at 0.
df.reset_index()

# The previous index labels (1001-1006) become a regular column named 'index'.
# The change is not permanent: a new DataFrame is returned.

Unnamed: 0,index,name,gender,hire_date,salary,take_home_pay
0,1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1,1002,Robert Downey,M,2017-08-01,9000,7200.0
2,1003,Johny Depp,M,2018-11-01,8000,6400.0
3,1004,Harry Maguire,M,2019-02-01,4000,3200.0
4,1005,Ana De Armas,F,2017-03-01,9000,7200.0
5,1006,Kim Namjoon,M,2019-01-01,10000,8000.0


In [40]:
# Use the inplace=True parameter to apply the reset to `df` itself;
# the result does not need to be stored in a variable.
df.reset_index(inplace=True)

In [42]:
# Move the 'index' column back into the row index (in place);
# the row index is now named 'index'.
df.set_index("index", inplace=True)

In [43]:
# Row labels are 1001-1006 again, with 'index' shown as the index name.
df

Unnamed: 0_level_0,name,gender,hire_date,salary,take_home_pay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1006,Kim Namjoon,M,2019-01-01,10000,8000.0


## Multi Index

index bertingkat

In [44]:
# Outer level of the row labels: the city, repeated once per branch.
outside = ['Jakarta'] * 3 + ['Bandung'] * 3
# Inner level of the row labels: the branch number within each city.
inside = [1, 2, 3] * 2

# Column names.
nama_kolom = ['TokoA', 'TokoB']

# DataFrame contents: 6 rows x 2 columns of random integers, low=1 and high=100.
# Seeding NumPy's global random state first makes the values reproducible.
np.random.seed(0)
isi = np.random.randint(1, 100, size=(6, 2))
isi

array([[45, 48],
       [65, 68],
       [68, 10],
       [84, 22],
       [37, 88],
       [71, 89]])

In [45]:
# Build the DataFrame from the random array; rows get the default 0..5 index for now.
df_toko = pd.DataFrame(isi, columns=nama_kolom)
df_toko

Unnamed: 0,TokoA,TokoB
0,45,48
1,65,68
2,68,10
3,84,22
4,37,88
5,71,89


In [46]:
# Pair up the items found at the same position in the two lists: (city, branch).
hier_index = [(kota, cabang) for kota, cabang in zip(outside, inside)]
hier_index

[('Jakarta', 1),
 ('Jakarta', 2),
 ('Jakarta', 3),
 ('Bandung', 1),
 ('Bandung', 2),
 ('Bandung', 3)]

In [47]:
# Convert the list of tuples into a MultiIndex.
multi_index = pd.MultiIndex.from_tuples(hier_index) 
multi_index 

MultiIndex([('Jakarta', 1),
            ('Jakarta', 2),
            ('Jakarta', 3),
            ('Bandung', 1),
            ('Bandung', 2),
            ('Bandung', 3)],
           )

In [48]:
# Replace the row labels with the MultiIndex created above.

# Option 1: set_index returns a new DataFrame with the MultiIndex as its row index.
df_toko.set_index(multi_index)

Unnamed: 0,Unnamed: 1,TokoA,TokoB
Jakarta,1,45,48
Jakarta,2,65,68
Jakarta,3,68,10
Bandung,1,84,22
Bandung,2,37,88
Bandung,3,71,89


In [49]:
# Option 2: assign the MultiIndex directly to the index attribute of df_toko.
df_toko.index = multi_index
df_toko 

Unnamed: 0,Unnamed: 1,TokoA,TokoB
Jakarta,1,45,48
Jakarta,2,65,68
Jakarta,3,68,10
Bandung,1,84,22
Bandung,2,37,88
Bandung,3,71,89


In [50]:
# Inspect the row index, which is now a two-level MultiIndex.
df_toko.index

MultiIndex([('Jakarta', 1),
            ('Jakarta', 2),
            ('Jakarta', 3),
            ('Bandung', 1),
            ('Bandung', 2),
            ('Bandung', 3)],
           )

In [51]:
# The two index levels have no names yet.
df_toko.index.names

FrozenList([None, None])

In [52]:
# Name the index levels; the names are displayed as headers above the index labels.
df_toko.index.names = ['Kota', 'Cabang']
df_toko 

Unnamed: 0_level_0,Unnamed: 1_level_0,TokoA,TokoB
Kota,Cabang,Unnamed: 2_level_1,Unnamed: 3_level_1
Jakarta,1,45,48
Jakarta,2,65,68
Jakarta,3,68,10
Bandung,1,84,22
Bandung,2,37,88
Bandung,3,71,89


In [53]:
# Select the rows for 'Jakarta' (outer index level).
df_toko.loc['Jakarta']

Unnamed: 0_level_0,TokoA,TokoB
Cabang,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45,48
2,65,68
3,68,10


In [54]:
# Select the row for Kota 'Jakarta', Cabang 2: pick the outer level first,
# then pass a list ([2]) for the remaining level so the result is a DataFrame.
df_toko.loc['Jakarta'].loc[[2]] 

Unnamed: 0_level_0,TokoA,TokoB
Cabang,Unnamed: 1_level_1,Unnamed: 2_level_1
2,65,68


In [57]:
# Cross-section: take the rows with Cabang == 2 from every Kota.
# xs works along the rows by default (axis=0).
df_toko.xs(2, level='Cabang')

Unnamed: 0_level_0,TokoA,TokoB
Kota,Unnamed: 1_level_1,Unnamed: 2_level_1
Jakarta,65,68
Bandung,37,88


In [58]:
# Transpose so that the (Kota, Cabang) levels move to the columns (multi-level columns).
df_baru = df_toko.T
df_baru

Kota,Jakarta,Jakarta,Jakarta,Bandung,Bandung,Bandung
Cabang,1,2,3,1,2,3
TokoA,45,65,68,84,37,71
TokoB,48,68,10,22,88,89


In [59]:
# Cross-section along the columns (axis=1): keep Cabang == 2 for every Kota.
df_baru.xs(2, axis=1, level='Cabang')

Kota,Jakarta,Bandung
TokoA,65,37
TokoB,68,88


## Sort Values

Mengurutkan data

In [60]:
# Back to the employee DataFrame, shown here before sorting.
df

Unnamed: 0_level_0,name,gender,hire_date,salary,take_home_pay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1006,Kim Namjoon,M,2019-01-01,10000,8000.0


In [61]:
# Sort the DataFrame by 'name'.
# Ascending order is the default; it is spelled out here for clarity.
df.sort_values('name', ascending=True)

Unnamed: 0_level_0,name,gender,hire_date,salary,take_home_pay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1006,Kim Namjoon,M,2019-01-01,10000,8000.0
1002,Robert Downey,M,2017-08-01,9000,7200.0


In [62]:
# Sort the DataFrame by 'salary', largest first (descending).
df.sort_values('salary', ascending=False)

Unnamed: 0_level_0,name,gender,hire_date,salary,take_home_pay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1006,Kim Namjoon,M,2019-01-01,10000,8000.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0


In [63]:
# Sort by gender ascending ('F' before 'M'),
# then by salary from largest to smallest within each gender.
df.sort_values(['gender', 'salary'], ascending=[True, False])

Unnamed: 0_level_0,name,gender,hire_date,salary,take_home_pay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
1006,Kim Namjoon,M,2019-01-01,10000,8000.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0


In [64]:
# Sort the DataFrame by its index labels, here in descending order.
df.sort_index(ascending=False) 

Unnamed: 0_level_0,name,gender,hire_date,salary,take_home_pay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1006,Kim Namjoon,M,2019-01-01,10000,8000.0
1005,Ana De Armas,F,2017-03-01,9000,7200.0
1004,Harry Maguire,M,2019-02-01,4000,3200.0
1003,Johny Depp,M,2018-11-01,8000,6400.0
1002,Robert Downey,M,2017-08-01,9000,7200.0
1001,Kim Jisoo,F,2018-05-01,7000,5600.0
