# Belajar pandas

# #01: Menyertakan prefix dan suffix pada kolom data frame

In [1]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.3.3
1.21.2


In [2]:
# Prepare a 5x5 data frame of random integers (1-9) with columns A-E.
n_rows, n_cols = 5, 5
cols = tuple('ABCDE')

random_values = np.random.randint(1, 10, size=(n_rows, n_cols))
df = pd.DataFrame(random_values, columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,4,8,5,6,3
1,9,4,2,9,1
2,4,9,3,5,7
3,6,2,6,4,1
4,1,3,5,5,9


In [3]:
tuple('ABCDE')

('A', 'B', 'C', 'D', 'E')

In [4]:
# Menyertakan prefix kolom
df.add_prefix('kolom_')

Unnamed: 0,kolom_A,kolom_B,kolom_C,kolom_D,kolom_E
0,4,8,5,6,3
1,9,4,2,9,1
2,4,9,3,5,7
3,6,2,6,4,1
4,1,3,5,5,9


In [5]:
# Menyertakan suffix kolom
df.add_suffix('_field')

Unnamed: 0,A_field,B_field,C_field,D_field,E_field
0,4,8,5,6,3
1,9,4,2,9,1
2,4,9,3,5,7
3,6,2,6,4,1
4,1,3,5,5,9


# #02: Pemilihan baris (rows selection) pada data frame

In [6]:
# Persiapan data frame
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1,5, size=(n_rows, n_cols)), columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,1,4,3,3,3
1,3,3,1,2,1
2,4,4,1,3,2
3,4,3,4,3,4
4,3,4,4,1,3
5,4,2,1,4,2
6,2,4,1,4,3
7,1,2,3,4,1
8,4,1,1,4,4
9,3,2,4,3,2


In [7]:
# Selection dengan operator logika | (or)
df[(df['A'] == 1) | (df['A'] == 3)]

Unnamed: 0,A,B,C,D,E
0,1,4,3,3,3
1,3,3,1,2,1
4,3,4,4,1,3
7,1,2,3,4,1
9,3,2,4,3,2


In [8]:
# Selection dengan fungsi isin()
df[df['A'].isin([1,3])]

Unnamed: 0,A,B,C,D,E
0,1,4,3,3,3
1,3,3,1,2,1
4,3,4,4,1,3
7,1,2,3,4,1
9,3,2,4,3,2


In [9]:
# Mengenal operator negasi ~
df[~df['A'].isin([1,3])]

Unnamed: 0,A,B,C,D,E
2,4,4,1,3,2
3,4,3,4,3,4
5,4,2,1,4,2
6,2,4,1,4,3
8,4,1,1,4,4


# #03: Konversi tipe data string ke numerik pada kolom data frame

In [10]:
# Prepare a data frame whose columns hold numbers stored as strings;
# 'col1' additionally contains one non-numeric entry ('teks').
data = dict(
    col1=['1', '2', '3', 'teks'],
    col2=['1', '2', '3', '4'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [11]:
df.dtypes

col1    object
col2    object
dtype: object

In [12]:
# konversi tipe data dengan fungsi astype()
df_x = df.astype({'col2':'int64'})
df_x

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [13]:
df_x.dtypes

col1    object
col2     int64
dtype: object

In [14]:
# Konversi tipe data numerik dengan fungsi to_numeric()
df.apply(pd.to_numeric, errors ='coerce')

Unnamed: 0,col1,col2
0,1.0,1
1,2.0,2
2,3.0,3
3,,4


# #04: Pemilihan kolom (columns selection) pada pandas data frame berdasarkan tipe data

In [15]:
# Prepare a data frame with one column per dtype family:
# datetime ('index'), float ('bil_pecahan'), integer ('bil_bulat') and text ('teks').
n_rows = 5
n_cols = 2
cols = ['bil_pecahan', 'bil_bulat']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), columns=cols)
df['bil_pecahan'] = df['bil_pecahan'].astype('float')

# Hourly timestamps starting at 2000-01-01, built with the public pd.date_range
# API instead of the deprecated pd.util.testing.makeDateIndex helper (which emits
# a FutureWarning and is not available in newer pandas releases).
# An offset object is used for the frequency so the code does not depend on the
# spelling of the hourly alias string, which differs between pandas versions.
df.index = pd.date_range('2000-01-01', periods=n_rows, freq=pd.offsets.Hour())
# reset_index() turns the unnamed datetime index into a regular column named 'index'.
df = df.reset_index()

df['teks'] = list('ABCDE')

df

  import pandas.util.testing


Unnamed: 0,index,bil_pecahan,bil_bulat,teks
0,2000-01-01 00:00:00,11.0,1,A
1,2000-01-01 01:00:00,11.0,3,B
2,2000-01-01 02:00:00,18.0,10,C
3,2000-01-01 03:00:00,16.0,17,D
4,2000-01-01 04:00:00,8.0,19,E


In [16]:
df.dtypes

index          datetime64[ns]
bil_pecahan           float64
bil_bulat               int32
teks                   object
dtype: object

In [17]:
# memilih kolom bertipe data numerik
df.select_dtypes(include='number')

Unnamed: 0,bil_pecahan,bil_bulat
0,11.0,1
1,11.0,3
2,18.0,10
3,16.0,17
4,8.0,19


In [18]:
df.select_dtypes(include='float')

Unnamed: 0,bil_pecahan
0,11.0
1,11.0
2,18.0
3,16.0
4,8.0


In [19]:
df.select_dtypes(include='int')

Unnamed: 0,bil_bulat
0,1
1,3
2,10
3,17
4,19


In [20]:
# Memilih kolom bertipe data string atau object
df.select_dtypes(include='object')

Unnamed: 0,teks
0,A
1,B
2,C
3,D
4,E


In [21]:
# Memilih kolom bertipe data datetime
df.select_dtypes(include='datetime')

Unnamed: 0,index
0,2000-01-01 00:00:00
1,2000-01-01 01:00:00
2,2000-01-01 02:00:00
3,2000-01-01 03:00:00
4,2000-01-01 04:00:00


In [22]:
# Memilih kolom dengan kombinasi tipe data
df.select_dtypes(include=['number','object'])

Unnamed: 0,bil_pecahan,bil_bulat,teks
0,11.0,1,A
1,11.0,3,B
2,18.0,10,C
3,16.0,17,D
4,8.0,19,E


# #05: Membalik urutan baris dan kolom pada data frame

In [23]:
# Persiapan data frame
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1,10,size=(n_rows, n_cols)), columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,3,1,1,2,4
1,5,3,5,2,3
2,9,8,1,6,1
3,7,3,8,8,3
4,2,1,6,5,1


In [24]:
# Membalik urutan kolom
df.loc[:, ::-1]

Unnamed: 0,E,D,C,B,A
0,4,2,1,1,3
1,3,2,5,3,5
2,1,6,1,8,9
3,3,8,8,3,7
4,1,5,6,1,2


In [25]:
# Membalik urutan baris
df.loc[::-1]

Unnamed: 0,A,B,C,D,E
4,2,1,6,5,1
3,7,3,8,8,3
2,9,8,1,6,1
1,5,3,5,2,3
0,3,1,1,2,4


In [26]:
# Membalik urutan baris dan melakukan penyesuaian ulang index
df.loc[::-1].reset_index(drop=True)

Unnamed: 0,A,B,C,D,E
0,2,1,6,5,1
1,7,3,8,8,3
2,9,8,1,6,1
3,5,3,5,2,3
4,3,1,1,2,4


# #06: Mengganti nama (label) kolom pada data frame

In [27]:
# Persiapan data frame
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1,10,size=(n_rows, n_cols)), columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,5,1,2,8,5
1,9,2,9,5,4
2,9,6,1,4,7
3,7,9,8,8,2
4,7,4,2,7,3


In [28]:
# Mengganti nama (label) untuk sebuah kolom pada data frame
df.rename(columns={'C' : 'Hobi'})

Unnamed: 0,A,B,Hobi,D,E
0,5,1,2,8,5
1,9,2,9,5,4
2,9,6,1,4,7
3,7,9,8,8,2
4,7,4,2,7,3


In [29]:
# Mengganti nama (label) untuk banyak kolom pada data frame
df.rename(columns={'A':'Nama', 'B':'Alamat', 'D':'Kota'})

Unnamed: 0,Nama,Alamat,C,Kota,E
0,5,1,2,8,5
1,9,2,9,5,4
2,9,6,1,4,7
3,7,9,8,8,2
4,7,4,2,7,3


# #07: Menghapus missing values pada data frame (NaN)

In [30]:
# Prepare a dummy data frame that contains missing values (NaN).
# It is built by hand with NumPy because pd.util.testing.makeMissingDataframe is a
# deprecated testing helper that is not available in newer pandas releases.
# The layout mirrors what that helper produced in this notebook: 30 rows, float
# columns A-D, and random 10-character strings as the index.
alphabet = np.array(list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'))
labels = [''.join(chars) for chars in np.random.choice(alphabet, size=(30, 10))]
values = np.random.randn(30, 4)
values[np.random.rand(30, 4) > 0.9] = np.nan  # blank out roughly 10% of the cells
# reset_index() moves the string labels into a regular column named 'index'.
df = pd.DataFrame(values, index=labels, columns=list('ABCD')).reset_index()
df.head()

Unnamed: 0,index,A,B,C,D
0,PwT8MzaALm,0.43193,,-0.952331,1.61212
1,nxTYQklYOf,2.107202,-0.435176,-0.275236,1.668938
2,IDev9eqxgh,-0.28594,-0.610829,-1.258713,-1.296162
3,W28Qeeq7EU,-0.912075,-1.29583,-2.355698,0.036775
4,18YrqlnzKe,-0.62723,-0.223618,0.352921,2.178866


In [31]:
df = df.rename(columns={'index' : 'Z'})
df.head()

Unnamed: 0,Z,A,B,C,D
0,PwT8MzaALm,0.43193,,-0.952331,1.61212
1,nxTYQklYOf,2.107202,-0.435176,-0.275236,1.668938
2,IDev9eqxgh,-0.28594,-0.610829,-1.258713,-1.296162
3,W28Qeeq7EU,-0.912075,-1.29583,-2.355698,0.036775
4,18YrqlnzKe,-0.62723,-0.223618,0.352921,2.178866


In [32]:
df_backup = df.copy(deep=True)

In [33]:
# Menghapus (drop) setiap kolom yang mengandung missing values
df = df.dropna(axis='columns')
df.head()

Unnamed: 0,Z
0,PwT8MzaALm
1,nxTYQklYOf
2,IDev9eqxgh
3,W28Qeeq7EU
4,18YrqlnzKe


In [34]:
# Menghapus (drop) setiap baris yang mengandung missing values
df = df_backup.copy(deep=True)
df = df.dropna(axis='rows')
df.head()

Unnamed: 0,Z,A,B,C,D
1,nxTYQklYOf,2.107202,-0.435176,-0.275236,1.668938
2,IDev9eqxgh,-0.28594,-0.610829,-1.258713,-1.296162
3,W28Qeeq7EU,-0.912075,-1.29583,-2.355698,0.036775
4,18YrqlnzKe,-0.62723,-0.223618,0.352921,2.178866
5,xrXdOdfikm,0.683146,1.703446,-2.612294,0.763559


In [35]:
# Persentase missing values untuk tiap kolom
df = df_backup.copy(deep=True)
df.isna().mean()

Z    0.000000
A    0.066667
B    0.166667
C    0.133333
D    0.033333
dtype: float64

In [36]:
# Drop columns based on a threshold on their share of missing values.
# `thresh` is the minimum number of non-missing values a column must have in
# order to be kept; it is set to 90% of the row count here, so columns with
# more than 10% missing values are dropped.
treshold = len(df)*0.9
df = df.dropna(thresh=treshold, axis='columns')
df.head()

Unnamed: 0,Z,A,D
0,PwT8MzaALm,0.43193,1.61212
1,nxTYQklYOf,2.107202,1.668938
2,IDev9eqxgh,-0.28594,-1.296162
3,W28Qeeq7EU,-0.912075,0.036775
4,18YrqlnzKe,-0.62723,2.178866


# #08: Memeriksa kesamaan antar kolom (series) pada data frame

In [37]:
# Persiapan data frame
data  = {'A':[15, 15, 18, np.nan, 12],'B':[15, 15, 18, np.nan, 12]}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,15.0,15.0
1,15.0,15.0
2,18.0,18.0
3,,
4,12.0,12.0


In [38]:
# Mengenal pandas series
# data frame merupakan kumpulan dari data series
df['A']

0    15.0
1    15.0
2    18.0
3     NaN
4    12.0
Name: A, dtype: float64

In [39]:
type(df['A'])

pandas.core.series.Series

In [40]:
type(df)

pandas.core.frame.DataFrame

In [41]:
# Check equality element-wise with the == operator.
# Not recommended when the data may contain NaN: NaN == NaN evaluates to False,
# so the row where both columns hold NaN is reported as unequal.
df['A'] == df['B']

0     True
1     True
2     True
3    False
4     True
dtype: bool

In [42]:
# Memeriksa kesamaan dengan method equals() | Lebih direkomendasikan
df['A'].equals(df['B'])

True

In [43]:
# Memeriksa kesamaan antar dua data frame
df1 = df.copy(deep=True)
df.equals(df1)

True

In [44]:
df == df1

Unnamed: 0,A,B
0,True,True
1,True,True
2,True,True
3,False,False
4,True,True


# #09: Membagi data frame menjadi dua secara acak

In [45]:
# Persiapan data frame
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1,20,size=(n_rows, n_cols)), columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,19,17,11,6,8
1,17,2,1,19,5
2,11,5,11,16,6
3,3,4,5,3,1
4,6,17,17,5,6
5,17,4,6,13,4
6,15,4,13,17,18
7,10,3,10,11,2
8,10,2,1,10,5
9,10,8,2,16,7


In [46]:
# Membagi data frame menjadi dua secara acak berdasarkan proporsi tertentu
df.shape

(10, 5)

In [47]:
proporsi = 0.7
df_1 = df.sample(frac=proporsi)
df_2 = df.drop(df_1.index)

print(f'df_1 Shape: {df_1.shape}')
print(f'df_2 Shape: {df_2.shape}')

df_1 Shape: (7, 5)
df_2 Shape: (3, 5)


In [48]:
df_1

Unnamed: 0,A,B,C,D,E
4,6,17,17,5,6
8,10,2,1,10,5
0,19,17,11,6,8
9,10,8,2,16,7
3,3,4,5,3,1
2,11,5,11,16,6
5,17,4,6,13,4


In [49]:
df_2

Unnamed: 0,A,B,C,D,E
1,17,2,1,19,5
6,15,4,13,17,18
7,10,3,10,11,2


# #10: Mengganti nama (label) kolom pada data frame berdasarkan pola


In [50]:
# Persiapan data frame
df = pd.read_csv('./Dataset/titanicfull.csv')
df.columns = ['Pclass','Survival status','full Name','Sex ',' Age','Sib SP', 'parch','Ticket','Fare','Cabin','Embarked']
df_backup = df.copy(deep=True)
df.head()

Unnamed: 0,Pclass,Survival status,full Name,Sex,Age,Sib SP,parch,Ticket,Fare,Cabin,Embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [51]:
# Menggunakan lowercase untuk nama kolom dan mengganti spasi dengan _
df.columns = df.columns.str.replace(' ','_').str.lower()
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex_,_age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [52]:
# Trim surplus whitespace from the column names.
# Start again from the backup so the original (untidy) labels are restored first.
df = df_backup.copy(deep=True)
# strip() must run before replace(): otherwise leading/trailing spaces would be
# turned into underscores (e.g. 'sex_' and '_age') instead of being removed.
df.columns = df.columns.str.lower().str.strip().str.replace(' ','_') 
# Remarks:
# str.lower()   = convert to lowercase
# str.strip()   = remove surplus whitespace at the start and end of the label
# str.replace() = replace a substring (here: every space with an underscore)
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex,age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


# #11: Seleksi kolom dan baris pada data frame menggunakan loc

In [53]:
# Persiapan data frame
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1,20,size=(n_rows, n_cols)), columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,12,6,13,11,11
1,18,14,3,8,12
2,3,5,16,3,10
3,14,19,3,8,18
4,16,8,2,10,5
5,14,3,1,10,4
6,14,14,7,9,18
7,17,5,18,15,6
8,1,7,1,2,16
9,15,19,1,3,17


In [54]:
# Seleksi kolom dan baris menggunakan loc
df.loc[[0,3,4],['B','E']] #bagian pertama untuk baris, bagian kedua untuk kolom


Unnamed: 0,B,E
0,6,11
3,19,18
4,8,5


In [55]:
# Seleksi baris dengan kondisi
df.loc[df['B']>10,['B','D','E']]

Unnamed: 0,B,D,E
1,14,8,12
3,19,8,18
6,14,9,18
9,19,3,17


In [56]:
# Slicing a data frame with loc.
# With loc both the start and the end label of a slice are inclusive, so rows 0
# through 4 and columns 'B' through 'D' are all returned. This differs from
# Python list slicing, where only the start index is inclusive.
df.loc[0:4, 'B':'D']

Unnamed: 0,B,C,D
0,6,13,11
1,14,3,8
2,5,16,3
3,19,3,8
4,8,2,10


# #12: Membentuk kolom bertipe datetime dari beberapa kolom lain pada pandas data frame

In [57]:
# Prepare a data frame whose date parts live in three separate integer columns.
data = dict(
    day=[1, 2, 10, 25, 12],
    month=[1, 2, 4, 5, 6],
    year=[2000, 2001, 2010, 2015, 2020],
)
df = pd.DataFrame(data)
df

Unnamed: 0,day,month,year
0,1,1,2000
1,2,2,2001
2,10,4,2010
3,25,5,2015
4,12,6,2020


In [58]:
# Membentuk kolom bertipe datetime
df['penanggalan'] = pd.to_datetime(df[['day','month','year']])
df

Unnamed: 0,day,month,year,penanggalan
0,1,1,2000,2000-01-01
1,2,2,2001,2001-02-02
2,10,4,2010,2010-04-10
3,25,5,2015,2015-05-25
4,12,6,2020,2020-06-12


In [59]:
df.dtypes

day                     int64
month                   int64
year                    int64
penanggalan    datetime64[ns]
dtype: object

# #13: Konversi nilai numerik ke dalam kategori pada data frame

In [60]:
# Persiapan data frame
n_rows = 10
n_cols = 1
cols = ('usia',)

df = pd.DataFrame(np.random.randint(1,99, size=(n_rows,n_cols)),columns=cols)
df

Unnamed: 0,usia
0,4
1,4
2,54
3,73
4,51
5,78
6,89
7,35
8,3
9,5


In [61]:
# Pengelompokan nilai numerik ke dalam beberapa kategori menggunakan cut()
df['kelompok_usia'] = pd.cut(df['usia'], bins=[0,18,65,99], labels=['anak','dewasa','manula'])
df

Unnamed: 0,usia,kelompok_usia
0,4,anak
1,4,anak
2,54,dewasa
3,73,manula
4,51,dewasa
5,78,manula
6,89,manula
7,35,dewasa
8,3,anak
9,5,anak


# #14: Menggabungkan (merge) dua pandas data frame

In [62]:
# Persiapan data frame
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1,20,size=(n_rows,n_cols)),columns=cols)
df.head()

Unnamed: 0,A,B,C,D,E
0,3,1,19,19,16
1,18,5,7,14,11
2,3,14,16,8,13
3,2,12,8,5,13
4,3,11,1,14,7


In [63]:
df1 = df.copy(deep=True)
df1 = df1.drop([1,4])
df1

Unnamed: 0,A,B,C,D,E
0,3,1,19,19,16
2,3,14,16,8,13
3,2,12,8,5,13


In [64]:
df2 = df.copy(deep=True)
df2 = df2.drop([0,3])
df2

Unnamed: 0,A,B,C,D,E
1,18,5,7,14,11
2,3,14,16,8,13
4,3,11,1,14,7


In [65]:
# Menggabungkan dua data frame
df_inner = pd.merge(df1, df2, how='inner')
df_inner #Menghasilkan index baru

Unnamed: 0,A,B,C,D,E
0,3,14,16,8,13


In [66]:
df_outer = pd.merge(df1, df2, how='outer')
df_outer # nilai duplikasi akan di hilangkan, mostly used in actual case

Unnamed: 0,A,B,C,D,E
0,3,1,19,19,16
1,3,14,16,8,13
2,2,12,8,5,13
3,18,5,7,14,11
4,3,11,1,14,7


# #15: Memecah nilai string suatu kolom ke dalam beberapa kolom baru pada pandas data frame

In [67]:
# Persiapan data frame
data = {'nama':['Didi Kempot', 'Glen Fredly','Mbah Surip'], 'tempat_kelahiran':['Surakarta, jawa Tengah','Jakarta, DKI Jakarta','Mojokerto, Jawa Timur']}
df = pd.DataFrame(data)
df

Unnamed: 0,nama,tempat_kelahiran
0,Didi Kempot,"Surakarta, jawa Tengah"
1,Glen Fredly,"Jakarta, DKI Jakarta"
2,Mbah Surip,"Mojokerto, Jawa Timur"


In [68]:
# Split the full name into a first name and a last name.
# n=1 splits at the first space only, so a name with more than two words still
# yields exactly two pieces (first word / everything else) instead of producing
# extra columns that cannot be assigned to the two target columns.
df[['nama_depan', 'nama_belakang']] = df['nama'].str.split(' ', n=1, expand=True)
df

Unnamed: 0,nama,tempat_kelahiran,nama_depan,nama_belakang
0,Didi Kempot,"Surakarta, jawa Tengah",Didi,Kempot
1,Glen Fredly,"Jakarta, DKI Jakarta",Glen,Fredly
2,Mbah Surip,"Mojokerto, Jawa Timur",Mbah,Surip


In [69]:
# Split the birthplace into city and province.
# The raw values look like "City, Province", so splitting on ',' alone leaves a
# leading space in the province part; each piece is stripped so the new columns
# hold clean text. n=1 splits at the first comma only, giving exactly two pieces.
parts = df['tempat_kelahiran'].str.split(',', n=1, expand=True)
df[['kota', 'propinsi']] = parts.apply(lambda column: column.str.strip())
df

Unnamed: 0,nama,tempat_kelahiran,nama_depan,nama_belakang,kota,propinsi
0,Didi Kempot,"Surakarta, jawa Tengah",Didi,Kempot,Surakarta,jawa Tengah
1,Glen Fredly,"Jakarta, DKI Jakarta",Glen,Fredly,Jakarta,DKI Jakarta
2,Mbah Surip,"Mojokerto, Jawa Timur",Mbah,Surip,Mojokerto,Jawa Timur


# #16: Menata ulang data frame dengan multiple indexes menggunakan unstack()

In [70]:
# Persiapan data frame
df = pd.read_csv('./Dataset/titanicfull.csv')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [71]:
# Data frame dengan multiple indexes dari hasil grouping
df.groupby(['sex','pclass'])['survived'].mean().to_frame() #to_frame() digunakan untuk mempercantik tampilan dalam hal ini tabel

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,pclass,Unnamed: 2_level_1
female,1,0.965278
female,2,0.886792
female,3,0.490741
male,1,0.340782
male,2,0.146199
male,3,0.15213


In [72]:
# Menata ulang data frame dengan multiple indexes
df.groupby(['sex','pclass'])['survived'].mean().unstack()

pclass,1,2,3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.965278,0.886792,0.490741
male,0.340782,0.146199,0.15213


# #17: Resampling pada data deret waktu (time series data)

In [73]:
# Prepare a data frame covering one year of hourly observations.
n_rows = 365 * 24  # 365 days x 24 hours per day
n_cols = 2
cols = ['col1', 'col2']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), columns=cols)
# Hourly DatetimeIndex starting at 2000-01-01, built with the public
# pd.date_range API instead of the deprecated pd.util.testing.makeDateIndex.
# An offset object is used for the frequency so the code does not depend on the
# spelling of the hourly alias string, which differs between pandas versions.
df.index = pd.date_range('2000-01-01', periods=n_rows, freq=pd.offsets.Hour())
df

Unnamed: 0,col1,col2
2000-01-01 00:00:00,14,19
2000-01-01 01:00:00,11,12
2000-01-01 02:00:00,14,15
2000-01-01 03:00:00,14,9
2000-01-01 04:00:00,7,8
...,...,...
2000-12-30 19:00:00,3,2
2000-12-30 20:00:00,5,14
2000-12-30 21:00:00,5,14
2000-12-30 22:00:00,19,15


In [74]:
# Resampling data dengan interval monthly
df.resample('M')['col1'].sum().to_frame()

Unnamed: 0,col1
2000-01-31,7383
2000-02-29,6820
2000-03-31,7414
2000-04-30,7355
2000-05-31,7563
2000-06-30,7098
2000-07-31,7635
2000-08-31,7497
2000-09-30,6953
2000-10-31,7568


In [75]:
# Resampling data dengan interval daily
df.resample('D')['col1'].sum().to_frame()

Unnamed: 0,col1
2000-01-01,247
2000-01-02,253
2000-01-03,226
2000-01-04,245
2000-01-05,242
...,...
2000-12-26,259
2000-12-27,192
2000-12-28,242
2000-12-29,245


# #18: Membentuk dummy data frame

In [76]:
# Membentuk data frame dari dictionary
pd.DataFrame({'col1':[1,2,3,4],'col2':[5,6,7,8]})

Unnamed: 0,col1,col2
0,1,5
1,2,6
2,3,7
3,4,8


In [77]:
# Membentuk data frame dari numpy array
n_rows = 5
n_cols = 3

arr = np.random.randint(1,20,size=(n_rows,n_cols))
arr

array([[10, 10, 15],
       [16, 13,  7],
       [11, 12, 11],
       [ 3, 17, 11],
       [ 9, 17,  7]])

In [78]:
pd.DataFrame(arr,columns=tuple('ABC'))

Unnamed: 0,A,B,C
0,10,10,15
1,16,13,7
2,11,12,11
3,3,17,11
4,9,17,7


In [79]:
# Membentuk data frame dengan memanfaatkan pandas.util.testing
pd.util.testing.makeDataFrame().head()

Unnamed: 0,A,B,C,D
5CyexhKihe,0.581362,-0.460004,1.125214,0.884705
jcZqJCeofP,-1.115773,1.302396,-0.476227,0.608633
Y0OUkWAsiD,-0.027545,0.136528,-1.206168,0.33216
lEC0SOqMQ7,-0.349655,0.491612,-0.812306,-0.66354
zgSSv8Ilyc,-0.699464,-0.438707,-0.318839,-1.309258


In [80]:
pd.util.testing.makeMixedDataFrame().head()

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [81]:
pd.util.testing.makeTimeDataFrame().head()

Unnamed: 0,A,B,C,D
2000-01-03,2.029836,-1.466003,1.500546,-0.689401
2000-01-04,0.439874,-1.132,-0.977011,-0.525251
2000-01-05,-1.165998,-1.46288,-0.552828,-0.164638
2000-01-06,-0.259166,-0.048714,0.631775,0.37321
2000-01-07,-0.635115,-1.243837,0.134522,-3.473849


In [82]:
pd.util.testing.makeMissingDataframe().head()

Unnamed: 0,A,B,C,D
AgZzM8p9tB,-0.385547,-0.123686,0.209135,
tGEMjvm8S1,-0.500325,-0.535875,1.558429,-0.136733
Dzx20KTtZq,-1.99597,0.320194,-0.667617,2.296852
P79i4NEkwQ,-0.582263,0.660437,-1.102396,0.730409
0sou0UYCWX,-0.691797,-0.145759,-0.098467,


# #19: Formatting tampilan data frame

In [83]:
# Persiapan data frame
n_rows = 5
n_cols = 2
cols = ['omset','operational']

df = pd.DataFrame(np.random.randint(1,20,size=(n_rows,n_cols)), columns=cols)
df

Unnamed: 0,omset,operational
0,5,17
1,7,2
2,1,2
3,10,5
4,9,9


In [84]:
df['omset'] = df['omset']*100_000
df['operational'] = df['operational']*10_000
df

Unnamed: 0,omset,operational
0,500000,170000
1,700000,20000
2,100000,20000
3,1000000,50000
4,900000,90000


In [85]:
# Attach one consecutive daily date (starting 2000-01-01) to every row, then
# expose the dates as a regular column named 'tanggal'.
# pd.date_range is the public replacement for the deprecated
# pd.util.testing.makeDateIndex helper; len(df) keeps the index length in sync
# with the frame regardless of how it was built.
df.index = pd.date_range('2000-01-01', periods=len(df), freq='D')
df = df.reset_index()
df = df.rename(columns={'index': 'tanggal'})
df

Unnamed: 0,tanggal,omset,operational
0,2000-01-01,500000,170000
1,2000-01-02,700000,20000
2,2000-01-03,100000,20000
3,2000-01-04,1000000,50000
4,2000-01-05,900000,90000


In [86]:
# Melakukan formatting tampilan data frame
formatku = {'tanggal':'{:%d/%m/%y}','operational':'Rp. {:.2f}','omset':'Rp. {:.2f}'}
laporan = df.style.format(formatku)
laporan

Unnamed: 0,tanggal,omset,operational
0,01/01/00,Rp. 500000.00,Rp. 170000.00
1,02/01/00,Rp. 700000.00,Rp. 20000.00
2,03/01/00,Rp. 100000.00,Rp. 20000.00
3,04/01/00,Rp. 1000000.00,Rp. 50000.00
4,05/01/00,Rp. 900000.00,Rp. 90000.00


In [87]:
type(laporan) #type laporn bukan object dataframe tapi styler


pandas.io.formats.style.Styler

In [88]:
laporan.hide_index()

tanggal,omset,operational
01/01/00,Rp. 500000.00,Rp. 170000.00
02/01/00,Rp. 700000.00,Rp. 20000.00
03/01/00,Rp. 100000.00,Rp. 20000.00
04/01/00,Rp. 1000000.00,Rp. 50000.00
05/01/00,Rp. 900000.00,Rp. 90000.00


In [89]:
laporan.set_caption('Data omset dan operational')

tanggal,omset,operational
01/01/00,Rp. 500000.00,Rp. 170000.00
02/01/00,Rp. 700000.00,Rp. 20000.00
03/01/00,Rp. 100000.00,Rp. 20000.00
04/01/00,Rp. 1000000.00,Rp. 50000.00
05/01/00,Rp. 900000.00,Rp. 90000.00


In [90]:
laporan.highlight_min('omset', color='pink')
laporan.highlight_max('omset', color='lightgreen')

laporan.highlight_min('operational', color='lightblue')
laporan.highlight_max('operational', color='grey')

tanggal,omset,operational
01/01/00,Rp. 500000.00,Rp. 170000.00
02/01/00,Rp. 700000.00,Rp. 20000.00
03/01/00,Rp. 100000.00,Rp. 20000.00
04/01/00,Rp. 1000000.00,Rp. 50000.00
05/01/00,Rp. 900000.00,Rp. 90000.00


# #20: Menggabungkan (merge) dua data frame secara berdampingan

In [91]:
# Persiapan data frame
d1 = {'col1':[1,2,3],'col2':[10,20,30]}
df1 = pd.DataFrame(d1)
df1

Unnamed: 0,col1,col2
0,1,10
1,2,20
2,3,30


In [92]:
d2 = {'col3':[4,5,6],'col4':[40,50,60]}
df2 = pd.DataFrame(d2)
df2

Unnamed: 0,col3,col4
0,4,40
1,5,50
2,6,60


In [93]:
# Menggabungkan (merge) dua data frame secara berdampingan
df = pd.merge(df1,df2,left_index=True,right_index=True)
df

Unnamed: 0,col1,col2,col3,col4
0,1,10,4,40
1,2,20,5,50
2,3,30,6,60


# #21: Agregasi pada pandas data frame dengan agg()

In [94]:
# Persiapan data frame
df = pd.read_csv('./Dataset/Iris.csv')
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [95]:
# Mengenal groupby() dan fungsi agregasi
df.groupby('Species')['PetalLengthCm'].count().to_frame()

Unnamed: 0_level_0,PetalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,50
Iris-versicolor,50
Iris-virginica,50


In [96]:
df.groupby('Species')['PetalLengthCm'].mean().to_frame()

Unnamed: 0_level_0,PetalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,1.464
Iris-versicolor,4.26
Iris-virginica,5.552


In [97]:
df.groupby('Species')['PetalLengthCm'].median().to_frame()

Unnamed: 0_level_0,PetalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,1.5
Iris-versicolor,4.35
Iris-virginica,5.55


In [98]:
# Agregasi dengan agg()
df.groupby('Species')['PetalLengthCm'].agg(['count','mean','median'])

Unnamed: 0_level_0,count,mean,median
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,50,1.464,1.5
Iris-versicolor,50,4.26,4.35
Iris-virginica,50,5.552,5.55


In [99]:
# Agregasi dengan describe
df.groupby('Species')['PetalLengthCm'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Iris-setosa,50.0,1.464,0.173511,1.0,1.4,1.5,1.575,1.9
Iris-versicolor,50.0,4.26,0.469911,3.0,4.0,4.35,4.6,5.1
Iris-virginica,50.0,5.552,0.551895,4.5,5.1,5.55,5.875,6.9


# #22: Memantau penggunaan memory pada data frame

In [100]:
# Persiapan data frame
df_titanic = pd.read_csv('./Dataset/titanicfull.csv')
df_titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [101]:
df_iris = pd.read_csv('./Dataset/Iris.csv')
df_iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [102]:
# Memantau penggunaan memory suatu data frame
df_titanic.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   name      1309 non-null   object 
 3   sex       1309 non-null   object 
 4   age       1046 non-null   float64
 5   sibsp     1309 non-null   int64  
 6   parch     1309 non-null   int64  
 7   ticket    1309 non-null   object 
 8   fare      1308 non-null   float64
 9   cabin     295 non-null    object 
 10  embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 452.7 KB


In [103]:
df_iris.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 16.3 KB


In [104]:
# Memantau penggunaan memory untuk setiap kolom dari suatu data frame
df_titanic.memory_usage(deep=True) #Satuan yang digunakan adalah byte

Index          128
pclass       10472
survived     10472
name        110127
sex          80781
age          10472
sibsp        10472
parch        10472
ticket       83502
fare         10472
cabin        50366
embarked     75870
dtype: int64

In [105]:
df_iris.memory_usage(deep=True)

Index              128
Id                1200
SepalLengthCm     1200
SepalWidthCm      1200
PetalLengthCm     1200
PetalWidthCm      1200
Species          10550
dtype: int64

# #23: Seleksi baris pada data frame dengan query()

In [106]:
# Persiapan data frame
d = {'kolom_satu': [1,2,3,4,5],'kolom dua': [10,20,30,40,50]}
df = pd.DataFrame(d)
df

Unnamed: 0,kolom_satu,kolom dua
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


In [107]:
# Seleksi baris dengan query()
df.query('kolom_satu > 2')

Unnamed: 0,kolom_satu,kolom dua
2,3,30
3,4,40
4,5,50


In [108]:
df.query('`kolom dua` > 30')

Unnamed: 0,kolom_satu,kolom dua
3,4,40
4,5,50


# #24: UTC dan konversi zona waktu (time zone) pada python pandas

In [109]:
# Prepare a series of Unix epoch timestamps (in seconds), one value per hour,
# and convert it to timezone-naive datetimes.
epoch_start, epoch_stop, step_seconds = 1591683521, 1592201921, 3600
s = pd.to_datetime(pd.Series(range(epoch_start, epoch_stop, step_seconds)), unit='s')
s.head()

0   2020-06-09 06:18:41
1   2020-06-09 07:18:41
2   2020-06-09 08:18:41
3   2020-06-09 09:18:41
4   2020-06-09 10:18:41
dtype: datetime64[ns]

## [Epoch/Unix Time](https://en.wikipedia.org/wiki/Unix_time)
## [Epoch Time Converter](https://www.epochconverter.com/)

In [110]:
# Pengaturan zona waktu (time zone)
s = s.dt.tz_localize('UTC')
s.head()

0   2020-06-09 06:18:41+00:00
1   2020-06-09 07:18:41+00:00
2   2020-06-09 08:18:41+00:00
3   2020-06-09 09:18:41+00:00
4   2020-06-09 10:18:41+00:00
dtype: datetime64[ns, UTC]

In [111]:
s = s.dt.tz_convert('Asia/Jakarta')
s.head()

0   2020-06-09 13:18:41+07:00
1   2020-06-09 14:18:41+07:00
2   2020-06-09 15:18:41+07:00
3   2020-06-09 16:18:41+07:00
4   2020-06-09 17:18:41+07:00
dtype: datetime64[ns, Asia/Jakarta]

In [112]:
s = s.dt.tz_convert('Australia/Hobart')
s.head()

0   2020-06-09 16:18:41+10:00
1   2020-06-09 17:18:41+10:00
2   2020-06-09 18:18:41+10:00
3   2020-06-09 19:18:41+10:00
4   2020-06-09 20:18:41+10:00
dtype: datetime64[ns, Australia/Hobart]

# #25: Pengaturan tampilan (display option) pada python pandas

In [113]:
# Persiapan data frame
df = pd.read_csv('./Dataset/titanicfull.csv')
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.00,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.5500,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.00,1,2,113781,151.5500,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.00,1,2,113781,151.5500,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.00,1,2,113781,151.5500,C22 C26,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.50,1,0,2665,14.4542,,C
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.50,0,0,2656,7.2250,,C
1307,3,0,"Zakarian, Mr. Ortin",male,27.00,0,0,2670,7.2250,,C


In [114]:
# Pengaturan tampilan
pd.set_option('display.max_rows', 5) # menentukan jumlah baris maksimal
pd.set_option('display.max_columns', 6) # menentukan jumlah kolom maksimal
pd.set_option('display.max_colwidth', 20) # maksimal lebar kolom adalah 20 karakter

df

Unnamed: 0,pclass,survived,name,...,fare,cabin,embarked
0,1,1,"Allen, Miss. Eli...",...,211.3375,B5,S
1,1,1,"Allison, Master....",...,151.5500,C22 C26,S
...,...,...,...,...,...,...,...
1307,3,0,"Zakarian, Mr. Ortin",...,7.2250,,C
1308,3,0,"Zimmerman, Mr. Leo",...,7.8750,,S


In [115]:
pd.reset_option('^display.', silent=True) #reset and back to normal view
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.00,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.5500,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.00,1,2,113781,151.5500,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.00,1,2,113781,151.5500,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.00,1,2,113781,151.5500,C22 C26,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.50,1,0,2665,14.4542,,C
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.50,0,0,2656,7.2250,,C
1307,3,0,"Zakarian, Mr. Ortin",male,27.00,0,0,2670,7.2250,,C


In [116]:
pd.describe_option() # untuk melihat pengaturan dalam tampilan pandas

compute.use_bottleneck : bool
    Use the bottleneck library to accelerate if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]
compute.use_numba : bool
    Use the numba engine option for select operations if it is installed,
    the default is False
    Valid values: False,True
    [default: False] [currently: False]
compute.use_numexpr : bool
    Use the numexpr library to accelerate computation if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]
display.chop_threshold : float or None
    if set to a float value, all float values smaller then the given threshold
    will be displayed as exactly 0 by repr and friends.
    [default: None] [currently: None]
display.colheader_justify : 'left'/'right'
    Controls the justification of column headers. used by DataFrameFormatter.
    [default: right] [currently: right]
display.column_space No description available.
    [defa

# #26: Membuat data frame dari hasil seleksi spreadsheet

In [131]:
# Membuat data frame dari hasil seleksi spreadsheet
# Kode dibuat menjadi komentar karena harus ada data yang disimpan ke dalam clipboard
# df = pd.read_clipboard()
# df

# #27: Mengenal fungsi agregasi first() dan last()

In [132]:
# Persiapan data frame
d = {'dokter': ['Budi','Wati','Iwan','Budi','Budi','Wati'], 'pasien':['Abdul','Rahmat','Asep','Joko','Wiwin','Lisa']}
df = pd.DataFrame(d)
df

Unnamed: 0,dokter,pasien
0,Budi,Abdul
1,Wati,Rahmat
2,Iwan,Asep
3,Budi,Joko
4,Budi,Wiwin
5,Wati,Lisa


In [133]:
# Mengenal fungsi agregasi first() dan last()
df.groupby('dokter')['pasien'].count().to_frame()

Unnamed: 0_level_0,pasien
dokter,Unnamed: 1_level_1
Budi,3
Iwan,1
Wati,2


In [134]:
df.groupby('dokter')['pasien'].first().to_frame()

Unnamed: 0_level_0,pasien
dokter,Unnamed: 1_level_1
Budi,Abdul
Iwan,Asep
Wati,Rahmat


In [135]:
df.groupby('dokter')['pasien'].last().to_frame()

Unnamed: 0_level_0,pasien
dokter,Unnamed: 1_level_1
Budi,Wiwin
Iwan,Asep
Wati,Lisa


# #28: Mengenal explode and implode list pada data frame

In [136]:
# Prepare a data frame in which every 'Heroes' cell holds a list of names.
dc_heroes = ['Batman', 'Superman', 'Wonder Woman', 'Aquamen', 'Green Lantern', 'Shazam']
marvel_heroes = ['Iron Man', 'Captain America', 'Ant-man', 'Black Panther', 'Captain Marvel']
d = {'Team': ['DC', 'Marvel'], 'Heroes': [dc_heroes, marvel_heroes]}
df = pd.DataFrame(d)
df

Unnamed: 0,Team,Heroes
0,DC,"[Batman, Superman, Wonder Woman, Aquamen, Gree..."
1,Marvel,"[Iron Man, Captain America, Ant-man, Black Pan..."


In [137]:
# Explode ( memisahkan data list)
df1 = df.explode('Heroes')
df1

Unnamed: 0,Team,Heroes
0,DC,Batman
0,DC,Superman
0,DC,Wonder Woman
0,DC,Aquamen
0,DC,Green Lantern
0,DC,Shazam
1,Marvel,Iron Man
1,Marvel,Captain America
1,Marvel,Ant-man
1,Marvel,Black Panther


In [138]:
# Implode (menyatukan kembali ke dalam list)
d = {'Team':['DC','Marvel']}
df2 = pd.DataFrame(d)
df2

Unnamed: 0,Team
0,DC
1,Marvel


In [139]:
df2['Imploded'] = df1.groupby(df1.index)['Heroes'].agg(list)
df2

Unnamed: 0,Team,Imploded
0,DC,"[Batman, Superman, Wonder Woman, Aquamen, Gree..."
1,Marvel,"[Iron Man, Captain America, Ant-man, Black Pan..."


# #29: 