# Belajar pandas

## #01: Menyertakan prefix dan suffix pada kolom data frame

In [420]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.3.3
1.21.2


In [421]:
# Prepare a 5x5 data frame of random integers in [1, 10) with columns A-E.
n_rows = n_cols = 5
cols = ('A', 'B', 'C', 'D', 'E')

df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,7,1,7,5,7
1,6,4,9,6,1
2,2,7,8,2,2
3,6,7,6,3,4
4,3,4,9,6,9


In [422]:
# Calling tuple() on a string yields one element per character.
tuple('ABCDE')

('A', 'B', 'C', 'D', 'E')

In [423]:
# Add a prefix to every column label (the result is only displayed; df is not reassigned)
df.add_prefix('kolom_')

Unnamed: 0,kolom_A,kolom_B,kolom_C,kolom_D,kolom_E
0,7,1,7,5,7
1,6,4,9,6,1
2,2,7,8,2,2
3,6,7,6,3,4
4,3,4,9,6,9


In [424]:
# Add a suffix to every column label (the result is only displayed; df is not reassigned)
df.add_suffix('_field')

Unnamed: 0,A_field,B_field,C_field,D_field,E_field
0,7,1,7,5,7
1,6,4,9,6,1
2,2,7,8,2,2
3,6,7,6,3,4
4,3,4,9,6,9


## #02: Pemilihan baris (rows selection) pada data frame

In [425]:
# Prepare a 10x5 data frame of random integers in [1, 5) with columns A-E.
n_rows, n_cols = 10, 5
cols = tuple('ABCDE')

df = pd.DataFrame(
    np.random.randint(low=1, high=5, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,4,1,3,4,1
1,3,4,4,2,2
2,3,4,1,1,3
3,3,3,1,1,4
4,1,1,3,2,3
5,4,3,1,1,3
6,1,1,2,1,2
7,2,3,1,2,3
8,1,3,2,4,1
9,3,1,4,3,1


In [426]:
# Row selection with the logical operator | (or): keep rows where column A is 1 or 3.
# Each comparison is parenthesised before the two boolean masks are combined.
df[(df['A'] == 1) | (df['A'] == 3)]

Unnamed: 0,A,B,C,D,E
1,3,4,4,2,2
2,3,4,1,1,3
3,3,3,1,1,4
4,1,1,3,2,3
6,1,1,2,1,2
8,1,3,2,4,1
9,3,1,4,3,1


In [427]:
# Row selection with isin(): keep rows whose value in column A is one of the listed values
df[df['A'].isin([1,3])]

Unnamed: 0,A,B,C,D,E
1,3,4,4,2,2
2,3,4,1,1,3
3,3,3,1,1,4
4,1,1,3,2,3
6,1,1,2,1,2
8,1,3,2,4,1
9,3,1,4,3,1


In [428]:
# The negation operator ~ inverts the boolean mask: keep rows where A is neither 1 nor 3
df[~df['A'].isin([1,3])]

Unnamed: 0,A,B,C,D,E
0,4,1,3,4,1
5,4,3,1,1,3
7,2,3,1,2,3


## #03: Konversi tipe data string ke numerik pada kolom data frame

In [429]:
# Prepare a data frame whose columns hold numbers stored as strings;
# col1 also contains one value ('teks') that is not numeric.
data = dict(
    col1=['1', '2', '3', 'teks'],
    col2=['1', '2', '3', '4'],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [430]:
# Inspect the dtype of each column (both columns were built from strings)
df.dtypes

col1    object
col2    object
dtype: object

In [431]:
# Convert dtypes with astype(): the mapping casts only col2 to int64.
# col1 is left out of the mapping; it contains the non-numeric value 'teks'.
df_x = df.astype({'col2':'int64'})
df_x

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [432]:
# Check the dtypes of the converted copy
df_x.dtypes

col1    object
col2     int64
dtype: object

In [433]:
# Convert every column to a numeric dtype with to_numeric();
# errors='coerce' turns values that cannot be parsed (here 'teks') into NaN
df.apply(pd.to_numeric, errors ='coerce')

Unnamed: 0,col1,col2
0,1.0,1
1,2.0,2
2,3.0,3
3,,4


## #04: Pemilihan kolom (columns selection) pada pandas data frame berdasarkan tipe data

In [434]:
# Prepare a data frame that mixes datetime, float, integer and text columns.
n_rows = 5
n_cols = 2
cols = ['bil_pecahan', 'bil_bulat']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), columns=cols)
df['bil_pecahan'] = df['bil_pecahan'].astype('float')

# pd.util.testing.makeDateIndex is deprecated since pandas 1.0 and removed in 2.0.
# pd.date_range builds the same hourly index starting at 2000-01-01 through the public
# API; a Timedelta frequency avoids the 'H' alias, which newer pandas deprecates.
df.index = pd.date_range(start='2000-01-01', periods=n_rows, freq=pd.Timedelta(hours=1))
df = df.reset_index()  # move the datetime index into a regular column named 'index'

df['teks'] = list('ABCDE')

df

Unnamed: 0,index,bil_pecahan,bil_bulat,teks
0,2000-01-01 00:00:00,16.0,13,A
1,2000-01-01 01:00:00,5.0,9,B
2,2000-01-01 02:00:00,5.0,10,C
3,2000-01-01 03:00:00,11.0,9,D
4,2000-01-01 04:00:00,13.0,17,E


In [435]:
# Inspect the dtype of each column before selecting by dtype
df.dtypes

index          datetime64[ns]
bil_pecahan           float64
bil_bulat               int32
teks                   object
dtype: object

In [436]:
# Select the columns with a numeric dtype (both the float and the integer column)
df.select_dtypes(include='number')

Unnamed: 0,bil_pecahan,bil_bulat
0,16.0,13
1,5.0,9
2,5.0,10
3,11.0,9
4,13.0,17


In [437]:
# Select only the floating-point columns
df.select_dtypes(include='float')

Unnamed: 0,bil_pecahan
0,16.0
1,5.0
2,5.0
3,11.0
4,13.0


In [438]:
# Select only the integer columns
# NOTE(review): 'int' presumably matches just the platform's default integer width
# (int32 in the run recorded here); include='integer' should cover every width - confirm.
df.select_dtypes(include='int')

Unnamed: 0,bil_bulat
0,13
1,9
2,10
3,9
4,17


In [439]:
# Select the columns holding strings, i.e. the object dtype
df.select_dtypes(include='object')

Unnamed: 0,teks
0,A
1,B
2,C
3,D
4,E


In [440]:
# Select the columns with a datetime dtype
df.select_dtypes(include='datetime')

Unnamed: 0,index
0,2000-01-01 00:00:00
1,2000-01-01 01:00:00
2,2000-01-01 02:00:00
3,2000-01-01 03:00:00
4,2000-01-01 04:00:00


In [441]:
# Select columns matching any of several dtypes by passing a list
df.select_dtypes(include=['number','object'])

Unnamed: 0,bil_pecahan,bil_bulat,teks
0,16.0,13,A
1,5.0,9,B
2,5.0,10,C
3,11.0,9,D
4,13.0,17,E


## #05: Membalik urutan baris dan kolom pada data frame

In [442]:
# Prepare a 5x5 data frame of random integers in [1, 10) with columns A-E.
n_rows, n_cols = 5, 5
cols = tuple('ABCDE')

df = pd.DataFrame(
    columns=cols,
    data=np.random.randint(1, 10, size=(n_rows, n_cols)),
)
df

Unnamed: 0,A,B,C,D,E
0,2,5,3,9,1
1,5,8,7,2,7
2,8,9,1,1,6
3,6,9,2,5,7
4,1,8,1,2,7


In [443]:
# Reverse the column order: keep all rows (:) and step through the columns backwards (::-1)
df.loc[:, ::-1]

Unnamed: 0,E,D,C,B,A
0,1,9,3,5,2
1,7,2,7,8,5
2,6,1,1,9,8
3,7,5,2,9,6
4,7,2,1,8,1


In [444]:
# Reverse the row order; the original index labels travel with their rows
df.loc[::-1]

Unnamed: 0,A,B,C,D,E
4,1,8,1,2,7
3,6,9,2,5,7
2,8,9,1,1,6
1,5,8,7,2,7
0,2,5,3,9,1


In [445]:
# Reverse the row order and renumber the index from 0;
# drop=True discards the old index instead of keeping it as a column
df.loc[::-1].reset_index(drop=True)

Unnamed: 0,A,B,C,D,E
0,1,8,1,2,7
1,6,9,2,5,7
2,8,9,1,1,6
3,5,8,7,2,7
4,2,5,3,9,1


## #06: Mengganti nama (label) kolom pada data frame

In [446]:
# Prepare a 5x5 data frame of random integers in [1, 10) with columns A-E.
n_rows = 5
n_cols = 5
cols = tuple(letter for letter in 'ABCDE')

df = pd.DataFrame(
    np.random.randint(1, 10, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,9,9,4,5,9
1,4,7,2,5,8
2,6,6,3,6,2
3,9,8,5,8,8
4,6,8,5,5,8


In [447]:
# Rename a single column by mapping its old label to the new one
df.rename(columns={'C' : 'Hobi'})

Unnamed: 0,A,B,Hobi,D,E
0,9,9,4,5,9
1,4,7,2,5,8
2,6,6,3,6,2
3,9,8,5,8,8
4,6,8,5,5,8


In [448]:
# Rename several columns at once; labels missing from the mapping (C, E) stay unchanged
df.rename(columns={'A':'Nama', 'B':'Alamat', 'D':'Kota'})

Unnamed: 0,Nama,Alamat,C,Kota,E
0,9,9,4,5,9
1,4,7,2,5,8
2,6,6,3,6,2
3,9,8,5,8,8
4,6,8,5,5,8


## #07: Menghapus missing values pada data frame (NaN)

In [449]:
# Prepare a dummy data frame that contains missing values.
# pd.util.testing.makeMissingDataframe() is deprecated since pandas 1.0 and removed in
# 2.0, so an equivalent frame is built from public NumPy / pandas calls instead:
# 30 rows x 4 float columns (A-D), random 10-character row labels, 10% of cells NaN.
import string

dummy_values = np.random.randn(30, 4)

# Blank out exactly 10% of the cells, chosen at random without repetition.
n_missing = round(dummy_values.size * 0.1)
missing_cells = np.random.choice(dummy_values.size, size=n_missing, replace=False)
dummy_values.flat[missing_cells] = np.nan

# Random alphanumeric labels for the rows.
alphabet = list(string.ascii_letters + string.digits)
row_labels = [''.join(np.random.choice(alphabet, size=10)) for _ in range(len(dummy_values))]

# reset_index() turns the row labels into a regular column named 'index'.
df = pd.DataFrame(dummy_values, index=row_labels, columns=list('ABCD')).reset_index()
df.head()

Unnamed: 0,index,A,B,C,D
0,za3q8aKY75,0.073783,0.500928,-0.020028,0.180255
1,smh3FXCrba,1.032454,-1.328206,-0.000706,0.520992
2,55rsIJ5g0L,-1.158359,-0.781921,0.809636,0.145702
3,tSGtR03qU7,0.019133,1.334082,-0.481228,0.766964
4,CqT2m7XjDA,-0.641162,-0.740153,1.524584,-0.868476


In [450]:
# Rename the 'index' column to 'Z' and keep the result
df = df.rename(columns={'index' : 'Z'})
df.head()

Unnamed: 0,Z,A,B,C,D
0,za3q8aKY75,0.073783,0.500928,-0.020028,0.180255
1,smh3FXCrba,1.032454,-1.328206,-0.000706,0.520992
2,55rsIJ5g0L,-1.158359,-0.781921,0.809636,0.145702
3,tSGtR03qU7,0.019133,1.334082,-0.481228,0.766964
4,CqT2m7XjDA,-0.641162,-0.740153,1.524584,-0.868476


In [451]:
# Keep a deep copy so the following cells can restore the frame after each dropna()
df_backup = df.copy(deep=True)

In [452]:
# Drop every column that contains at least one missing value
df = df.dropna(axis='columns')
df.head()

Unnamed: 0,Z
0,za3q8aKY75
1,smh3FXCrba
2,55rsIJ5g0L
3,tSGtR03qU7
4,CqT2m7XjDA


In [453]:
# Drop every row that contains at least one missing value
df = df_backup.copy(deep=True)  # start again from the untouched backup
df = df.dropna(axis='rows')
df.head()

Unnamed: 0,Z,A,B,C,D
0,za3q8aKY75,0.073783,0.500928,-0.020028,0.180255
1,smh3FXCrba,1.032454,-1.328206,-0.000706,0.520992
2,55rsIJ5g0L,-1.158359,-0.781921,0.809636,0.145702
3,tSGtR03qU7,0.019133,1.334082,-0.481228,0.766964
4,CqT2m7XjDA,-0.641162,-0.740153,1.524584,-0.868476


In [454]:
# Share of missing values per column: isna() gives a boolean frame and the mean of
# booleans is the fraction of True values (a fraction between 0 and 1, not a percentage)
df = df_backup.copy(deep=True)  # start again from the untouched backup
df.isna().mean()

Z    0.000000
A    0.066667
B    0.100000
C    0.166667
D    0.066667
dtype: float64

In [455]:
# Drop columns based on a threshold of missing values.
# thresh is the minimum number of non-missing values a column must have to be kept;
# here that is 90% of the row count, so columns with more than 10% missing are dropped.
treshold = len(df)*0.9
df = df.dropna(thresh=treshold, axis='columns')
df.head()

Unnamed: 0,Z,A,B,D
0,za3q8aKY75,0.073783,0.500928,0.180255
1,smh3FXCrba,1.032454,-1.328206,0.520992
2,55rsIJ5g0L,-1.158359,-0.781921,0.145702
3,tSGtR03qU7,0.019133,1.334082,0.766964
4,CqT2m7XjDA,-0.641162,-0.740153,-0.868476


## #08: Memeriksa kesamaan antar kolom (series) pada data frame

In [456]:
# Prepare a data frame with two identical columns, each containing one NaN.
data = {label: [15, 15, 18, np.nan, 12] for label in ('A', 'B')}
df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B
0,15.0,15.0
1,15.0,15.0
2,18.0,18.0
3,,
4,12.0,12.0


In [457]:
# Introducing the pandas Series:
# a data frame is a collection of Series, and selecting one column returns a Series
df['A']

0    15.0
1    15.0
2    18.0
3     NaN
4    12.0
Name: A, dtype: float64

In [458]:
# A single column is a pandas Series
type(df['A'])

pandas.core.series.Series

In [459]:
# The whole table is a pandas DataFrame
type(df)

pandas.core.frame.DataFrame

In [460]:
# Check equality with the == operator (not recommended when the data contains NaN):
# the comparison is element-wise and NaN == NaN evaluates to False, so the row where
# both columns hold NaN is reported as different.
df['A'] == df['B']

0     True
1     True
2     True
3    False
4     True
dtype: bool

In [461]:
# Check equality with the equals() method (recommended): it returns a single boolean
# and treats NaN values in the same position as equal
df['A'].equals(df['B'])

True

In [462]:
# Check equality between two whole data frames: compare df with a deep copy of itself
df1 = df.copy(deep=True)
df.equals(df1)

True

In [463]:
# For contrast: == on two data frames compares cell by cell, and NaN cells come out False
df == df1

Unnamed: 0,A,B
0,True,True
1,True,True
2,True,True
3,False,False
4,True,True


## #09: Membagi data frame menjadi dua secara acak

In [464]:
# Prepare a 10x5 data frame of random integers in [1, 20) with columns A-E.
n_rows, n_cols = 10, 5
cols = ('A', 'B', 'C', 'D', 'E')

df = pd.DataFrame(
    data=np.random.randint(low=1, high=20, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,10,11,19,5,14
1,14,15,8,10,14
2,9,9,10,9,9
3,7,5,17,3,1
4,5,16,2,16,11
5,14,12,16,1,12
6,10,11,15,9,19
7,10,4,19,6,16
8,6,10,8,7,4
9,2,15,2,1,7


In [465]:
# Splitting a data frame into two random parts by a given proportion:
# first check its shape (rows, columns)
df.shape

(10, 5)

In [466]:
# Randomly sample a fraction (70%) of the rows into df_1; df_2 receives the rest,
# obtained by dropping the index labels that ended up in df_1.
proporsi = 0.7
df_1 = df.sample(frac=proporsi)
df_2 = df.drop(df_1.index)

# Report the resulting shapes of both parts
print(f'df_1 Shape: {df_1.shape}')
print(f'df_2 Shape: {df_2.shape}')

df_1 Shape: (7, 5)
df_2 Shape: (3, 5)


In [467]:
# The randomly sampled part; rows keep their original index labels
df_1

Unnamed: 0,A,B,C,D,E
9,2,15,2,1,7
8,6,10,8,7,4
5,14,12,16,1,12
1,14,15,8,10,14
2,9,9,10,9,9
3,7,5,17,3,1
4,5,16,2,16,11


In [468]:
# The remaining rows that were not sampled into df_1
df_2

Unnamed: 0,A,B,C,D,E
0,10,11,19,5,14
6,10,11,15,9,19
7,10,4,19,6,16


## #10: Mengganti nama (label) kolom pada data frame berdasarkan pola


In [469]:
# Prepare the data frame: load the Titanic CSV and overwrite its column labels.
# The new labels mix upper/lower case and inner spaces, and 'Sex ' / ' Age' carry a
# trailing / leading space, giving the clean-up cells below something to fix.
df = pd.read_csv('./Dataset/titanicfull.csv')
df.columns = ['Pclass','Survival status','full Name','Sex ',' Age','Sib SP', 'parch','Ticket','Fare','Cabin','Embarked']
df_backup = df.copy(deep=True)  # deep copy used later to restore these labels
df.head()

Unnamed: 0,Pclass,Survival status,full Name,Sex,Age,Sib SP,parch,Ticket,Fare,Cabin,Embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [470]:
# Lowercase the column names and replace spaces with _
# (leading/trailing spaces are replaced too, hence labels such as 'sex_' and '_age')
df.columns = df.columns.str.replace(' ','_').str.lower()
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex_,_age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [471]:
# Trim surplus whitespace from the column names
df = df_backup.copy(deep=True)  # restore the original labels first
df.columns = df.columns.str.lower().str.strip().str.replace(' ','_')
# Remarks:
# str.lower()   = convert to lowercase
# str.strip()   = remove surplus whitespace at the start or end of the label
# str.replace() = replace characters (here the remaining inner spaces with _)
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex,age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


## #11: Seleksi kolom dan baris pada data frame menggunakan loc

In [472]:
# Prepare a 10x5 data frame of random integers in [1, 20) with columns A-E.
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(
    columns=cols,
    data=np.random.randint(1, 20, size=(n_rows, n_cols)),
)
df

Unnamed: 0,A,B,C,D,E
0,9,2,15,9,2
1,9,7,11,19,14
2,8,8,11,19,9
3,17,11,9,6,9
4,2,4,12,15,12
5,12,5,11,2,12
6,19,4,13,13,13
7,5,10,15,6,6
8,13,17,8,12,17
9,4,9,11,18,18


In [473]:
# Select rows and columns with loc:
# the first argument lists the row labels, the second the column labels
df.loc[[0,3,4],['B','E']]


Unnamed: 0,B,E
0,2,2
3,11,9
4,4,12


In [474]:
# Select rows with a condition (B greater than 10) and show only columns B, D and E
df.loc[df['B']>10,['B','D','E']]

Unnamed: 0,B,D,E
3,11,6,9
8,17,12,17


In [475]:
# Slice a data frame with loc.
# With loc both the start and the end label are inclusive, so rows 0 through 4 and
# columns B through D are all returned; in list slicing only the start is inclusive.
df.loc[0:4, 'B':'D']

Unnamed: 0,B,C,D
0,2,15,9
1,7,11,19
2,8,11,19
3,11,9,6
4,4,12,15


## #12: Membentuk kolom bertipe datetime dari beberapa kolom lain pada pandas data frame

In [476]:
# Prepare a data frame from a dictionary holding the day, month and year parts.
data = {
    'day': [1, 2, 10, 25, 12],
    'month': [1, 2, 4, 5, 6],
    'year': [2000, 2001, 2010, 2015, 2020],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,day,month,year
0,1,1,2000
1,2,2,2001
2,10,4,2010
3,25,5,2015
4,12,6,2020


In [477]:
# Build a datetime column: to_datetime() assembles one date per row from the
# 'day', 'month' and 'year' columns
df['penanggalan'] = pd.to_datetime(df[['day','month','year']])
df

Unnamed: 0,day,month,year,penanggalan
0,1,1,2000,2000-01-01
1,2,2,2001,2001-02-02
2,10,4,2010,2010-04-10
3,25,5,2015,2015-05-25
4,12,6,2020,2020-06-12


In [478]:
# Confirm that the new column has a datetime dtype
df.dtypes

day                     int64
month                   int64
year                    int64
penanggalan    datetime64[ns]
dtype: object

## #13: Konversi nilai numerik ke dalam kategori pada data frame