# Belajar pandas

## #01: Menyertakan prefix dan suffix pada kolom data frame

In [74]:
import pandas as pd
import numpy as np

# Report the installed pandas and numpy versions, one per line
for library in (pd, np):
    print(library.__version__)

1.3.3
1.21.2


In [75]:
# Build the sample data frame: 5 rows x 5 columns (A-E) filled with random
# integers in the half-open range [1, 10)
n_rows, n_cols = 5, 5
cols = ('A', 'B', 'C', 'D', 'E')

random_values = np.random.randint(low=1, high=10, size=(n_rows, n_cols))
df = pd.DataFrame(data=random_values, columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,9,9,2,7,8
1,6,3,8,8,4
2,8,1,3,5,1
3,3,7,4,1,1
4,4,9,6,5,9


In [76]:
# tuple() over a string yields one element per character; these are the column labels
tuple('ABCDE')

('A', 'B', 'C', 'D', 'E')

In [77]:
# Add a prefix to every column label
df.add_prefix('kolom_')

Unnamed: 0,kolom_A,kolom_B,kolom_C,kolom_D,kolom_E
0,9,9,2,7,8
1,6,3,8,8,4
2,8,1,3,5,1
3,3,7,4,1,1
4,4,9,6,5,9


In [78]:
# Add a suffix to every column label
df.add_suffix('_field')

Unnamed: 0,A_field,B_field,C_field,D_field,E_field
0,9,9,2,7,8
1,6,3,8,8,4
2,8,1,3,5,1
3,3,7,4,1,1
4,4,9,6,5,9


## #02: Pemilihan baris (rows selection) pada data frame

In [79]:
# Sample data for the row-selection examples: 10 rows x 5 columns (A-E) of
# random integers drawn from 1..4, so values repeat often within a column
n_rows, n_cols = 10, 5
cols = tuple('ABCDE')

df = pd.DataFrame(
    np.random.randint(1, 5, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,4,2,4,4,4
1,1,3,4,3,1
2,4,4,1,1,4
3,1,1,1,4,2
4,4,4,2,2,2
5,4,2,3,2,2
6,3,1,1,2,2
7,1,1,4,1,3
8,2,1,3,2,1
9,1,2,1,3,3


In [80]:
# Row selection with the logical operator | (or): keep rows where A is 1 or 3.
# Each comparison is parenthesized because | binds more tightly than ==
df[(df['A'] == 1) | (df['A'] == 3)]

Unnamed: 0,A,B,C,D,E
1,1,3,4,3,1
3,1,1,1,4,2
6,3,1,1,2,2
7,1,1,4,1,3
9,1,2,1,3,3


In [81]:
# Row selection with isin(): keep rows whose A value appears in the given list
df[df['A'].isin([1,3])]

Unnamed: 0,A,B,C,D,E
1,1,3,4,3,1
3,1,1,1,4,2
6,3,1,1,2,2
7,1,1,4,1,3
9,1,2,1,3,3


In [82]:
# The negation operator ~ inverts the boolean mask: keep rows where A is neither 1 nor 3
df[~df['A'].isin([1,3])]

Unnamed: 0,A,B,C,D,E
0,4,2,4,4,4
2,4,4,1,1,4
4,4,4,2,2,2
5,4,2,3,2,2
8,2,1,3,2,1


## #03: Konversi tipe data string ke numerik pada kolom data frame

In [83]:
# Sample data for the conversion examples: every value is a string, and col1
# contains one entry ('teks') that is not a number
data = {
    'col1': ['1', '2', '3', 'teks'],
    'col2': ['1', '2', '3', '4'],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [84]:
# Inspect the dtype of each column before any conversion
df.dtypes

col1    object
col2    object
dtype: object

In [85]:
# Convert the data type with astype(): only col2 is cast to int64; col1
# (which contains the non-numeric value 'teks') is left untouched
df_x = df.astype({'col2':'int64'})
df_x

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [86]:
# Inspect the dtypes of the converted copy
df_x.dtypes

col1    object
col2     int64
dtype: object

In [87]:
# Convert every column to numeric with to_numeric(); errors='coerce' turns
# values that cannot be parsed (such as 'teks') into NaN instead of raising
df.apply(pd.to_numeric, errors ='coerce')

Unnamed: 0,col1,col2
0,1.0,1
1,2.0,2
2,3.0,3
3,,4


## #04: Pemilihan kolom (columns selection) pada pandas data frame berdasarkan tipe data

In [88]:
# Prepare the data frame: one float column, one integer column, an hourly
# datetime column and a text column, so each dtype selector has a match
n_rows = 5
n_cols = 2
cols = ['bil_pecahan', 'bil_bulat']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), columns=cols)
df['bil_pecahan'] = df['bil_pecahan'].astype('float')

# Hourly timestamps starting at 2000-01-01. pd.date_range is the public API
# replacement for pd.util.testing.makeDateIndex, a deprecated testing helper
# that is not available in pandas 2.x. An Hour offset object is used instead
# of a string alias because the alias spelling differs between pandas versions.
df.index = pd.date_range(start='2000-01-01', periods=n_rows, freq=pd.offsets.Hour())
# Move the datetime index into a regular column (named 'index' by default)
df = df.reset_index()

df['teks'] = list('ABCDE')

df

Unnamed: 0,index,bil_pecahan,bil_bulat,teks
0,2000-01-01 00:00:00,7.0,9,A
1,2000-01-01 01:00:00,12.0,3,B
2,2000-01-01 02:00:00,13.0,16,C
3,2000-01-01 03:00:00,3.0,9,D
4,2000-01-01 04:00:00,19.0,7,E


In [89]:
# Inspect the dtype of each column
df.dtypes

index          datetime64[ns]
bil_pecahan           float64
bil_bulat               int32
teks                   object
dtype: object

In [90]:
# Select the columns with a numeric dtype (both integer and float)
df.select_dtypes(include='number')

Unnamed: 0,bil_pecahan,bil_bulat
0,7.0,9
1,12.0,3
2,13.0,16
3,3.0,9
4,19.0,7


In [91]:
# Select only the floating-point columns
df.select_dtypes(include='float')

Unnamed: 0,bil_pecahan
0,7.0
1,12.0
2,13.0
3,3.0
4,19.0


In [92]:
# Select only the integer columns
df.select_dtypes(include='int')

Unnamed: 0,bil_bulat
0,9
1,3
2,16
3,9
4,7


In [93]:
# Select the columns with a string / object dtype
df.select_dtypes(include='object')

Unnamed: 0,teks
0,A
1,B
2,C
3,D
4,E


In [94]:
# Select the columns with a datetime dtype
df.select_dtypes(include='datetime')

Unnamed: 0,index
0,2000-01-01 00:00:00
1,2000-01-01 01:00:00
2,2000-01-01 02:00:00
3,2000-01-01 03:00:00
4,2000-01-01 04:00:00


In [95]:
# Select columns by a combination of dtypes (here: numeric and object)
df.select_dtypes(include=['number','object'])

Unnamed: 0,bil_pecahan,bil_bulat,teks
0,7.0,9,A
1,12.0,3,B
2,13.0,16,C
3,3.0,9,D
4,19.0,7,E


## #05: Membalik urutan baris dan kolom pada data frame

In [96]:
# Sample data for the reversal examples: a 5x5 grid of random integers
# between 1 and 9, with columns labelled A through E
n_rows = n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,9,5,4,5,1
1,7,8,8,9,8
2,6,5,9,2,9
3,1,6,8,5,8
4,9,4,9,9,7


In [97]:
# Reverse the column order: keep all rows (:) and step through the columns backwards (::-1)
df.loc[:, ::-1]

Unnamed: 0,E,D,C,B,A
0,1,5,4,5,9
1,8,9,8,8,7
2,9,2,9,5,6
3,8,5,8,6,1
4,7,9,9,4,9


In [98]:
# Reverse the row order; each row keeps its original index label
df.loc[::-1]

Unnamed: 0,A,B,C,D,E
4,9,4,9,9,7
3,1,6,8,5,8
2,6,5,9,2,9
1,7,8,8,9,8
0,9,5,4,5,1


In [99]:
# Reverse the row order and renumber the index from 0;
# drop=True discards the old index instead of keeping it as a column
df.loc[::-1].reset_index(drop=True)

Unnamed: 0,A,B,C,D,E
0,9,4,9,9,7
1,1,6,8,5,8
2,6,5,9,2,9
3,7,8,8,9,8
4,9,5,4,5,1


## #06: Mengganti nama (label) kolom pada data frame

In [100]:
# Sample data for the renaming examples: 5 rows x 5 columns (A-E) of random
# integers in the half-open range [1, 10)
n_rows, n_cols = 5, 5
cols = tuple('ABCDE')

grid_shape = (n_rows, n_cols)
df = pd.DataFrame(np.random.randint(1, 10, grid_shape), columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,6,4,3,3,5
1,7,4,5,1,6
2,1,5,2,7,8
3,3,7,2,5,7
4,2,8,1,7,9


In [101]:
# Rename (relabel) a single column: C becomes Hobi
df.rename(columns={'C' : 'Hobi'})

Unnamed: 0,A,B,Hobi,D,E
0,6,4,3,3,5
1,7,4,5,1,6
2,1,5,2,7,8
3,3,7,2,5,7
4,2,8,1,7,9


In [102]:
# Rename (relabel) several columns at once via a mapping of old label -> new label
df.rename(columns={'A':'Nama', 'B':'Alamat', 'D':'Kota'})

Unnamed: 0,Nama,Alamat,C,Kota,E
0,6,4,3,3,5
1,7,4,5,1,6
2,1,5,2,7,8
3,3,7,2,5,7
4,2,8,1,7,9


## #07: Menghapus missing values pada data frame (NaN)

In [103]:
# Prepare the data frame: 30 rows x 4 float columns (A-D) with roughly 10% of
# the cells set to NaN, labelled by random 10-character strings.
# The frame is built explicitly because pd.util.testing.makeMissingDataframe()
# is a deprecated testing helper that is not available in pandas 2.x.
sample_shape = (30, 4)
label_chars = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')

sample_values = np.random.randn(*sample_shape)
# Each cell independently becomes missing with probability ~0.1
sample_values[np.random.rand(*sample_shape) > 0.9] = np.nan
sample_index = [
    ''.join(np.random.choice(label_chars, size=10))
    for _ in range(sample_shape[0])
]

# reset_index() moves the string labels into a regular column named 'index'
df = pd.DataFrame(sample_values, index=sample_index, columns=list('ABCD')).reset_index()
df.head()

Unnamed: 0,index,A,B,C,D
0,ws5gpXmBlY,-0.266944,0.789077,-1.197561,-1.165939
1,pAJXjOHaSH,,,-0.358669,0.253085
2,ZFt9VJLTEj,1.180698,0.736708,,-0.233259
3,4NuDR6LFmf,0.212717,0.533703,0.095844,0.70942
4,knnXqjNwCi,0.609275,-1.704314,1.064728,-1.565974


In [104]:
# Rename the 'index' column (created by reset_index()) to 'Z'
df = df.rename(columns={'index' : 'Z'})
df.head()

Unnamed: 0,Z,A,B,C,D
0,ws5gpXmBlY,-0.266944,0.789077,-1.197561,-1.165939
1,pAJXjOHaSH,,,-0.358669,0.253085
2,ZFt9VJLTEj,1.180698,0.736708,,-0.233259
3,4NuDR6LFmf,0.212717,0.533703,0.095844,0.70942
4,knnXqjNwCi,0.609275,-1.704314,1.064728,-1.565974


In [105]:
# Keep an independent deep copy so the full data can be restored after df is overwritten
df_backup = df.copy(deep=True)

In [106]:
# Drop every column that contains at least one missing value
df = df.dropna(axis='columns')
df.head()

Unnamed: 0,Z
0,ws5gpXmBlY
1,pAJXjOHaSH
2,ZFt9VJLTEj
3,4NuDR6LFmf
4,knnXqjNwCi


In [107]:
# Drop every row that contains at least one missing value
df = df_backup.copy(deep=True)  # start again from the full backup
df = df.dropna(axis='rows')
df.head()

Unnamed: 0,Z,A,B,C,D
0,ws5gpXmBlY,-0.266944,0.789077,-1.197561,-1.165939
3,4NuDR6LFmf,0.212717,0.533703,0.095844,0.70942
4,knnXqjNwCi,0.609275,-1.704314,1.064728,-1.565974
5,2IJWtJBO8W,-0.983316,0.18547,-2.144101,0.498683
6,UO9OApjYd7,-0.975821,-0.030661,0.174548,0.439084


In [108]:
# Fraction of missing values per column: the mean of the boolean isna() mask
# (a value between 0 and 1; multiply by 100 for a percentage)
df = df_backup.copy(deep=True)
df.isna().mean()

Z    0.000000
A    0.066667
B    0.166667
C    0.100000
D    0.066667
dtype: float64

In [109]:
# Drop columns based on a threshold: dropna(thresh=k) keeps a column only if
# it holds at least k non-missing values. Requiring 90% of the rows to be
# present therefore drops every column with more than 10% missing values.
# thresh is documented as an int, so the 90% row count is rounded up to a
# whole number; for integer counts this selects exactly the same columns as
# comparing against the unrounded float.
treshold = int(np.ceil(len(df)*0.9))
df = df.dropna(thresh=treshold, axis='columns')
df.head()

Unnamed: 0,Z,A,C,D
0,ws5gpXmBlY,-0.266944,-1.197561,-1.165939
1,pAJXjOHaSH,,-0.358669,0.253085
2,ZFt9VJLTEj,1.180698,,-0.233259
3,4NuDR6LFmf,0.212717,0.095844,0.70942
4,knnXqjNwCi,0.609275,1.064728,-1.565974


## #08: Memeriksa kesamaan antar kolom (series) pada data frame