# Import Libraries

In [20]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

In [22]:
# Location of the dataset on the mounted Google Drive.
# Adjust this path to match where the file lives in your own Drive.
DATA_PATH = '/content/drive/MyDrive/Rakamin/data/botak.csv'
df = pd.read_csv(DATA_PATH)

In [23]:
# General information about the dataframe: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7991 entries, 0 to 7990
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   umur           7991 non-null   float64
 1   jenis_kelamin  7982 non-null   object 
 2   pekerjaan      7924 non-null   object 
 3   provinsi       7991 non-null   object 
 4   gaji           7968 non-null   float64
 5   is_menikah     7991 non-null   int64  
 6   is_keturunan   7976 non-null   float64
 7   berat          7952 non-null   float64
 8   tinggi         7991 non-null   float64
 9   sampo          7934 non-null   object 
 10  is_merokok     7991 non-null   int64  
 11  pendidikan     7991 non-null   object 
 12  botak_prob     7991 non-null   float64
dtypes: float64(6), int64(2), object(5)
memory usage: 811.7+ KB


Pengamatan\:
1. Data terdiri dari 7991 baris
2. Tampak beberapa kolom masih memiliki null/missing values (Non-Null Count < jumlah baris)
3. Sepertinya tidak ada issue yang mencolok pada tipe data untuk setiap kolom (sudah sesuai)

## Cuplikan Data

In [24]:
# First 5 rows of the data
df.head()

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,botak_prob
0,42.0,Laki-laki,PNS,Padang,7864005.0,0,0.0,61.928685,152.174164,Deadbuoy,1,S1,0.487655
1,33.0,Laki-laki,Pegawai swasta,Palangkaraya,6492662.0,0,0.0,49.374507,152.822969,Moonsilk,1,SMA,0.29727
2,59.0,Laki-laki,Freelance,Serang,8086303.0,1,0.0,59.072807,159.911047,Merpati,1,S1,0.467848
3,38.0,Laki-laki,Pegawai swasta,Gorontalo,11473210.0,0,1.0,68.338014,162.558997,Moonsilk,0,S1,0.616418
4,33.0,Perempuan,Freelance,Tanjungselor,14759420.0,0,0.0,53.769996,154.57951,Pantone,1,S2,0.115439


In [25]:
# Column names grouped by kind.
# Numeric columns (the binary is_* flags are stored as numbers, so they are listed here too).
nums = [
    'umur',
    'gaji',
    'is_menikah',
    'is_keturunan',
    'berat',
    'tinggi',
    'is_merokok',
    'botak_prob',
]
# Categorical (text) columns.
cats = [
    'jenis_kelamin',
    'pekerjaan',
    'provinsi',
    'sampo',
    'pendidikan',
]

# Missing Data

In [26]:
# Number of NULL (missing) entries in each column
df.isna().sum()

umur              0
jenis_kelamin     9
pekerjaan        67
provinsi          0
gaji             23
is_menikah        0
is_keturunan     15
berat            39
tinggi            0
sampo            57
is_merokok        0
pendidikan        0
botak_prob        0
dtype: int64

In [27]:
# Drop every row that has at least one missing value
df = df.dropna()


In [28]:
# Verify the drop worked: every column should now report 0 missing values
df.isna().sum()

umur             0
jenis_kelamin    0
pekerjaan        0
provinsi         0
gaji             0
is_menikah       0
is_keturunan     0
berat            0
tinggi           0
sampo            0
is_merokok       0
pendidikan       0
botak_prob       0
dtype: int64

# Duplicated Data

In [29]:
# Count duplicated rows
# (a row is a duplicate only when it matches another row on all columns)
df.duplicated().sum()

99

In [30]:
# Drop fully duplicated rows, printing the duplicate count before and after
# so the effect of the step is visible in the cell output.
# `df` is rebound to the de-duplicated frame instead of using inplace=True,
# matching the other cleaning cells, which all reassign `df`.
print(df.duplicated().sum())
df = df.drop_duplicates()
print(df.duplicated().sum())

99
0


# Outliers Removal
Menggunakan konsep IQR

In [31]:
print(f'Jumlah baris sebelum memfilter outlier: {len(df)}')

# Keep only the rows that lie inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each
# listed column. The filter is applied one column at a time, so the quartiles
# of a later column are computed on the rows that survived the earlier columns.
for col in ['umur', 'gaji', 'berat', 'tinggi']:
  Q1, Q3 = df[col].quantile([0.25, 0.75])
  IQR = Q3 - Q1
  low_limit, high_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR
  # between() includes both end points, i.e. low_limit <= value <= high_limit
  within_limits = df[col].between(low_limit, high_limit)
  df = df.loc[within_limits]

print(f'Jumlah baris setelah memfilter outlier: {len(df)}')

Jumlah baris sebelum memfilter outlier: 7682
Jumlah baris setelah memfilter outlier: 6989


# Feature Encoding

In [32]:
# Inspect the values (and their frequencies) of the categorical columns.
# `provinsi` is not part of this list.
cats_updated = ['jenis_kelamin', 'pekerjaan', 'sampo', 'pendidikan']

separator = '---'*10
for col in cats_updated:
  counts = df[col].value_counts()
  # header line, then the counts on the following lines
  print(f'value counts of column {col}', counts, sep='\n')
  print(separator, '\n')

value counts of column jenis_kelamin
Laki-laki    4651
Perempuan    2338
Name: jenis_kelamin, dtype: int64
------------------------------ 

value counts of column pekerjaan
PNS               2818
Pegawai swasta    2797
Freelance         1096
Pengangguran       278
Name: pekerjaan, dtype: int64
------------------------------ 

value counts of column sampo
Merpati            1448
Pantone            1409
Moonsilk           1384
Deadbuoy           1381
Shoulder & Head    1367
Name: sampo, dtype: int64
------------------------------ 

value counts of column pendidikan
S1     3845
SMA    1773
S2      576
SMP     475
SD      191
S3      129
Name: pendidikan, dtype: int64
------------------------------ 



Strategi encoding
* `jenis_kelamin` & `pendidikan` \: label encoding
* `pekerjaan` & `sampo` \: One Hot Encoding

In [33]:
# Label-encoding tables for the jenis_kelamin and pendidikan columns.
mapping_jenis_kelamin = dict(zip(['Perempuan', 'Laki-laki'], range(2)))

# Education levels listed in the order of their codes (0 for SD up to 5 for S3);
# each level's code is its position in the list.
mapping_pendidikan = {
    level: code
    for code, level in enumerate(['SD', 'SMP', 'SMA', 'S1', 'S2', 'S3'])
}

# Replace each category label with its integer code from the mapping tables.
# Run once only: mapping again would turn every value into NaN, because the
# integer codes are not keys of the mappings.
for column, mapping in [
    ('jenis_kelamin', mapping_jenis_kelamin),
    ('pendidikan', mapping_pendidikan),
]:
    df[column] = df[column].map(mapping)

In [34]:
# One-hot encode the pekerjaan and sampo columns.
# This cell is safe to re-run: dummy columns that already exist in `df` are
# not joined a second time (a plain join would raise on overlapping column
# names), and a column that is no longer present in `df` is skipped.
for cat in ['pekerjaan', 'sampo']:
    if cat not in df.columns:
        # the source column is gone, so there is nothing left to encode
        continue
    # dtype is pinned to uint8 so the result does not depend on the pandas
    # version (pandas >= 2.0 would otherwise produce bool columns)
    onehots = pd.get_dummies(df[cat], prefix=cat, dtype='uint8')
    new_cols = [c for c in onehots.columns if c not in df.columns]
    if new_cols:
        df = df.join(onehots[new_cols])

# Drop kolom-kolom yang outdated

In [35]:
# Drop the original pekerjaan and sampo columns (they have already been one-hot encoded)
df = df.drop(columns=['pekerjaan', 'sampo'])

In [36]:
# Double-check what the data looks like after preprocessing
df.head()

Unnamed: 0,umur,jenis_kelamin,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,is_merokok,pendidikan,botak_prob,pekerjaan_Freelance,pekerjaan_PNS,pekerjaan_Pegawai swasta,pekerjaan_Pengangguran,sampo_Deadbuoy,sampo_Merpati,sampo_Moonsilk,sampo_Pantone,sampo_Shoulder & Head
0,42.0,1,Padang,7864005.0,0,0.0,61.928685,152.174164,1,3,0.487655,0,1,0,0,1,0,0,0,0
1,33.0,1,Palangkaraya,6492662.0,0,0.0,49.374507,152.822969,1,2,0.29727,0,0,1,0,0,0,1,0,0
2,59.0,1,Serang,8086303.0,1,0.0,59.072807,159.911047,1,3,0.467848,1,0,0,0,0,1,0,0,0
3,38.0,1,Gorontalo,11473210.0,0,1.0,68.338014,162.558997,0,3,0.616418,0,0,1,0,0,0,1,0,0
4,33.0,0,Tanjungselor,14759420.0,0,0.0,53.769996,154.57951,1,4,0.115439,1,0,0,0,0,0,0,1,0


In [37]:
# Information about the preprocessed dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6989 entries, 0 to 7911
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   umur                      6989 non-null   float64
 1   jenis_kelamin             6989 non-null   int64  
 2   provinsi                  6989 non-null   object 
 3   gaji                      6989 non-null   float64
 4   is_menikah                6989 non-null   int64  
 5   is_keturunan              6989 non-null   float64
 6   berat                     6989 non-null   float64
 7   tinggi                    6989 non-null   float64
 8   is_merokok                6989 non-null   int64  
 9   pendidikan                6989 non-null   int64  
 10  botak_prob                6989 non-null   float64
 11  pekerjaan_Freelance       6989 non-null   uint8  
 12  pekerjaan_PNS             6989 non-null   uint8  
 13  pekerjaan_Pegawai swasta  6989 non-null   uint8  
 14  pekerjaa