In [6]:
import pandas as pd
import numpy as np

In [35]:
# Sample customer purchases used for the missing-value walkthrough:
# item_bought has two missing entries and city has one.
df = pd.DataFrame(
    dict(
        cust_id=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11'],
        item_bought=[10, 7, 5, 4, 10000, 10, 5, 5, 7, np.nan, np.nan],
        city=[
            "bandung", 'jakarta', 'bogor', 'tangerang', 'bandung', 'bogor',
            'bogor', 'bogor', 'bandung', 'jakarta', np.nan,
        ],
    )
)
df

Unnamed: 0,cust_id,item_bought,city
0,c1,10.0,bandung
1,c2,7.0,jakarta
2,c3,5.0,bogor
3,c4,4.0,tangerang
4,c5,10000.0,bandung
5,c6,10.0,bogor
6,c7,5.0,bogor
7,c8,5.0,bogor
8,c9,7.0,bandung
9,c10,,jakarta


DATA CLEANSING

In [36]:
## Check missing values: count the NaN entries in each column

df.isna().sum()

cust_id        0
item_bought    2
city           1
dtype: int64

In [37]:
##Handle Missing Values

In [38]:
## Drop: remove every row that contains at least one missing value.
## dropna() returns a new DataFrame; the result is not assigned here, so df is unchanged.

df.dropna()

Unnamed: 0,cust_id,item_bought,city
0,c1,10.0,bandung
1,c2,7.0,jakarta
2,c3,5.0,bogor
3,c4,4.0,tangerang
4,c5,10000.0,bandung
5,c6,10.0,bogor
6,c7,5.0,bogor
7,c8,5.0,bogor
8,c9,7.0,bandung


In [39]:
# Re-display df: the rows with missing values are still present in the original frame
df

Unnamed: 0,cust_id,item_bought,city
0,c1,10.0,bandung
1,c2,7.0,jakarta
2,c3,5.0,bogor
3,c4,4.0,tangerang
4,c5,10000.0,bandung
5,c6,10.0,bogor
6,c7,5.0,bogor
7,c8,5.0,bogor
8,c9,7.0,bandung
9,c10,,jakarta


In [40]:
## Fill

In [41]:
## Numeric column: replace missing item counts with a constant (0).
## assign() works on a copy, so the original df keeps its NaN values.
df_clean = df.assign(item_bought=df['item_bought'].fillna(0))
df_clean

Unnamed: 0,cust_id,item_bought,city
0,c1,10.0,bandung
1,c2,7.0,jakarta
2,c3,5.0,bogor
3,c4,4.0,tangerang
4,c5,10000.0,bandung
5,c6,10.0,bogor
6,c7,5.0,bogor
7,c8,5.0,bogor
8,c9,7.0,bandung
9,c10,0.0,jakarta


In [42]:
# Numeric column: replace missing item counts with the column mean
# (computed over the non-missing values of the original df).
df_clean = df.assign(
    item_bought=df['item_bought'].fillna(df['item_bought'].mean())
)
df_clean

Unnamed: 0,cust_id,item_bought,city
0,c1,10.0,bandung
1,c2,7.0,jakarta
2,c3,5.0,bogor
3,c4,4.0,tangerang
4,c5,10000.0,bandung
5,c6,10.0,bogor
6,c7,5.0,bogor
7,c8,5.0,bogor
8,c9,7.0,bandung
9,c10,1117.0,jakarta


In [43]:
# Numeric column: replace missing item counts with the column median
# (computed over the non-missing values of the original df).
df_clean = df.assign(
    item_bought=df['item_bought'].fillna(df['item_bought'].median())
)
df_clean

Unnamed: 0,cust_id,item_bought,city
0,c1,10.0,bandung
1,c2,7.0,jakarta
2,c3,5.0,bogor
3,c4,4.0,tangerang
4,c5,10000.0,bandung
5,c6,10.0,bogor
6,c7,5.0,bogor
7,c8,5.0,bogor
8,c9,7.0,bandung
9,c10,7.0,jakarta


In [44]:
## Categorical column (only the mode can be used): fill missing cities
## with the most frequent city.

# mode() returns a Series of the most frequent value(s); take its first entry.
df_clean = df.assign(city=df['city'].fillna(df['city'].mode().iloc[0]))
df_clean

Unnamed: 0,cust_id,item_bought,city
0,c1,10.0,bandung
1,c2,7.0,jakarta
2,c3,5.0,bogor
3,c4,4.0,tangerang
4,c5,10000.0,bandung
5,c6,10.0,bogor
6,c7,5.0,bogor
7,c8,5.0,bogor
8,c9,7.0,bandung
9,c10,,jakarta


Outlier

In [45]:
# Sample customer purchases for the outlier walkthrough: no missing values,
# with one extreme item_bought value (10000).
df = pd.DataFrame(
    dict(
        cust_id=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11'],
        item_bought=[10, 7, 5, 4, 10000, 10, 5, 5, 7, 4, 5],
        city=[
            "bandung", 'jakarta', 'bogor', 'tangerang', 'bandung', 'bogor',
            'bogor', 'bogor', 'bandung', 'jakarta', 'bogor',
        ],
    )
)
df

Unnamed: 0,cust_id,item_bought,city
0,c1,10,bandung
1,c2,7,jakarta
2,c3,5,bogor
3,c4,4,tangerang
4,c5,10000,bandung
5,c6,10,bogor
6,c7,5,bogor
7,c8,5,bogor
8,c9,7,bandung
9,c10,4,jakarta


In [46]:
## Only numeric columns can have outliers

In [47]:
# Print the column dtypes and non-null counts of df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cust_id      11 non-null     object
 1   item_bought  11 non-null     int64 
 2   city         11 non-null     object
dtypes: int64(1), object(2)
memory usage: 396.0+ bytes


In [48]:
## Check outliers
## Use the IQR (interquartile range) method: start with the first and third quartiles

q1 = df['item_bought'].quantile(q=0.25)
q3 = df['item_bought'].quantile(q=0.75)

# One value per line: q1 first, then q3
print(q1, q3, sep="\n")

5.0
8.5


In [49]:
# Interquartile range: distance between the third and first quartiles
iqr = q3 - q1
print(iqr)

3.5


In [50]:
## Lower and upper bounds (lower and upper fences)
## Each fence sits 1.5 * IQR beyond its quartile.
fence_low, fence_high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# One value per line: lower fence first, then upper fence
print(fence_low, fence_high, sep="\n")

-0.25
13.75


In [51]:
# Outliers: rows whose item_bought lies strictly below the lower fence or strictly above the upper fence
df.loc[df['item_bought'].lt(fence_low) | df['item_bought'].gt(fence_high)]

Unnamed: 0,cust_id,item_bought,city
4,c5,10000,bandung


In [55]:
## Handling (option 1: keep the outliers; option 2: remove them)
# Option 2 is applied here: keep only rows inside the fences (both bounds inclusive).
df_clean = df.loc[df['item_bought'].between(fence_low, fence_high)]
df_clean

Unnamed: 0,cust_id,item_bought,city
0,c1,10,bandung
1,c2,7,jakarta
2,c3,5,bogor
3,c4,4,tangerang
5,c6,10,bogor
6,c7,5,bogor
7,c8,5,bogor
8,c9,7,bandung
9,c10,4,jakarta
10,c11,5,bogor


Duplicated Data

In [60]:
# Sample customer purchases for the duplicate walkthrough: the last row
# repeats the earlier 'c7' row exactly.
df = pd.DataFrame(
    dict(
        cust_id=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c7'],
        item_bought=[10, 7, 5, 4, 10000, 10, 5, 5, 7, 4, 5],
        city=[
            "bandung", 'jakarta', 'bogor', 'tangerang', 'bandung', 'bogor',
            'bogor', 'bogor', 'bandung', 'jakarta', 'bogor',
        ],
    )
)
df

Unnamed: 0,cust_id,item_bought,city
0,c1,10,bandung
1,c2,7,jakarta
2,c3,5,bogor
3,c4,4,tangerang
4,c5,10000,bandung
5,c6,10,bogor
6,c7,5,bogor
7,c8,5,bogor
8,c9,7,bandung
9,c10,4,jakarta


In [61]:
# Print the column dtypes and non-null counts of df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cust_id      11 non-null     object
 1   item_bought  11 non-null     int64 
 2   city         11 non-null     object
dtypes: int64(1), object(2)
memory usage: 396.0+ bytes


In [62]:
## Check duplicates: count the rows that exactly repeat an earlier row
df.duplicated().sum()

np.int64(1)

In [65]:
## Handle duplicates (similar to SQL DISTINCT): drop rows that repeat an earlier row.
## The result is only displayed, not assigned, so df itself is unchanged.

df.drop_duplicates()

Unnamed: 0,cust_id,item_bought,city
0,c1,10,bandung
1,c2,7,jakarta
2,c3,5,bogor
3,c4,4,tangerang
4,c5,10000,bandung
5,c6,10,bogor
6,c7,5,bogor
7,c8,5,bogor
8,c9,7,bandung
9,c10,4,jakarta
