In [1]:
import pandas as pd
import numpy as np


In [3]:
# Contoh data
data = {
    'area': [100, 200, np.nan, 150, 300, np.nan],
    'floor': [1, 2, 3, np.nan, 5, 6],
    'price': [50000, 100000, 75000, 200000, 150000, 300000],
    'address': ['Address 1', 'Address 2', 'Address 3', 'Address 4', 'Address 5', 'Address 6'],
    'url': ['url1', 'url2', 'url3', 'url4', 'url5', 'url6']
}

df = pd.DataFrame(data)
df

Unnamed: 0,area,floor,price,address,url
0,100.0,1.0,50000,Address 1,url1
1,200.0,2.0,100000,Address 2,url2
2,,3.0,75000,Address 3,url3
3,150.0,,200000,Address 4,url4
4,300.0,5.0,150000,Address 5,url5
5,,6.0,300000,Address 6,url6


In [4]:
# Mengidentifikasi nilai yang hilang
print("Nilai yang hilang sebelum imputasi:")
df.isnull().sum()

Nilai yang hilang sebelum imputasi:


area       2
floor      1
price      0
address    0
url        0
dtype: int64

In [5]:
# Mengimputasi nilai yang hilang dengan median atau mean
df['area'].fillna(df['area'].median(), inplace=True)
df['floor'].fillna(df['floor'].median(), inplace=True)

print("\nNilai yang hilang setelah imputasi:")
df.isnull().sum()


Nilai yang hilang setelah imputasi:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['area'].fillna(df['area'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['floor'].fillna(df['floor'].median(), inplace=True)


area       0
floor      0
price      0
address    0
url        0
dtype: int64

In [6]:
# Menghitung IQR
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# Menentukan batas bawah dan atas untuk outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Mengidentifikasi outlier
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]
print("\nOutlier:")
print(outliers)

# Mengatasi outlier (misalnya, dengan membuang atau mengimputasi)
df = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]
print("\nData setelah menghapus outlier:")
print(df)



Outlier:
Empty DataFrame
Columns: [area, floor, price, address, url]
Index: []

Data setelah menghapus outlier:
    area  floor   price    address   url
0  100.0    1.0   50000  Address 1  url1
1  200.0    2.0  100000  Address 2  url2
2  175.0    3.0   75000  Address 3  url3
3  150.0    3.0  200000  Address 4  url4
4  300.0    5.0  150000  Address 5  url5
5  175.0    6.0  300000  Address 6  url6


In [7]:
import re

# Contoh validasi sederhana untuk kolom 'address' dan 'url'
def is_valid_address(address):
    return isinstance(address, str) and len(address.strip()) > 0

def is_valid_url(url):
    regex = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
        r'localhost|' # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

# Verifikasi keakuratan data
df['valid_address'] = df['address'].apply(is_valid_address)
df['valid_url'] = df['url'].apply(is_valid_url)

print("\nVerifikasi keakuratan data:")
print(df[['address', 'valid_address', 'url', 'valid_url']])



Verifikasi keakuratan data:
     address  valid_address   url  valid_url
0  Address 1           True  url1      False
1  Address 2           True  url2      False
2  Address 3           True  url3      False
3  Address 4           True  url4      False
4  Address 5           True  url5      False
5  Address 6           True  url6      False
