<div align="center">
  <h1>Preprocessing</h1>
  <h2>Data Hunian Rumah dan Apartemen di Jakarta Pusat</h2>
</div>

- Install dan import library

In [1]:
## jika packages berikut belum terinstall, un-comment dan jalankan cell ini untuk menginstallnya
# %pip install --upgrade pip --quiet
# %pip install pandas --quiet
# (datetime, locale, re, dan os adalah modul bawaan Python sehingga tidak perlu diinstall)

# note: jika ada yang belum terinstall lakukan command di cell baru:
# %pip install <nama package>

In [2]:
import pandas as pd
from datetime import datetime
import locale, re, os

- Definisi global dan isi atribut tiap tabel

In [3]:
# Base directory: taken from the CURRENT_DIR environment variable when set,
# otherwise the process working directory.
__CURRENT_DIRECTORY__   = os.getenv('CURRENT_DIR', os.getcwd())
# Absolute path of the sibling "data" folder (<base>/../data); data_raw.json is
# read from here and every exported JSON file is written here.
__PATH_FOLDER__         = os.path.abspath(os.path.join(__CURRENT_DIRECTORY__, '..', 'data'))

# Columns exported to hunian.json (passed to store() as hunian_cols).
__HUNIAN__ = ['id_iklan',
               'tipe_properti',
               'luas_bangunan',
               'kamar_tidur',
               'kamar_mandi',
               'lokasi',
               'sertifikat',
               'tipe_iklan',
               'periode_kepemilikan',
               'harga',
               'diperbarui',
               'id_agen'
               ]

# Columns exported to rumah.json; store() keeps only rows with tipe_properti == 'Rumah'.
__RUMAH__ = ['id_iklan',
              'luas_tanah',
              'carport',
              'taman'
              ]

# Columns exported to apartemen.json; store() keeps only rows with tipe_properti == 'Apartemen'.
__APARTEMEN__ = ['id_iklan',
                  'kondisi_properti',
                  'kondisi_perabotan'
                  ]

# Columns exported to agen.json; store() de-duplicates the rows on id_agen.
__AGEN__ = ['id_agen',
             'nama_agen',
             'nomor_telepon',
             'terjual',
             'tersewa',
             'nama_perusahaan'
             ]

# Columns exported to perusahaan.json; store() de-duplicates on nama_perusahaan
# and drops rows containing nulls.
__PERUSAHAAN__ = ['nama_perusahaan',
                   'alamat'
                   ]

- Fungsi untuk membantu preprocessing

In [4]:
def id_iklan(value):
    """Return True when ``value`` looks like a well-formed listing ID.

    A well-formed ID is three ASCII letters followed by one or more digits,
    e.g. ``'aps3187118'``.

    Args:
        value: Candidate ID. Any object is accepted.

    Returns:
        bool: True for a matching string; False for a non-matching string and
        for any non-string input (None, NaN, numbers, ...), so the function can
        be used as a row filter on a column that contains missing values.
    """
    # re.match raises TypeError on non-strings; treat those as invalid IDs instead.
    if not isinstance(value, str):
        return False
    pattern = r'^[a-zA-Z]{3}\d+$'
    return bool(re.match(pattern, value))

def extract_number(value):
    """Extract an integer from a scraped field.

    Args:
        value: Usually a string such as ``'112 m²'``. None, the sentinel ``-1``,
            NaN and already-numeric values are also accepted.

    Returns:
        int | None: For a string, the first run of digits converted to int, or
        None when the string contains no digit. For a non-string, the value
        converted with ``int()``; None when it is None, equals ``-1``, or cannot
        be converted (NaN, infinity, pandas NA, arbitrary objects).
    """
    if isinstance(value, str):
        match = re.search(r'\d+', value)
        return int(match.group()) if match else None

    # Non-string input: the original code crashed inside re.search for anything
    # other than None / -1. Convert real numbers and map the rest to None.
    try:
        if value == -1:  # -1 is treated as "no value"
            return None
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # int(None) / pandas NA -> TypeError, int(nan) -> ValueError, int(inf) -> OverflowError
        return None

def convert_harga(harga):
    """Convert a price label such as ``'Rp 5,5 Miliar'`` to an integer amount.

    The label must contain a number (comma as decimal separator) followed by
    one of the units ``Miliar`` (1e9), ``Juta`` (1e6) or ``Ribu`` (1e3).

    Args:
        harga: Price text. Non-string input (None, NaN, numbers) is accepted.

    Returns:
        int | None: The price as an integer, or None when ``harga`` is not a
        string, contains no ``<number> <unit>`` pair, or the number is malformed
        (e.g. ``'1,2,3 Juta'``).
    """
    # NaN is truthy, so a plain truthiness check would let it reach re.search and crash.
    if not isinstance(harga, str):
        return None

    # NOTE(review): a '.' thousands separator (e.g. '1.500 Juta') is not part of
    # the number pattern; only the digits after the dot would be parsed. Confirm
    # the source never uses that format.
    match = re.search(r'([\d,]+)\s*(Miliar|Juta|Ribu)', harga)
    if match is None:
        return None

    multipliers = {'Miliar': 10**9, 'Juta': 10**6, 'Ribu': 10**3}
    try:
        value = float(match.group(1).replace(',', '.'))
    except ValueError:
        return None

    # round() instead of int(): the float product can land just below the exact
    # value (1.005 * 1000 == 1004.9999999999999) and truncation would lose a unit.
    return round(value * multipliers[match.group(2)])

def fill_periode_kepemilikan(row):
    """Return the ownership period for one listing row.

    Listings whose ``tipe_iklan`` is ``'jual'`` always get ``'Pemilik'``; every
    other row keeps its existing ``periode_kepemilikan`` value.
    """
    is_for_sale = row['tipe_iklan'] == 'jual'
    return 'Pemilik' if is_for_sale else row['periode_kepemilikan']

# Indonesian month names (lower-case) mapped to month numbers. Parsing with this
# table avoids depending on the 'id_ID.UTF-8' system locale being installed.
_INDONESIAN_MONTHS = {
    'januari': 1,
    'februari': 2, 'pebruari': 2,
    'maret': 3,
    'april': 4,
    'mei': 5,
    'juni': 6,
    'juli': 7,
    'agustus': 8,
    'september': 9,
    'oktober': 10,
    'november': 11, 'nopember': 11,
    'desember': 12,
}


def convert_to_datetime(text):
    """Convert an Indonesian date such as ``'27 Mei 2024'`` to ``'2024-05-27'``.

    Unlike a ``strptime``-based parser this does not call ``locale.setlocale``,
    so it neither changes process-wide locale state nor fails on systems where
    the Indonesian locale is missing.

    Args:
        text: Date in ``<day> <Indonesian month name> <4-digit year>`` form.
            Month names are matched case-insensitively.

    Returns:
        str: The date formatted as ``YYYY-MM-DD``.

    Raises:
        TypeError: If ``text`` is not a string.
        ValueError: If the text does not match the expected form, the month
            name is unknown, or the day is invalid for that month.
    """
    if not isinstance(text, str):
        raise TypeError(f'expected a date string, got {type(text).__name__}')

    match = re.fullmatch(r'\s*(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s*', text)
    if match is None:
        raise ValueError(f"date {text!r} does not match format '<day> <month> <year>'")

    day, month_name, year = match.groups()
    month = _INDONESIAN_MONTHS.get(month_name.lower())
    if month is None:
        raise ValueError(f'unknown month name {month_name!r} in date {text!r}')

    # datetime() validates the day/month/year combination (raises ValueError).
    return datetime(int(year), month, int(day)).date().isoformat()

def update_agen(row):
    """Blank out the company fields of independent agents.

    When ``row['nama_perusahaan']`` is a string containing
    ``'Independent Property Agent'``, both ``nama_perusahaan`` and ``alamat``
    are set to None. The row is modified in place and returned.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with ``nama_perusahaan``
            and ``alamat`` entries.

    Returns:
        The same ``row`` object.
    """
    company = row['nama_perusahaan']
    # A missing company (None/NaN) is left untouched; `in` on a non-string would
    # raise TypeError, e.g. when the function is re-applied to processed rows.
    if isinstance(company, str) and 'Independent Property Agent' in company:
        row['nama_perusahaan']  = None
        row['alamat']           = None
    return row

def clean_phone_number(phone_number):
    """Strip every non-digit character from a phone number string.

    Args:
        phone_number: Phone number text, e.g. ``'+62 877-3666 6636'``.

    Returns:
        str: The digits only. A falsy input (None or ``''``) is returned
        unchanged.
    """
    if not phone_number:
        return phone_number
    # \D already covers tabs and spaces, so no further replace() calls are needed.
    return re.sub(r'\D', '', phone_number)


- Preprocessing

In [5]:
def preprocessing(df):
    """Clean the raw scraped listings and return a new DataFrame.

    Steps, in order (the order matters):
      1. Keep only rows whose ``id_iklan`` passes ``id_iklan()``.
      2. Normalise columns: ``harga`` via ``convert_harga``, ``diperbarui`` via
         ``convert_to_datetime``, ``luas_bangunan`` via ``extract_number``,
         ``periode_kepemilikan`` via ``fill_periode_kepemilikan``; fill missing
         ``carport`` with 0 and missing ``sertifikat`` / ``kondisi_properti`` /
         ``kondisi_perabotan`` with ``'unknown'``.
      3. Drop every row that still contains a null.
      4. Only then parse ``luas_tanah``, so rows where no number is found stay
         in the result with a missing value instead of being dropped.
      5. Stringify ``timestamp``, reduce ``nomor_telepon`` to digits, and cast
         the numeric columns to nullable ``Int64``.
      6. Blank the company fields of independent agents via ``update_agen``.

    Args:
        df: Raw listings DataFrame. It is not modified.

    Returns:
        pandas.DataFrame: The cleaned listings.
    """
    # Filter first, then copy: the column assignments below then operate on an
    # independent frame (no SettingWithCopyWarning on a filtered slice) and rows
    # that are discarded are never copied.
    df = df[df['id_iklan'].apply(id_iklan)].copy()

    df['harga']                 = df['harga'].apply(convert_harga)
    df['carport']               = df['carport'].fillna(0)
    df['diperbarui']            = df['diperbarui'].apply(convert_to_datetime)
    df['sertifikat']            = df['sertifikat'].fillna('unknown')
    df['luas_bangunan']         = df['luas_bangunan'].apply(extract_number)
    df['kondisi_properti']      = df['kondisi_properti'].fillna('unknown')
    df['kondisi_perabotan']     = df['kondisi_perabotan'].fillna('unknown')
    df['periode_kepemilikan']   = df.apply(fill_periode_kepemilikan, axis=1)

    # Rows with an unparseable harga / luas_bangunan (now None) or any other
    # missing field are removed here.
    df                  = df.dropna()
    # Parsed after dropna on purpose: a None result must not remove the row.
    df['luas_tanah']    = df['luas_tanah'].apply(extract_number)
    df['timestamp']     = df['timestamp'].astype(str)
    df['nomor_telepon'] = df['nomor_telepon'].astype(str)
    df['nomor_telepon'] = df['nomor_telepon'].apply(clean_phone_number)

    # NOTE(review): nomor_telepon is cast to an integer below, which drops any
    # leading zero from the number. Confirm that is acceptable downstream.
    numeric_cols = ['luas_bangunan', 'kamar_tidur', 'kamar_mandi',
                    'harga', 'luas_tanah', 'carport', 'id_agen',
                    'nomor_telepon', 'terjual', 'tersewa']

    for col in numeric_cols:
        df[col] = df[col].astype(pd.Int64Dtype())

    df = df.apply(update_agen, axis=1)

    return df


- Menyimpan data hasil preprocessing

In [6]:
def store(df, hunian_cols, rumah_cols, apartemen_cols, agen_cols, perusahaan_cols):
    """Split the cleaned frame into tables and write each one as JSON.

    Files are written into ``__PATH_FOLDER__`` as record-oriented, 4-space
    indented JSON:

      * ``data_clean.json`` - the full frame (ISO date format requested)
      * ``hunian.json``     - ``hunian_cols`` of every row
      * ``rumah.json``      - ``rumah_cols`` of rows with tipe_properti 'Rumah'
      * ``apartemen.json``  - ``apartemen_cols`` of rows with tipe_properti 'Apartemen'
      * ``agen.json``       - ``agen_cols``, first row per ``id_agen``
      * ``perusahaan.json`` - ``perusahaan_cols``, first row per
        ``nama_perusahaan``, rows containing nulls removed
    """
    tipe = df['tipe_properti']

    # Build every table before touching the disk.
    tables = [
        ('hunian.json', df[hunian_cols]),
        ('rumah.json', df.loc[tipe == 'Rumah', rumah_cols]),
        ('apartemen.json', df.loc[tipe == 'Apartemen', apartemen_cols]),
        ('agen.json', df[agen_cols].drop_duplicates(subset=['id_agen'], keep='first')),
        (
            'perusahaan.json',
            df[perusahaan_cols]
            .drop_duplicates(subset=['nama_perusahaan'], keep='first')
            .dropna(),
        ),
    ]

    # The full dump is the only export that passes date_format='iso'.
    df.to_json(
        os.path.join(__PATH_FOLDER__, 'data_clean.json'),
        orient='records', lines=False, indent=4, date_format='iso',
    )
    for filename, table in tables:
        table.to_json(
            os.path.join(__PATH_FOLDER__, filename),
            orient='records', lines=False, indent=4,
        )

- main preprocessing

In [7]:
# ANSI escape sequences used to colour the console summary printed by
# main_preprocessing(); __RESET__ switches back to the default colour.
__RED__     = '\033[91m'
__GREEN__   = '\033[92m'
__RESET__   = '\033[0m'

def main_preprocessing():
    """Run the full pipeline: load raw data, clean it, export it, and log it.

    Reads ``data_raw.json`` from ``__PATH_FOLDER__``, cleans it with
    ``preprocessing()``, writes the tables with ``store()``, prints a coloured
    summary, and writes ``log.txt`` (in the working directory) containing the
    same totals plus, for every timestamp present in the cleaned data, the
    number of raw rows and cleaned rows carrying that timestamp.
    """
    raw_path = os.path.join(__PATH_FOLDER__, 'data_raw.json')
    df_raw = pd.read_json(raw_path)
    df_clean = preprocessing(df_raw)

    store(df_clean, __HUNIAN__, __RUMAH__, __APARTEMEN__, __AGEN__, __PERUSAHAAN__)

    total_raw = df_raw.shape[0]
    total_clean = df_clean.shape[0]
    total_deleted = total_raw - total_clean

    # Console summary (coloured).
    print(
        f'{__RED__}.................................{__RESET__}\n'
        f'.................................\n'
        f'{__GREEN__}.................................{__RESET__}\n'
        '==== preprocessing completed ====\n\n'
        f'-> total raw data\t: {total_raw}\n'
        f'-> total data deleted\t: {__RED__}{total_deleted}{__RESET__}\n'
        f'-> total clean data\t: {__GREEN__}{total_clean}{__RESET__}\n'
    )

    # Timestamps in order of first appearance in the cleaned data.
    clean_timestamps = df_clean['timestamp'].unique()

    header_lines = [
        "Scraping-Preprocessing Activity Log (Started At)\n",
        "-------------------------------------------\n",
        "     Started At    | Scraping ---> cleaned |\n",
        "-------------------------------------------\n",
    ]

    with open('log.txt', 'w') as log_file:
        # Plain-text totals (no colour codes in the file).
        log_file.write(
            f'-> total raw data\t\t: {total_raw}\n'
            f'-> total data deleted\t: {total_deleted}\n'
            f'-> total clean data\t\t: {total_clean}\n\n'
        )
        for line in header_lines:
            log_file.write(line)
        # One line per timestamp: rows scraped vs. rows that survived cleaning.
        for started_at in clean_timestamps:
            scraped = (df_raw['timestamp'] == started_at).sum()
            kept = (df_clean['timestamp'] == started_at).sum()
            log_file.write(f"{started_at}| {scraped} ---> {kept}\n")

In [8]:
# Entry point: run the whole pipeline (read data_raw.json, clean, export the
# JSON tables, print the summary and write log.txt).
main_preprocessing()

[91m.................................[0m
.................................
[92m.................................[0m
==== preprocessing completed ====

-> total raw data	: 4012
-> total data deleted	: [91m210[0m
-> total clean data	: [92m3802[0m



- Data hunian

In [9]:
# Reload the hunian table exported by store() and preview its first five rows.
hunian = pd.read_json(os.path.join(__PATH_FOLDER__, 'hunian.json'))
hunian.head(5)

Unnamed: 0,id_iklan,tipe_properti,luas_bangunan,kamar_tidur,kamar_mandi,lokasi,sertifikat,tipe_iklan,periode_kepemilikan,harga,diperbarui,id_agen
0,aps3187118,Apartemen,112,3,2,Menteng,SHM - Sertifikat Hak Milik,jual,Pemilik,5500000000,2024-05-27,1859657
1,aps3187337,Apartemen,33,2,1,Cempaka Putih,"Lainnya (PPJB,Girik,Adat,dll)",jual,Pemilik,550000000,2024-05-09,947615
2,aps3187554,Apartemen,50,2,1,Sawah Besar,"Lainnya (PPJB,Girik,Adat,dll)",jual,Pemilik,800000000,2024-02-02,1527108
3,aps3187588,Apartemen,79,1,1,Kemayoran,SHM - Sertifikat Hak Milik,jual,Pemilik,2400000000,2024-02-02,48760
4,hos16255795,Rumah,80,3,3,Gunung Sahari,SHM - Sertifikat Hak Milik,jual,Pemilik,1000000000,2024-02-02,1356696


- Data rumah

In [10]:
# Reload the rumah table (rows with tipe_properti == 'Rumah') and preview five rows.
rumah = pd.read_json(os.path.join(__PATH_FOLDER__, 'rumah.json'))
rumah.head(5)

Unnamed: 0,id_iklan,luas_tanah,carport,taman
0,hos16255795,49,1,Tidak
1,hos16255835,400,2,Tidak
2,hos16251736,378,0,Ya
3,hos16252133,376,1,Tidak
4,hos16252488,515,0,Tidak


- Data apartemen

In [11]:
# Reload the apartemen table (rows with tipe_properti == 'Apartemen') and preview five rows.
apartemen = pd.read_json(os.path.join(__PATH_FOLDER__, 'apartemen.json'))
apartemen.head(5)

Unnamed: 0,id_iklan,kondisi_properti,kondisi_perabotan
0,aps3187118,Bagus,Furnished
1,aps3187337,Bagus,Furnished
2,aps3187554,Bagus,Furnished
3,aps3187588,Bagus,Furnished
4,aps3187135,Bagus,Furnished


- Data agen

In [12]:
# Reload the agen table (one row per id_agen) and preview five rows.
# NOTE(review): the rendered output shows agent names and phone numbers;
# consider clearing this cell's output before sharing the notebook.
agen = pd.read_json(os.path.join(__PATH_FOLDER__, 'agen.json'))
agen.head(5)

Unnamed: 0,id_agen,nama_agen,nomor_telepon,terjual,tersewa,nama_perusahaan
0,1859657,Grace,6287736666636,0,0,Xavier Marks Menteng
1,947615,Junison Siagian,6282368494901,41,61,WM PRO
2,1527108,maria lie,62811135705,3,3,Asia One Harapan Indah
3,48760,Vemby Intan,628996060307,2,0,Dream Home
4,1356696,novie asiaonehi asiaonehi,6287810000275,1,1,Asia One Harapan Indah


- Data perusahaan

In [13]:
# Reload the perusahaan table (one row per nama_perusahaan, no nulls) and preview five rows.
perusahaan = pd.read_json(os.path.join(__PATH_FOLDER__, 'perusahaan.json'))
perusahaan.head(5)

Unnamed: 0,nama_perusahaan,alamat
0,Xavier Marks Menteng,"Jl. Cikini Raya No.121C, RW.4, Cikini, Kec. Me..."
1,WM PRO,"Apartemen Taman Rasuna Kawasan Epicentrum,Kuni..."
2,Asia One Harapan Indah,"Rukan Mega Bulevard Blok RV-2 No. 6A, Medan Sa..."
3,Dream Home,Ruko Alexandrite Blok ALX 3 No. 17 Gading Serp...
4,ThayProperty,"Jl. Pondok Kelapa, Duren Sawit, Jakarta Timur"
