# initial cleanup

In [86]:
import pandas as pd
import numpy as np
import re

In [90]:
#Pipeline functions

def cl_price(df):
    #price
    print("clean price")
    df['price'] = df['price'].str.replace(r"[^\d]", "", regex=True)
    df["price"] = df["price"].replace("", np.nan).astype("Int64")

    return df

def cl_ppm(df):
    #ppm
    print("clean ppm")
    df['price_per_meter'] = df['price_per_meter'].str.replace(r"[^\d]", "", regex=True)

    return df

def cl_floor(df):
    #floor
    print("clean floor")
    df["floor"] = df["floor"].str.split("/").str[0]
    df.loc[df["floor"].astype(str).str.lower().str.contains("parter", na=False), "floor"] = 0

    return df

def cl_furnished(df):
    #furnished
    print("clean furnished")
    df["furnished"] = df["furnished"].replace({"Tak": 1, "Nie": 0}).astype("Int64")

    return df

def cl_adress(df):
    #adress
    print("clean adress")
    address_parts = df["district"].str.split(",", expand=True)
    
    address_parts = address_parts.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    mask = address_parts.iloc[:, 2].str.lower() == "mazowieckie"
    address_parts.loc[mask, [4, 3, 2, 1]] = address_parts.loc[mask, [3, 2, 1, 0]].values
    
    del address_parts[4]
    address_parts.columns = ["street", "neighbourhood", "district", "voivodeship"]
    address_parts
    df = df.join(address_parts, rsuffix="_new")
    
    del df['district']

    return df

In [94]:
data_oto = pd.read_csv("../data/raw/raw_otodom_selenium_23032025211344.csv", sep=';')

def oto_cleanup_pipeline(df):
    df = cl_price(df)
    df = cl_ppm(df)
    df = cl_floor(df)
    df = cl_furnished(df)
    df = cl_adress(df)

    return df

data_oto = oto_cleanup_pipeline(data_oto)
data_oto.head(5)

clean price
clean ppm
clean floor
clean furnished
clean adress


  df["furnished"] = df["furnished"].replace({"Tak": 1, "Nie": 0}).astype("Int64")
  address_parts = address_parts.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,source,price,price_per_meter,area,rooms,floor,market_type,furnished,description,date,url,title,street,neighbourhood,district_new,voivodeship
0,otodom,1409000.0,17617.0,79.98,4.0,0.0,pierwotny,1,4-pokojowe mieszkanie numer 4 na parterze w bu...,2025-03-23,https://www.otodom.pl/pl/oferta/4-pokojowe-mie...,4-pokojowe mieszkanie 79m2 + ogródek,ul. Gratyny,Kępa Zawadowska,Wilanów,mazowieckie
1,otodom,1495000.0,21083.0,70.91,3.0,3.0,wtórny,1,,2025-03-23,https://www.otodom.pl/pl/oferta/wilanow-70-91-...,"Wilanów, 70,91 m2, 3/4 pokoje, garaż, balkon",Błonia Wilanowskie,Błonia Wilanowskie,Wilanów,mazowieckie
2,otodom,685000.0,22154.0,30.92,2.0,,wtórny,1,,2025-03-22,https://www.otodom.pl/pl/oferta/2024r-sypialni...,"2024r, sypialnia, ładne osiedle. Blisko M3",ul. Podskarbińska,Grochów,Praga-Południe,mazowieckie
3,otodom,,,66.1,3.0,0.0,pierwotny,1,3-pokojowe mieszkanie numer B001 na parterze w...,2025-03-23,https://www.otodom.pl/pl/oferta/3-pokojowe-mie...,3-pokojowe mieszkanie 66m2 + ogródek Bezpośrednio,ul. Polska,Siekierki,Mokotów,mazowieckie
4,otodom,768000.0,12800.0,60.0,4.0,,pierwotny,1,,2025-03-23,https://www.otodom.pl/pl/oferta/4-pok-w-znakom...,"4 pok. w znakomitej cenie , OGRÓDEK",ul. Posag 7 Panien,Szamoty,Ursus,mazowieckie


In [95]:
data_olx = pd.read_csv("../data/raw/raw_olx_23032025155859.csv", sep=';')

def oto_cleanup_pipeline(df):
    df = cl_price(df)
    df = cl_ppm(df)
    df = cl_floor(df)
    df = cl_furnished(df)

    return df

data_olx = oto_cleanup_pipeline(data_olx)
data_olx.head(5)

clean price
clean ppm
clean floor
clean furnished


Unnamed: 0,price,price_per_meter,area,rooms,floor,market_type,furnished,building_type,description,district,date,url
0,1249000,1864179,67,4.0,10,Wtórny,,Blok,OPIS\nSPRZEDAJE WAŚCICIEL\nMieszkanie położne ...,Mokotów,2025-03-16,https://www.olx.pl/d/oferta/mokotow-4-pokoje-n...
1,699000,1294444,54,3.0,0,Wtórny,,Apartamentowiec,OPIS\nDo sprzedaży przestronne mieszkanie na O...,Białołęka,,https://www.olx.pl/d/oferta/idealne-mieszkanie...
2,378000,15120,25,1.0,1,Wtórny,,Blok,OPIS\nNa sprzedaż nowy lokal / mikro-kawalerka...,Włochy,2025-03-22,https://www.olx.pl/d/oferta/nowa-kawalerka-swi...
3,739000,1919481,3850,2.0,10,Wtórny,,Blok,OPIS\nDo sprzedania dwupokojowe mieszkanie prz...,Praga-Południe,2025-03-21,https://www.olx.pl/d/oferta/gotowe-do-wprowadz...
4,580000,1903512,3047,2.0,6,Wtórny,,Blok,OPIS\nDo sprzedania mieszkanie które świetnie ...,Wola,2025-03-22,https://www.olx.pl/d/oferta/mieszkanie-dwa-pok...
