In [15]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import sys
import datetime as dt
sys.path.append(str(Path.cwd().parent))

from src.io import read_json

In [4]:
# Directory with the raw JSON files of the 2026-01-29 snapshot, resolved
# relative to the parent of the current working directory.
PATH_DATA = Path.cwd().parent / "data" / "2026-01-29" / "urls" / "raw"

# Load the three listing files in a fixed order: apartments, houses, land.
data_ap, data_h, data_l = (
    read_json(PATH_DATA / file_name)
    for file_name in ("apartments.json", "houses.json", "land.json")
)

2026-01-31 14:13:25 - io - INFO - read_json:31 - Successfully loaded 21130
2026-01-31 14:13:25 - io - INFO - read_json:31 - Successfully loaded 5711
2026-01-31 14:13:25 - io - INFO - read_json:31 - Successfully loaded 5308


In [95]:
# One DataFrame per loaded JSON payload (apartments, houses, land).
df_ap, df_h, df_l = (
    pd.DataFrame(records) for records in (data_ap, data_h, data_l)
)

In [96]:
# Multipliers applied to a numeric price to express it in EUR, keyed by the
# currency token extracted from the price string.
# "MDL" and "lei" share one rate (presumably both denote the Moldovan leu).
# NOTE(review): the rates are hard-coded; confirm they match the snapshot date.
exchange_to_eur = {
    "€": 1,
    "$": 0.84,
    **dict.fromkeys(("MDL", "lei"), 0.05),
}

def get_currency(s: pd.Series) -> pd.Series:
    """Extract the currency token from price-like strings.

    Each value is cast to pandas' string dtype and split on whitespace; the
    last token is taken as the currency (e.g. "89 900 €" -> "€").  If at
    least one value in the series contains a "/", every last token is
    additionally cut at its first "/" (e.g. "€/m²" -> "€").

    Parameters
    ----------
    s : pd.Series
        Raw price strings; missing values are allowed.

    Returns
    -------
    pd.Series
        The currency token for each row, missing where no token exists.
    """
    text = s.astype("string")
    last_token = text.str.split().str[-1]

    # The slash check is made once for the whole series, not per row.
    if not text.str.contains("/", regex=False, na=False).any():
        return last_token
    return last_token.str.split("/", regex=False).str[0]


def get_price_eur(p: pd.Series, c: pd.Series, e: dict) -> pd.Series:
    """Convert price strings to numeric amounts scaled by a per-currency rate.

    The last whitespace-separated token of every value in ``p`` is discarded
    (the currency label in strings such as "89 900 €"), the remaining tokens
    are concatenated and parsed as a number, and the result is multiplied by
    the rate found by looking up the matching value of ``c`` in ``e``.

    Parameters
    ----------
    p : pd.Series
        Raw price strings.
    c : pd.Series
        Currency token per row, aligned with ``p`` by index.
    e : dict
        Mapping from currency token to multiplier.

    Returns
    -------
    pd.Series
        Converted amounts; NaN where the number cannot be parsed or the
        currency has no entry in ``e``.
    """
    tokens = p.astype("string").str.split()
    # Drop the trailing token and glue the digit groups back together.
    digits = tokens.str[:-1].str.join("").str.strip()
    amount = pd.to_numeric(digits, errors="coerce")
    rate = c.map(e)
    return amount * rate

In [None]:
# Listings whose EUR price is not above this value are dropped at the end.
MIN_PRICE_EUR = 3000

# Lower-cased month tokens as they appear in the "date" strings, mapped to
# two-digit month numbers ("mai" carries no trailing dot).
month_map = {
    "ian.": "01", "feb.": "02", "mar.": "03", "apr.": "04",
    "mai": "05", "iun.": "06", "iul.": "07", "aug.": "08",
    "sept.": "09", "oct.": "10", "nov.": "11", "dec.": "12",
}

# Rebuild from the raw records so this cell can be re-run safely: the steps
# below overwrite "price" with numbers and drop "square_m2", so running them
# again on an already-cleaned df_ap would raise a KeyError.
df_ap = pd.DataFrame(data_ap).dropna(subset=["url"]).reset_index(drop=True)

# Currency tokens must be read before "price" is replaced by numeric values.
df_ap["currency"] = get_currency(df_ap["price"])
df_ap["currency_m2"] = get_currency(df_ap["square_m2"])
df_ap["price"] = get_price_eur(p=df_ap["price"], c=df_ap["currency"], e=exchange_to_eur)
df_ap["price_m2"] = get_price_eur(p=df_ap["square_m2"], c=df_ap["currency_m2"], e=exchange_to_eur)

# Keep only the part of the URL before the first "?" so that URLs differing
# only in their query string compare equal when de-duplicating.
df_ap["url"] = df_ap["url"].astype("string").str.split("?", n=1).str[0]

# replace month name with number
s = df_ap["date"].astype("string").str.lower()
for k, v in month_map.items():
    s = s.str.replace(k, v, regex=False)

# now parse: "29 01 2026, 21:20"; values not matching the format become NaT
df_ap["date"] = pd.to_datetime(s, format="%d %m %Y, %H:%M", errors="coerce")

df_ap = (
    df_ap.drop(columns=["square_m2"])
    .drop_duplicates(subset=["url"])
    .drop_duplicates(subset=["description", "price", "price_m2"])
    .reset_index(drop=True)
)

# .copy() detaches the filtered frame so later column assignments on df_ap do
# not trigger SettingWithCopyWarning. The index is intentionally not reset.
df_ap = df_ap[df_ap["price"] > MIN_PRICE_EUR].copy()
df_ap.head()

Unnamed: 0,page,url,description,price,date,currency,currency_m2,price_m2
0,2,https://999.md/ro/103180455,"Apartament cu 2 camere, Centru, Durlești, Chiș...",89900.0,2026-01-29 21:20:00,€,€,1198.0
1,2,https://999.md/ro/103311500,"Apartament cu 4 camere, Ciocana, Chișinău, Chi...",110900.0,2026-01-29 21:19:00,€,€,1336.0
2,2,https://999.md/ro/103257928,"Apartament cu 2 camere, Centru, Chișinău, Chiș...",124900.0,2026-01-29 21:19:00,€,€,1759.0
3,2,https://999.md/ro/103350202,"Apartament cu 1 cameră, Buiucani, Chișinău, Ch...",89500.0,2026-01-29 21:19:00,€,€,2355.0
4,2,https://999.md/ro/102799482,"Apartament cu 1 cameră, Telecentru, Chișinău, ...",74900.0,2026-01-29 21:19:00,€,€,1560.0


In [98]:
# Display the dimensions of df_ap as (rows, columns).
df_ap.shape

(13512, 8)