In [14]:
import pandas as pd
import numpy as np
import re

In [15]:
# Read the v2_3 processed CSV, decoding its bytes as Latin-1.
df = pd.read_csv(
    '../../data/processed/realestates_kh_v2_3.csv',
    encoding='latin1',
)

In [16]:
# Treat zero and empty-string land areas as missing values (NaN).
land_area_col = df['land_area']
land_area_col = land_area_col.replace(0.0, np.nan)
land_area_col = land_area_col.replace('', np.nan)
df['land_area'] = land_area_col

In [17]:
# Missing-value count per column, before land_area is re-extracted.
df.isna().sum()

Unnamed: 0                2
id                        1
headline                  1
price_display            16
rent_display           3513
bedrooms                943
bathrooms               950
land_area               617
thumbnail_url             2
thumbnail_urls            2
location                  2
address_subdivision       2
address_locality          2
address_line_2           12
address_line_1          149
category_name             2
is_parent                 3
type                      2
information              39
source_url                2
language                  2
latitude                  2
longitude                 3
dtype: int64

In [18]:
# def extract_land_area(text):
#     if pd.isna(text):
#         return None
    
#     # Step 1: Direct land area extraction (e.g., "100 sqm")
#     direct_pattern = r'(\d+\.?\d*)\s*(?:sqm|m²|m2|square\s*meters?)\b'
#     direct_match = re.search(direct_pattern, text, re.IGNORECASE)
#     if direct_match:
#         return float(direct_match.group(1))
    
#     # Step 2: Extract land dimensions (e.g., "4m x 20m") with contextual keywords
#     dim_pattern = r'(\d+\.?\d*)\s*(?:m\s*)?[x*]\s*(\d+\.?\d*)\s*(?:m\b)'
#     dim_matches = list(re.finditer(dim_pattern, text, re.IGNORECASE))
    
#     for match in dim_matches:
#         start, end = match.span()
#         window = text[max(0, start-30):min(len(text), end+30)].lower()
#         # Check for contextual keywords near the dimensions
#         if re.search(r'\b(?:land|plot|ground|property|size)\b', window):
#             length = float(match.group(1))
#             width = float(match.group(2))
#             return length * width
    
#     # Step 3: Fallback to the first dimension pattern found (if any)
#     if dim_matches:
#         length = float(dim_matches[0].group(1))
#         width = float(dim_matches[0].group(2))
#         return length * width
    
#     return None

In [19]:
# A number is either thousands-grouped ("1,200", "1,200.50") or plain with an
# optional decimal point/comma ("100", "9,2", "9.5"). The grouped form is tried
# first so that "1,200 sqm" is read as 1200 rather than 1.2.
_GROUPED_NUMBER = r'\d{1,3}(?:,\d{3})+(?:\.\d+)?'
_NUMBER = rf'{_GROUPED_NUMBER}|\d+(?:[.,]\d*)?'

# "<number> sqm" / "m²" / "m2" / "square meter(s)"
_DIRECT_AREA_RE = re.compile(
    rf'({_NUMBER})\s*(?:sqm|m²|m2|square\s*meters?)\b', re.IGNORECASE
)
# "<number>[m] x <number> m" (also accepts "*" as the separator)
_DIMENSIONS_RE = re.compile(
    rf'({_NUMBER})\s*(?:m\s*)?[x*]\s*({_NUMBER})\s*(?:m\b)', re.IGNORECASE
)
# Words that suggest a nearby dimension describes the land itself.
_LAND_CONTEXT_RE = re.compile(r'\b(?:land|plot|ground|property|size)\b')
# Number of characters inspected on each side of a dimension match.
_CONTEXT_CHARS = 30


def _parse_number(num_str):
    """Convert a matched number string to float, handling both comma conventions."""
    if re.fullmatch(_GROUPED_NUMBER, num_str):
        # Commas followed by exactly three digits are thousands separators.
        return float(num_str.replace(',', ''))
    # Otherwise a comma is a decimal mark ("9,2" -> 9.2).
    return float(num_str.replace(',', '.'))


def _accept_area(area, min_area):
    """Return ``area`` unless it is below ``min_area``, in which case return None."""
    return area if area >= min_area else None


def extract_land_area(text, min_area=30):
    """Extract a land area from free-text listing information.

    Strategy, in order:
      1. The first explicit area such as "100 sqm", "9,2 m2" or "1,200 m²".
      2. The first "L x W" dimension pair (e.g. "4m x 20m") that has a land-related
         keyword within 30 characters on either side; its area is L * W.
      3. Otherwise the first "L x W" dimension pair found anywhere in the text.

    Only the first candidate selected by this order is considered; if it is
    smaller than ``min_area`` the function returns None rather than trying
    later candidates.

    Parameters
    ----------
    text : str or missing value
        Listing description. Anything that is not a ``str`` (NaN, None, numbers)
        yields None.
    min_area : float, default 30
        Smallest area accepted as a land area; smaller values are rejected.

    Returns
    -------
    float or None
        The extracted area, or None when nothing acceptable is found.
    """
    if not isinstance(text, str):
        return None

    # Step 1: direct area mention.
    direct_match = _DIRECT_AREA_RE.search(text)
    if direct_match:
        return _accept_area(_parse_number(direct_match.group(1)), min_area)

    # Steps 2 and 3: dimension pairs.
    dim_matches = list(_DIMENSIONS_RE.finditer(text))
    if not dim_matches:
        return None

    chosen = dim_matches[0]  # Step 3 fallback: first pair found.
    for match in dim_matches:
        start, end = match.span()
        window = text[max(0, start - _CONTEXT_CHARS):end + _CONTEXT_CHARS].lower()
        if _LAND_CONTEXT_RE.search(window):
            chosen = match  # Step 2: first pair with a land keyword nearby.
            break

    area = _parse_number(chosen.group(1)) * _parse_number(chosen.group(2))
    return _accept_area(area, min_area)

In [20]:
# Re-extract land_area from the free-text `information` column for rows whose
# current value is missing or implausibly small (< 30).
#
# pd.api.types.is_numeric_dtype() inspects dtypes/arrays, not scalar values, so
# calling it on each element returned False and small values were never
# selected. Compare against a numeric view of the column instead; entries that
# cannot be parsed as numbers become NaN in that view and compare False, so
# they are left untouched unless they are already missing.
MIN_PLAUSIBLE_LAND_AREA = 30
land_area_numeric = pd.to_numeric(df['land_area'], errors='coerce')
rows_to_reprocess = df['land_area'].isna() | (land_area_numeric < MIN_PLAUSIBLE_LAND_AREA)

# extract_land_area returns None when nothing usable is found; astype(float)
# turns those into NaN so the selected rows end up with a number or NaN.
df.loc[rows_to_reprocess, 'land_area'] = (
    df.loc[rows_to_reprocess, 'information']
    .apply(extract_land_area)
    .astype(float)
)

In [21]:
# Missing-value count per column, after land_area has been re-extracted.
df.isna().sum()

Unnamed: 0                2
id                        1
headline                  1
price_display            16
rent_display           3513
bedrooms                943
bathrooms               950
land_area               615
thumbnail_url             2
thumbnail_urls            2
location                  2
address_subdivision       2
address_locality          2
address_line_2           12
address_line_1          149
category_name             2
is_parent                 3
type                      2
information              39
source_url                2
language                  2
latitude                  2
longitude                 3
dtype: int64

In [22]:
# Write the updated frame to the v2_3_1 CSV, omitting the DataFrame index.
df.to_csv(
    '../../data/processed/realestates_kh_v2_3_1.csv',
    index=False,
)