# Imports

In [28]:
import numpy as np
import pandas as pd

Define helper functions

In [32]:
def categorize(df, categories):
    """Replace each listed column with its integer category codes.

    Args:
        df: DataFrame to modify; the listed columns are overwritten in place.
        categories: Iterable of column names to encode.

    Returns:
        The same ``df`` object, to allow call chaining.

    Note:
        Codes come from ``pd.Categorical(...).codes``; pandas assigns ``-1``
        to missing values.
    """
    for column in categories:
        as_categorical = pd.Categorical(df[column])
        df[column] = as_categorical.codes
    return df

def numberize(df, numerics):
    """Cast each listed column to float.

    Args:
        df: DataFrame to modify; the listed columns are overwritten in place.
        numerics: Iterable of column names to convert with ``astype("float")``.

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    for column in numerics:
        as_float = df[column].astype("float")
        df[column] = as_float
    return df

def clean_year(year, reference_year=2024):
    """Convert a calendar year into an age in years.

    Despite its name, this function does not validate or repair the value;
    it only computes ``reference_year - year``.

    Args:
        year: Calendar year (any value supporting subtraction from a number).
        reference_year: Year the age is measured against. Defaults to 2024,
            the value that was previously hard-coded, so existing calls such
            as ``series.apply(clean_year)`` behave exactly as before.

    Returns:
        ``reference_year - year``.
    """
    return reference_year - year

# Function to clean year_built values
def clean_year_built(year):
    """Coerce a raw ``year_built`` value to a four-digit integer year.

    The value is parsed as a number and then forced to exactly four digits:
    longer values are truncated to their first four digits (19000000 -> 1900)
    and shorter ones are right-padded with zeros (19 -> 1900).

    Args:
        year: Raw value; an int, a float, or a numeric string.

    Returns:
        A four-digit ``int``, or ``pd.NA`` when the value is missing,
        non-numeric, infinite, or negative.
    """
    try:
        # int(float(...)) handles ints, floats and strings such as "1990.0".
        year = int(float(year))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # unsupported types; OverflowError: +/-infinity.
        return pd.NA

    if year < 0:
        # A negative year is not meaningful here, and the "-" sign would be
        # miscounted as a digit by the length checks below.
        return pd.NA

    digits = str(year)
    if len(digits) > 4:
        return int(digits[:4])
    if len(digits) < 4:
        # Pad with trailing zeros if fewer than four digits.
        return int(digits.ljust(4, "0"))
    return year

Load and prepare the data

In [38]:
# Calendar years outside this range are treated as data-entry errors
# (year_built has unrealistic values, for example 19 million).
MIN_PLAUSIBLE_YEAR = 1800
MAX_PLAUSIBLE_YEAR = 2030

df = pd.read_csv("flatfox.csv", low_memory=False)

# Keep rental offers only.
df = df[df["offer_type"] == "RENT"]

# Remove rows with an implausible year_built. Comparisons against NaN are
# False, so rows with a missing year_built are kept and imputed below.
year_built = df["year_built"]
implausible_year_built = (year_built < MIN_PLAUSIBLE_YEAR) | (year_built > MAX_PLAUSIBLE_YEAR)
df = df[~implausible_year_built].copy()

# Alternative digit-based repair of year_built, currently disabled:
# df["year_built"] = df["year_built"].apply(clean_year_built)

# Impute missing values step by step. year_built must be filled BEFORE it is
# used as the fallback for year_renovated: passing both in a single fillna
# dict only works when the captured year_built Series happens to be a live
# view of the frame, which is not the case under pandas Copy-on-Write.
df["year_built"] = df["year_built"].fillna(df["year_built"].mean())
df["year_renovated"] = df["year_renovated"].fillna(df["year_built"])
# NaN in floor, number_of_rooms and livingspace is replaced by 0.
df = df.fillna({"floor": 0.0, "number_of_rooms": 0.0, "livingspace": 0.0})

# A renovation year earlier than the plausible minimum falls back to the
# construction year.
df.loc[df["year_renovated"] < MIN_PLAUSIBLE_YEAR, "year_renovated"] = df["year_built"]

# Express both years as ages (clean_year subtracts the year from 2024).
df["year_renovated"] = df["year_renovated"].apply(clean_year)
df["year_built"] = df["year_built"].apply(clean_year)

# Rows without a price cannot be used.
df = df.dropna(subset=["price_display"])

# Drop identifiers, URLs, free text and other columns that are not kept in
# the cleaned dataset.
df = df.drop(columns=[
    "pk",
    "slug",
    "url",
    "short_url",
    "moving_date",
    "rent_net",
    "rent_charges",
    "rent_gross",
    "offer_type",
    "submit_url",
    "status",
    "created",
    "reference",
    "ref_property",
    "ref_house",
    "ref_object",
    "alternative_reference",
    "published",
    "short_title",
    "public_title",
    "pitch_title",
    "description_title",
    "description",
    "attributes",
    "public_address",
    "video_url",
    "tour_url",
    "website_url",
    "live_viewing_url",
    "cover_image",
    "images",
    "documents",
    "agency",
    "rent_title",
    "surface_property",
    "surface_living",
    "surface_usable",
    "surface_usable_minimum",
    "volume",
    "space_display",
    "street",
])

# Columns encoded as integer category codes.
categories = [
    "object_category",
    "object_type",
    "price_display_type",
    "price_unit",
    "city",
    "moving_date_type"
]
# Columns cast to float.
numerics = [
    "is_furnished",
    "is_temporary",
    "is_selling_furniture",
    "reserved"
]
categorize(df, categories)
numberize(df, numerics)
df.describe()
#df.head()
#df.isna().sum()
#df.dtypes

Unnamed: 0,object_category,object_type,price_display,price_display_type,price_unit,number_of_rooms,floor,is_furnished,is_temporary,is_selling_furniture,zipcode,city,latitude,longitude,year_built,year_renovated,moving_date_type,reserved,livingspace
count,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0,20713.0
mean,2.306185,19.433496,1581.338387,0.953556,1.139526,1.927027,1.298846,0.14305,0.065273,0.023801,5563.757592,1120.005987,47.159021,8.024401,38.998547,31.567108,1.052914,0.000338,81.059962
std,2.741081,18.112954,1534.722897,0.21045,0.632149,1.83906,2.109293,0.350133,0.247013,0.152434,2743.831775,683.648115,0.404267,0.847758,21.681407,19.387146,0.802601,0.018381,673.658569
min,0.0,0.0,1.0,0.0,0.0,0.0,-8.0,0.0,0.0,0.0,1000.0,0.0,45.826182,5.991881,-2.0,-3.0,0.0,0.0,0.0
25%,0.0,2.0,398.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3097.0,500.0,46.956562,7.467781,38.946387,15.0,0.0,0.0,0.0
50%,0.0,16.0,1460.0,1.0,1.0,1.5,1.0,0.0,0.0,0.0,5430.0,1101.0,47.346442,8.207061,38.946387,38.946387,1.0,0.0,58.0
75%,5.0,37.0,2141.0,1.0,1.0,3.5,2.0,0.0,0.0,0.0,8152.0,1732.0,47.430642,8.587571,38.946387,38.946387,2.0,0.0,94.0
max,8.0,58.0,62539.0,1.0,4.0,10.5,31.0,1.0,1.0,1.0,9657.0,2157.0,47.793652,10.364311,224.0,224.0,2.0,1.0,90000.0


count all years: 21040

count 1500-2024: 20725

count 1800-2024: 20672


Write the cleaned data to CSV

In [39]:
# Persist the prepared frame; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="flatfox_clean.csv", index=False)

Create profiling report

In [40]:
from ydata_profiling import ProfileReport


def _write_profile(frame, html_path):
    """Build a profiling report for ``frame`` and save it as HTML at ``html_path``."""
    report = ProfileReport(frame, title="Flatfox Profiling")
    report.to_file(html_path)


# Raw export: the "created" and "published" columns are left out of the report.
_write_profile(
    pd.read_csv("flatfox.csv", low_memory=False).drop(columns=["created", "published"]),
    "flatfox_profiling.html",
)

# Cleaned dataset, re-read from the CSV written earlier in the notebook.
_write_profile(
    pd.read_csv("flatfox_clean.csv", low_memory=False),
    "flatfox_clean_profiling.html",
)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'agr'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]