# Imports

In [1]:
import numpy as np
import pandas as pd

Define helper functions

In [2]:
def categorize(df, categories):
    """Replace each listed column with the integer codes of its categories.

    Args:
        df: DataFrame to encode. It is modified in place.
        categories: Iterable of column names to encode.

    Returns:
        The same ``df`` object, with every listed column overwritten by the
        ``codes`` array of ``pd.Categorical`` built from that column.
    """
    for column in categories:
        as_categorical = pd.Categorical(df[column])
        df[column] = as_categorical.codes
    return df

def numberize(df, numerics):
    """Cast each listed column to float.

    Args:
        df: DataFrame to convert. It is modified in place.
        numerics: Iterable of column names to cast.

    Returns:
        The same ``df`` object, with every listed column cast via
        ``astype("float")``.
    """
    for column in numerics:
        as_float = df[column].astype("float")
        df[column] = as_float
    return df

def clean_year(year, reference_year=2024):
    """Convert a calendar year into an age relative to ``reference_year``.

    Args:
        year: Calendar year (scalar or anything supporting subtraction).
        reference_year: Year the age is measured from. Defaults to 2024, the
            value that was previously hard-coded, so existing callers such as
            ``Series.apply(clean_year)`` keep their behaviour.

    Returns:
        ``reference_year - year``. Years later than ``reference_year`` yield a
        negative age.
    """
    return reference_year - year

# Function to clean year_built values
def clean_year_built(year):
    """Coerce a raw ``year_built`` value into a four-digit integer year.

    The value is parsed as a number and truncated to an integer. Numbers with
    more than four digits keep only their first four digits (19950101 -> 1995);
    numbers with fewer than four digits are right-padded with zeros
    (19 -> 1900).

    Args:
        year: Raw value; may be an int, float, numeric string, or None.

    Returns:
        The cleaned year as an ``int``, or ``pd.NA`` when the value is not a
        finite, positive number.
    """
    try:
        # int(float(...)) handles both integer and float strings
        year = int(float(year))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric strings and NaN; TypeError: None;
        # OverflowError: +/-infinity, which previously escaped the handler.
        return pd.NA

    # A zero or negative year cannot be repaired by digit manipulation: the
    # minus sign would be counted as a digit (e.g. -5 -> "-500").
    if year <= 0:
        return pd.NA

    year_str = str(year)
    if len(year_str) > 4:
        # Keep the leading four digits
        return int(year_str[:4])
    if len(year_str) < 4:
        # Pad with trailing zeros up to four digits
        return int(year_str.ljust(4, '0'))
    return year
    

Load and prepare the data

In [23]:
df = pd.read_csv("flatfox.csv", low_memory=False)

# Keep only rental offers for apartments and houses.
relevant_cat = ["APARTMENT", "HOUSE"]
is_rental = df["offer_type"] == "RENT"
is_relevant = df["object_category"].isin(relevant_cat)
df = df[is_rental & is_relevant]

# Remove rows whose year_built is implausible; year_built has unrealistic
# values (for example 19 Million). Rows with a missing year_built are kept
# (comparisons with NaN are False) and imputed below. clean_year_built is an
# alternative that repairs such values instead of dropping the rows.
year_too_old = df["year_built"] < 1800
year_too_new = df["year_built"] > 2030
df = df[~(year_too_old | year_too_new)].copy()

# Impute missing values step by step. year_built must be filled BEFORE it is
# used to fill year_renovated: passing df["year_built"] inside a single
# fillna({...}, inplace=True) dict captures the column before the fill and only
# works when pandas happens to share memory between that Series and the frame.
df["year_built"] = df["year_built"].fillna(df["year_built"].mean())
df["year_renovated"] = df["year_renovated"].fillna(df["year_built"])
# NaN in floor, number_of_rooms and livingspace is replaced by 0
df = df.fillna({"floor": 0.0, "number_of_rooms": 0.0, "livingspace": 0.0})

# An implausibly old renovation year falls back to the construction year.
df.loc[df["year_renovated"] < 1800, "year_renovated"] = df["year_built"]

# Convert both year columns from calendar years to ages.
df["year_renovated"] = df["year_renovated"].apply(clean_year)
df["year_built"] = df["year_built"].apply(clean_year)

# Rows without a price are dropped.
df = df.dropna(subset=["price_display"])

# Drop the columns that are not carried into the cleaned dataset.
df = df.drop(columns=[
    "pk",
    "slug",
    "url",
    "short_url",
    "moving_date",
    "rent_net",
    "rent_charges",
    "rent_gross",
    "offer_type",
    "submit_url",
    "status",
    "created",
    "reference",
    "ref_property",
    "ref_house",
    "ref_object",
    "alternative_reference",
    "published",
    "short_title",
    "public_title",
    "pitch_title",
    "description_title",
    "description",
    "attributes",
    "public_address",
    "video_url",
    "tour_url",
    "website_url",
    "live_viewing_url",
    "cover_image",
    "images",
    "documents",
    "agency",
    "rent_title",
    "surface_property",
    "surface_living",
    "surface_usable",
    "surface_usable_minimum",
    "volume",
    "space_display",
    "street"])

# Columns encoded as integer category codes.
categories = [
    "object_category",
    "object_type",
    "price_display_type",
    "price_unit",
    "city",
    "moving_date_type"
]
# Columns cast to float.
numerics = [
    "is_furnished",
    "is_temporary",
    "is_selling_furniture",
    "reserved"
]

df = categorize(df, categories)
df = numberize(df, numerics)
df.describe()

   object_category  object_type  price_display price_display_type price_unit  \
3        APARTMENT  SINGLE_ROOM          610.0              TOTAL    monthly   
5        APARTMENT    APARTMENT         2370.0              TOTAL    monthly   
23       APARTMENT    APARTMENT         3104.0              TOTAL    monthly   
24       APARTMENT    APARTMENT         4246.0              TOTAL    monthly   
25       APARTMENT    APARTMENT         5076.0              TOTAL    monthly   

    number_of_rooms  floor  is_furnished  is_temporary  is_selling_furniture  \
3               1.0    0.0          True         False                 False   
5               2.5    0.0         False         False                 False   
23              0.0    0.0         False         False                 False   
24              1.0    0.0         False         False                 False   
25              2.0    0.0         False         False                 False   

    zipcode        city   latitude  lo

Unnamed: 0,object_category,object_type,price_display,price_display_type,price_unit,number_of_rooms,floor,is_furnished,is_temporary,is_selling_furniture,zipcode,city,latitude,longitude,year_built,year_renovated,moving_date_type,reserved,livingspace
count,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0,11724.0
mean,0.028233,1.771836,2122.884766,0.999829,0.000171,3.26241,1.99002,0.164961,0.072501,0.027721,5372.672637,923.062692,47.147571,7.993014,39.471739,29.888156,1.016888,0.000597,73.392613
std,0.165644,3.899079,1087.901432,0.01306,0.01306,1.322995,2.159625,0.371161,0.259326,0.164179,2696.155419,576.256038,0.413635,0.841675,23.461238,20.711948,0.727863,0.024429,48.428886
min,0.0,0.0,1.0,0.0,0.0,0.0,-5.0,0.0,0.0,0.0,1000.0,0.0,45.826182,5.991881,-1.0,-1.0,0.0,0.0,0.0
25%,0.0,0.0,1480.0,1.0,0.0,2.5,1.0,0.0,0.0,0.0,3072.0,412.0,46.950762,7.445634,39.479863,9.0,0.0,0.0,47.0
50%,0.0,0.0,1870.0,1.0,0.0,3.5,2.0,0.0,0.0,0.0,4833.0,892.0,47.318822,7.909561,39.479863,39.479863,1.0,0.0,74.0
75%,0.0,0.0,2500.0,1.0,0.0,4.5,3.0,0.0,0.0,0.0,8052.0,1422.0,47.434064,8.573304,39.479863,39.479863,2.0,0.0,97.0
max,1.0,19.0,15000.0,1.0,1.0,10.5,31.0,1.0,1.0,1.0,9657.0,1812.0,47.768032,10.364311,224.0,224.0,2.0,1.0,1275.0


count all years: 21040

count 1500-2024: 20725

count 1800-2024: 20672


Write clean data to csv

In [24]:
# Persist the cleaned frame without writing the row index as a column.
df.to_csv(path_or_buf="flatfox_no_parking_clean.csv", index=False)

Create profiling report

In [None]:
from ydata_profiling import ProfileReport

# One entry per report: (input CSV, columns dropped before profiling,
# report title, output HTML file).
# NOTE(review): "flatfox_clean.csv" is not written by the cells visible in this
# notebook; presumably it is produced elsewhere. Verify before running.
profiling_jobs = [
    ("flatfox.csv", ["created", "published"],
     "Flatfox raw Profiling", "flatfox_profiling.html"),
    ("flatfox_clean.csv", [],
     "Flatfox clean Profiling", "flatfox_clean_profiling.html"),
    ("flatfox_no_parking_clean.csv", [],
     "Flatfox clean apartment Profiling", "flatfox_no_parking_clean_profiling.html"),
]

for csv_path, dropped_columns, report_title, html_path in profiling_jobs:
    frame = pd.read_csv(csv_path, low_memory=False)
    if dropped_columns:
        frame = frame.drop(dropped_columns, axis=1)
    ProfileReport(frame, title=report_title).to_file(html_path)