In [446]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

### Load data

In [447]:
#   load data
# Read the cleaned properties dataset, using the "ID" column as the row index.
# The relative path uses forward slashes so that it resolves on Windows, Linux
# and macOS alike (a backslash-separated path is only valid on Windows).
df = pd.read_csv("../data/02_cleanData_properties_data.csv", index_col="ID")
df.shape

(18348, 40)

### Drop duplicates

In [448]:
# Discard rows that are exact repeats of an earlier row, then display the
# resulting (rows, columns) shape.
df = df.drop_duplicates()
df.shape

(18124, 40)

### Drop missing values

In [449]:
# Keep only the rows that have a net habitable surface, then display how many
# nulls remain in that column as a sanity check.
df = df.dropna(subset=["netHabitableSurface"])
df["netHabitableSurface"].isna().sum()

0

In [450]:
# Keep only the rows that have a price, then display how many nulls remain in
# that column as a sanity check.
df = df.dropna(subset=["price"])
df["price"].isna().sum()

0

In [451]:
# Keep only the rows that have a bedroom count, then display how many nulls
# remain in that column as a sanity check.
df = df.dropna(subset=["bedroomCount"])
df["bedroomCount"].isna().sum()

0

In [452]:
# Display the (rows, columns) shape of df at this point in the notebook.
df.shape

(16313, 40)

### Subset data

In [453]:
# NOTE: this cell is effectively disabled. The code below is wrapped in a bare
# triple-quoted string literal, so none of it runs: `columns_to_take` is never
# defined and `df` keeps all of its columns. The notebook merely echoes the
# string as the cell output. The same column list is applied later in the
# notebook through `new_order`.
"""columns_to_take = [
    "type", "region", "province", "district", "postalCode", "localityType",
    "bedroomCount", "netHabitableSurface", "condition", "epcScore",
    "bathroomCount", "showerRoomCount", "toiletCount", 
    "hasLift", "fireplaceExists", "hasSwimmingPool", "hasAirConditioning", 
    "hasGarden", "hasTerrace", "gardenSurface", "terraceSurface", "land",
    "price" 
    ] 
df = df[columns_to_take]
df.info()"""

'columns_to_take = [\n    "type", "region", "province", "district", "postalCode", "localityType",\n    "bedroomCount", "netHabitableSurface", "condition", "epcScore",\n    "bathroomCount", "showerRoomCount", "toiletCount", \n    "hasLift", "fireplaceExists", "hasSwimmingPool", "hasAirConditioning", \n    "hasGarden", "hasTerrace", "gardenSurface", "terraceSurface", "land",\n    "price" \n    ] \ndf = df[columns_to_take]\ndf.info()'

### Check null values

In [454]:
# Count the missing values in every column of df.
df.isna().sum()

type                                  0
subtype                               0
region                                0
province                              0
district                              0
locality                              0
localityType                          0
postalCode                            0
address                            3572
latitude                           3147
longitude                          3147
condition                          4075
constructionYear                   5806
price                                 0
pricePerMeter                         0
bedroomCount                          0
netHabitableSurface                   0
floor                              8036
floorCount                        11063
hasLift                            5306
facadeCount                        4386
hasGarden                         11044
gardenSurface                     13047
hasTerrace                         5624
terraceSurface                     9476


In [455]:
# Number of non-null "type" entries for each localityType value.
rows_by_locality_type = df.groupby("localityType")
rows_by_locality_type["type"].count()

localityType
1    1327
2    2600
3    3992
4    8394
Name: type, dtype: int64

### Remove/map outliers

In [456]:
# Remove every listing that reports more than 12 bedrooms.
# The rows are selected first, then dropped by their index labels.
too_many_bedrooms = df["bedroomCount"] > 12
outlier_ids = df.loc[too_many_bedrooms].index
df = df.drop(index=outlier_ids)
#df.groupby("bedroomCount")["type"].count()

In [457]:
# Bound showerRoomCount: values below 1 become 0, values above 14 become 14.
# Both masks are built up front; they can never overlap.
shower_below_min = df["showerRoomCount"] < 1
shower_above_max = df["showerRoomCount"] > 14
df.loc[shower_below_min, "showerRoomCount"] = 0
df.loc[shower_above_max, "showerRoomCount"] = 14

In [458]:
# Bound gardenSurface: values below 4 become 0, values above 1000 become 1000.
# Both masks are built up front; they can never overlap.
garden_below_min = df["gardenSurface"] < 4
garden_above_max = df["gardenSurface"] > 1000
df.loc[garden_below_min, "gardenSurface"] = 0
df.loc[garden_above_max, "gardenSurface"] = 1000

In [459]:
# Cap terraceSurface at 100: any larger value is replaced by 100.
terrace_above_max = df["terraceSurface"] > 100
df.loc[terrace_above_max, "terraceSurface"] = 100

In [460]:
# Bound land: values below 10 become 0, values above 50000 become 50000.
# Both masks are built up front; they can never overlap.
land_below_min = df["land"] < 10
land_above_max = df["land"] > 50000
df.loc[land_below_min, "land"] = 0
df.loc[land_above_max, "land"] = 50000

### Fill null values

In [461]:
# Give listings without a recorded condition the explicit label "UNKNOWN".
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df["condition"]: that form operates on an
# intermediate Series and leaves df untouched when pandas Copy-on-Write is
# active (the default from pandas 3.0; pandas 2.x emits a FutureWarning).
df["condition"] = df["condition"].fillna("UNKNOWN")
#df.groupby("condition")["type"].count()

In [462]:
# Treat a missing bathroom or toilet count as 0.
# Each filled column is assigned back to df; calling fillna(inplace=True) on
# df[col] acts on an intermediate Series and does not update df when pandas
# Copy-on-Write is active (the default from pandas 3.0).
# NOTE(review): showerRoomCount is not filled anywhere in this notebook --
# presumably it has no missing values at this point; confirm.
for count_col in ("bathroomCount", "toiletCount"):
    df[count_col] = df[count_col].fillna(0)

In [463]:
# df.groupby("bathroomCount")["type"].count()
# df.groupby("showerRoomCount")["type"].count()
# df.groupby("toiletCount")["type"].count()

In [464]:
# Convert the amenity flag columns to 0/1 integers; missing values become 0.
binary_to_fix = ["hasLift", "hasGarden", "hasTerrace", "fireplaceExists", "hasSwimmingPool", "hasAirConditioning"]
for btf in binary_to_fix:
    # eq(True) is the vectorised form of the per-element `x == True` test:
    # only values equal to True map to 1; False, NaN and anything else map to 0.
    # The trailing fillna(False) only matters for nullable boolean columns,
    # where comparing a missing value yields <NA> instead of False.
    # The result is assigned back to df instead of using fillna(inplace=True)
    # on df[btf], which does not update df under pandas Copy-on-Write.
    df[btf] = df[btf].eq(True).fillna(False).astype("int64")

In [465]:
# df.groupby("hasAirConditioning")["type"].count()

In [466]:
#df[["gardenSurface", "terraceSurface"]].isna().sum()

In [467]:
# A missing garden surface is recorded as 0.
# Assign the filled column back to df: fillna(inplace=True) on df["gardenSurface"]
# acts on an intermediate Series and does not update df when pandas
# Copy-on-Write is active (the default from pandas 3.0).
df["gardenSurface"] = df["gardenSurface"].fillna(0)
#df.groupby("gardenSurface")["type"].count()

In [468]:
# A missing terrace surface is recorded as 0.
# Assign the filled column back to df: fillna(inplace=True) on df["terraceSurface"]
# acts on an intermediate Series and does not update df when pandas
# Copy-on-Write is active (the default from pandas 3.0).
df["terraceSurface"] = df["terraceSurface"].fillna(0)
#df.groupby("terraceSurface")["type"].count()

In [469]:
# Give listings without an EPC score the explicit label "UNKNOWN".
# Assign the filled column back to df: fillna(inplace=True) on df["epcScore"]
# acts on an intermediate Series and does not update df when pandas
# Copy-on-Write is active (the default from pandas 3.0).
df["epcScore"] = df["epcScore"].fillna("UNKNOWN")
#df.groupby("epcScore")["type"].count()

In [470]:
# A missing land value is recorded as 0.
# Assign the filled column back to df: fillna(inplace=True) on df["land"]
# acts on an intermediate Series and does not update df when pandas
# Copy-on-Write is active (the default from pandas 3.0).
df["land"] = df["land"].fillna(0)
#df.groupby("land")["type"].count()

In [471]:
# Restrict df to the columns listed below, in this exact order ("price" last).
# The groups only exist for readability; they are flattened into `new_order`.
column_groups = [
    ["type", "region", "province", "district", "postalCode", "localityType"],
    ["bedroomCount", "netHabitableSurface", "condition", "epcScore"],
    ["bathroomCount", "showerRoomCount", "toiletCount"],
    ["hasLift", "fireplaceExists", "hasSwimmingPool", "hasAirConditioning"],
    ["hasGarden", "hasTerrace", "gardenSurface", "terraceSurface", "land"],
    ["price"],
]
new_order = [column for group in column_groups for column in group]
df = df.loc[:, new_order]

In [472]:
# Count the missing values in every remaining column of df.
df.isna().sum()

type                   0
region                 0
province               0
district               0
postalCode             0
localityType           0
bedroomCount           0
netHabitableSurface    0
condition              0
epcScore               0
bathroomCount          0
showerRoomCount        0
toiletCount            0
hasLift                0
fireplaceExists        0
hasSwimmingPool        0
hasAirConditioning     0
hasGarden              0
hasTerrace             0
gardenSurface          0
terraceSurface         0
land                   0
price                  0
dtype: int64

In [473]:
#   save new csv
# Write df to CSV and keep the index as the first column (index=True).
# The relative path uses forward slashes so that it resolves on Windows, Linux
# and macOS alike (a backslash-separated path is only valid on Windows).
path_to_save = "../data/04_ml_prep_data_drop_duplicate.csv"
df.to_csv(path_to_save, index=True)