In [71]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

In [72]:
#   load data
# Read the cleaned property table and use its "ID" column as the row index.
# The path is written with forward slashes so the same relative location
# resolves on Windows, Linux and macOS; a backslash-separated literal only
# resolves on Windows.
df = pd.read_csv("../data/02_cleanData_properties_data.csv", index_col="ID")
df.shape

(18348, 40)

In [73]:
# Discard rows that repeat an earlier row in every column. Only column values
# are compared; the ID index does not take part in the comparison.
df = df.drop_duplicates()
df.shape

(18124, 40)

In [74]:
# Per-column overview of df: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18124 entries, 10616910 to 10616831
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   type                            18124 non-null  object 
 1   subtype                         18124 non-null  object 
 2   region                          18124 non-null  object 
 3   province                        18124 non-null  object 
 4   district                        18124 non-null  object 
 5   locality                        18124 non-null  object 
 6   localityType                    18124 non-null  int64  
 7   postalCode                      18124 non-null  int64  
 8   address                         14202 non-null  object 
 9   latitude                        14653 non-null  float64
 10  longitude                       14653 non-null  float64
 11  condition                       13431 non-null  object 
 12  constructionYear           

In [75]:
# Columns carried forward into the prepared dataset, with "price" last.
columns_to_take = [
    "type", "region", "province", "postalCode", "localityType",
    "bedroomCount", "netHabitableSurface", "condition", "epcScore",
    "bathroomCount", "showerRoomCount", "toiletCount",
    "hasLift", "fireplaceExists", "hasSwimmingPool", "hasAirConditioning",
    "hasGarden", "hasTerrace", "gardenSurface", "terraceSurface", "land",
    "price"
    ]
# Take an explicit copy: the cells that follow assign into this frame, and an
# independent copy guarantees those writes land in df itself rather than in
# an object pandas may still consider derived from the full table.
df = df[columns_to_take].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18124 entries, 10616910 to 10616831
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   type                 18124 non-null  object 
 1   region               18124 non-null  object 
 2   province             18124 non-null  object 
 3   postalCode           18124 non-null  int64  
 4   localityType         18124 non-null  int64  
 5   bedroomCount         18124 non-null  float64
 6   netHabitableSurface  16313 non-null  float64
 7   condition            13431 non-null  object 
 8   epcScore             14451 non-null  object 
 9   bathroomCount        15972 non-null  float64
 10  showerRoomCount      18124 non-null  float64
 11  toiletCount          13910 non-null  float64
 12  hasLift              12198 non-null  object 
 13  fireplaceExists      18124 non-null  bool   
 14  hasSwimmingPool      9378 non-null   object 
 15  hasAirConditioning   2142 non-n

In [76]:
# Number of missing values in each column of df.
df.isna().sum()

type                       0
region                     0
province                   0
postalCode                 0
localityType               0
bedroomCount               0
netHabitableSurface     1811
condition               4693
epcScore                3673
bathroomCount           2152
showerRoomCount            0
toiletCount             4214
hasLift                 5926
fireplaceExists            0
hasSwimmingPool         8746
hasAirConditioning     15982
hasGarden              12411
hasTerrace              6589
gardenSurface          14591
terraceSurface         10886
land                    8954
price                      0
dtype: int64

In [77]:
# Rows per localityType value (counts the non-null "type" entries in each group).
df.groupby("localityType")["type"].count()

localityType
1    1460
2    2812
3    4483
4    9369
Name: type, dtype: int64

In [78]:
# Rows with more bedrooms than this are removed.
MAX_BEDROOM_COUNT = 12

# Filter with a boolean mask instead of dropping by index label: the "ID"
# index is not guaranteed to be unique, and drop() by label would also remove
# any in-range row that shares an ID with an out-of-range one.
# Rows whose bedroomCount is missing are kept, exactly as with the label drop.
df = df[~(df["bedroomCount"] > MAX_BEDROOM_COUNT)].copy()
df.groupby("bedroomCount")["type"].count()

bedroomCount
1.0     2110
2.0     6057
3.0     5843
4.0     2506
5.0      917
6.0      368
7.0      132
8.0       77
9.0       18
10.0      34
11.0      12
12.0      18
Name: type, dtype: int64

In [79]:
# Remove every row whose netHabitableSurface is missing.
df = df.dropna(subset=["netHabitableSurface"])
# Remaining missing values in that column.
df["netHabitableSurface"].isna().sum()

0

In [80]:
# Replace missing condition labels with an explicit "UNKNOWN" category.
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df["condition"] is chained assignment, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is active.
df["condition"] = df["condition"].fillna("UNKNOWN")
df.groupby("condition")["type"].count()

condition
AS_NEW            3706
GOOD              4900
JUST_RENOVATED     955
TO_BE_DONE_UP     1277
TO_RENOVATE       1311
TO_RESTORE          78
UNKNOWN           4065
Name: type, dtype: int64

In [81]:
# Cap applied to showerRoomCount; larger values are set to this number.
MAX_SHOWER_ROOM_COUNT = 14

# Missing bathroom / toilet counts become 0. The result is assigned back to
# the column because fillna(inplace=True) on df[col] is chained assignment,
# which pandas warns about and which does not update df under Copy-on-Write.
df["bathroomCount"] = df["bathroomCount"].fillna(0)
df["toiletCount"] = df["toiletCount"].fillna(0)

# Shower-room counts below 1 become 0; counts above the cap become the cap.
df.loc[df["showerRoomCount"] < 1, "showerRoomCount"] = 0
df.loc[df["showerRoomCount"] > MAX_SHOWER_ROOM_COUNT, "showerRoomCount"] = MAX_SHOWER_ROOM_COUNT

In [82]:
# df.groupby("bathroomCount")["type"].count()
# df.groupby("showerRoomCount")["type"].count()
# df.groupby("toiletCount")["type"].count()

In [83]:
# Flag columns to encode as 0/1 integers.
binary_to_fix = ["hasLift", "hasGarden", "hasTerrace", "fireplaceExists", "hasSwimmingPool", "hasAirConditioning"]
for btf in binary_to_fix:
    # 1 where the value equals True, 0 everywhere else. Missing values compare
    # unequal to True, so they become 0 without a separate fillna step (the
    # former fillna(inplace=True) on df[btf] was chained assignment, which
    # does not update df under Copy-on-Write).
    # "int64" is spelled out so the dtype is the same on every platform.
    df[btf] = df[btf].eq(True).astype("int64")

In [84]:
# df.groupby("hasAirConditioning")["type"].count()

In [85]:
# Missing-value counts for the two surface columns.
df[["gardenSurface", "terraceSurface"]].isna().sum()

gardenSurface     13034
terraceSurface     9462
dtype: int64

In [86]:
# Garden surfaces below the minimum are set to 0; above the maximum they are
# capped to the maximum.
GARDEN_SURFACE_MIN = 4
GARDEN_SURFACE_MAX = 1000

df.loc[df["gardenSurface"] < GARDEN_SURFACE_MIN, "gardenSurface"] = 0
df.loc[df["gardenSurface"] > GARDEN_SURFACE_MAX, "gardenSurface"] = GARDEN_SURFACE_MAX
# Missing surfaces become 0. Assigned back to the column because
# fillna(inplace=True) on df[col] is chained assignment, which pandas warns
# about and which does not update df under Copy-on-Write.
df["gardenSurface"] = df["gardenSurface"].fillna(0)
df.groupby("gardenSurface")["type"].count()

gardenSurface
0.0       13116
4.0           2
5.0           8
6.0           6
7.0           4
          ...  
974.0         1
975.0         1
978.0         1
998.0         1
1000.0      408
Name: type, Length: 528, dtype: int64

In [87]:
# Terrace surfaces above this value are capped to it.
TERRACE_SURFACE_MAX = 100

df.loc[df["terraceSurface"] > TERRACE_SURFACE_MAX, "terraceSurface"] = TERRACE_SURFACE_MAX
# Missing surfaces become 0. Assigned back to the column because
# fillna(inplace=True) on df[col] is chained assignment, which pandas warns
# about and which does not update df under Copy-on-Write.
df["terraceSurface"] = df["terraceSurface"].fillna(0)
df.groupby("terraceSurface")["type"].count()

terraceSurface
0.0      9462
1.0       117
2.0       189
3.0       196
4.0       255
         ... 
93.0        2
95.0       12
96.0        2
98.0        2
100.0     188
Name: type, Length: 96, dtype: int64

In [88]:
# Replace missing EPC scores with an explicit "UNKNOWN" category.
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df["epcScore"] is chained assignment, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is active.
df["epcScore"] = df["epcScore"].fillna("UNKNOWN")
df.groupby("epcScore")["type"].count()

epcScore
A          1261
B          3149
C          2653
D          2124
E          1464
F          1647
G           927
UNKNOWN    3067
Name: type, dtype: int64

In [89]:
# Land values below this threshold are set to 0.
LAND_MIN = 10

df.loc[df["land"] < LAND_MIN, "land"] = 0
# Missing land values become 0. Assigned back to the column because
# fillna(inplace=True) on df[col] is chained assignment, which pandas warns
# about and which does not update df under Copy-on-Write.
df["land"] = df["land"].fillna(0)
df.groupby("land")["type"].count()

land
0.0          8984
20.0            3
24.0            1
25.0            1
27.0            4
             ... 
114000.0        1
148000.0        1
160737.0        2
291294.0        1
1090481.0       1
Name: type, Length: 1995, dtype: int64

In [90]:
# Column order of the exported table, one column per line.
new_order = [
    "type",
    "region",
    "province",
    "postalCode",
    "localityType",
    "bedroomCount",
    "netHabitableSurface",
    "condition",
    "epcScore",
    "bathroomCount",
    "showerRoomCount",
    "toiletCount",
    "hasLift",
    "fireplaceExists",
    "hasSwimmingPool",
    "hasAirConditioning",
    "hasGarden",
    "hasTerrace",
    "gardenSurface",
    "terraceSurface",
    "land",
    "price",
]
# Select all rows with the columns arranged in the order listed above.
df = df.loc[:, new_order]

In [91]:
# (rows, columns) of df after the cleaning cells above.
df.shape

(16292, 22)

In [92]:
# Missing-value count per column of df before it is written to disk.
df.isna().sum()

type                   0
region                 0
province               0
postalCode             0
localityType           0
bedroomCount           0
netHabitableSurface    0
condition              0
epcScore               0
bathroomCount          0
showerRoomCount        0
toiletCount            0
hasLift                0
fireplaceExists        0
hasSwimmingPool        0
hasAirConditioning     0
hasGarden              0
hasTerrace             0
gardenSurface          0
terraceSurface         0
land                   0
price                  0
dtype: int64

In [93]:
#   save new csv
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# a backslash-separated literal only resolves on Windows.
path_to_save = "../data/04_ml_prep_data_drop_duplicate.csv"
# index=True writes the row index as the first column of the file.
df.to_csv(path_to_save, index=True)