In [113]:
from pandas import read_csv

from scripts.cleaning_utils import drop_outliers

In [114]:
# Raw CSV input for the cleaning steps below (relative path).
RAW_DATA_PATH = "../data/raw/houses_madrid.csv"
dataframe = read_csv(filepath_or_buffer=RAW_DATA_PATH)

# Handle null values
- We will drop a column if more than 50% of its values are missing, or if it is irrelevant and between 10% and 50% of its values are missing
- We will drop the observations with missing values when less than 10% of a column's values are missing  
*Note that every null value must be considered as "Unknown" and not False for boolean columns*

In [115]:
# Row count of the frame; the 50% threshold below is measured against it.
dataframe_observations = dataframe.shape[0]

# Count the nulls of every column once, then keep the labels of the columns
# where strictly more than half of the observations are missing.
null_counts = dataframe.isnull().sum()
mostly_null_columns = null_counts[null_counts > dataframe_observations * 0.5].index.tolist()

# Drop them in a single call and report each one, in column order.
dataframe.drop(columns=mostly_null_columns, inplace=True)
for column in mostly_null_columns:
    print(f"Drop column : {column}")

Drop column : sq_mt_useful
Drop column : n_floors
Drop column : sq_mt_allotment
Drop column : latitude
Drop column : longitude
Drop column : street_number
Drop column : portal
Drop column : door
Drop column : rent_price_by_area
Drop column : built_year
Drop column : are_pets_allowed
Drop column : has_garden
Drop column : has_pool
Drop column : has_terrace
Drop column : has_balcony
Drop column : has_storage_room
Drop column : is_furnished
Drop column : is_kitchen_equipped
Drop column : is_accessible
Drop column : has_green_zones
Drop column : has_private_parking
Drop column : has_public_parking
Drop column : is_parking_included_in_price
Drop column : parking_price


In [116]:
# Null count of each remaining column, largest first.
remaining_null_counts = dataframe.isna().sum()
remaining_null_counts.sort_values(ascending=False)

has_ac                     10531
is_orientation_east        10384
is_orientation_south       10384
is_orientation_west        10384
is_orientation_north       10384
has_fitted_wardrobes        8343
has_central_heating         8134
has_individual_heating      8134
street_name                 5905
raw_address                 5465
is_exterior                 3043
floor                       2607
has_lift                    2386
is_floor_under              1170
is_new_development           992
house_type_id                391
sq_mt_built                  126
n_bathrooms                   16
energy_certificate             0
has_parking                    0
Unnamed: 0                     0
is_renewal_needed              0
is_buy_price_known             0
id                             0
buy_price                      0
is_rent_price_known            0
rent_price                     0
operation                      0
neighborhood_id                0
is_exact_address_hidden        0
n_rooms   

# Analysis of the remaining columns with null values
As we can see, there are still **18 columns** with null values.
- **is_orientation_** columns are not relevant in a regression model, with around 35% of null values : **drop**
- **has_** columns could be relevant, but with around 20% of null values they cannot sensibly be filled with the median (only with something like a K-nearest-neighbours imputation); it would be too much effort for poor results : **drop**
- **street_name**, **raw_address**, etc... and all string columns : they are only relevant for specific analysis and not for global analysis or a regression model : **drop**
- For the columns left, with only around 5% of null values at most, we can drop the observations
NB : **n_bathrooms** and **sq_mt_built** have only a few missing values (16 and 126 respectively), so we replace them with the median to minimize the performance impact on the model 

In [117]:
# Columns removed entirely: orientation flags, amenity/heating flags and
# free-text address fields (see the analysis above).
DROP_COLUMNS = [
    "is_orientation_east",
    "is_orientation_west",
    "is_orientation_north",
    "is_orientation_south",
    "has_fitted_wardrobes",
    "has_central_heating",
    "has_individual_heating",
    "has_lift",
    "has_ac",
    "street_name",
    "raw_address",
]

dataframe.drop(columns=DROP_COLUMNS, inplace=True)

# Rows missing any of these values are removed rather than imputed.
dataframe.dropna(
    subset=["is_exterior", "floor", "house_type_id", "is_new_development"],
    inplace=True,
)

# The medians are taken after the row removal above, so they reflect the kept rows only.
for numeric_column in ("n_bathrooms", "sq_mt_built"):
    column_median = dataframe[numeric_column].median()
    dataframe[numeric_column] = dataframe[numeric_column].fillna(value=column_median)

dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17536 entries, 0 to 21739
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               17536 non-null  int64  
 1   id                       17536 non-null  int64  
 2   title                    17536 non-null  object 
 3   subtitle                 17536 non-null  object 
 4   sq_mt_built              17536 non-null  float64
 5   n_rooms                  17536 non-null  int64  
 6   n_bathrooms              17536 non-null  float64
 7   is_exact_address_hidden  17536 non-null  bool   
 8   floor                    17536 non-null  object 
 9   is_floor_under           17536 non-null  object 
 10  neighborhood_id          17536 non-null  object 
 11  operation                17536 non-null  object 
 12  rent_price               17536 non-null  int64  
 13  is_rent_price_known      17536 non-null  bool   
 14  buy_price             

# Unusable columns
Some columns only describe details of specific observations and cannot be used for a global analysis or a prediction model, so we can drop them.  

In [118]:
# Report how many distinct neighborhood_id values exist before the column is discarded.
neighborhood_count = dataframe["neighborhood_id"].nunique(dropna=False)
print(f"The length of all the value of neighborhood_id is : {neighborhood_count}")

# Identifier-like and free-text columns that are dropped before the analysis.
UNUSABLE_COLUMNS = [
    "title",
    "subtitle",
    "id",
    "Unnamed: 0",
    "is_exact_address_hidden",
    "neighborhood_id",
]
dataframe.drop(columns=UNUSABLE_COLUMNS, inplace=True)
dataframe.head(1)

The length of all the value of neighborhood_id is : 126


Unnamed: 0,sq_mt_built,n_rooms,n_bathrooms,floor,is_floor_under,operation,rent_price,is_rent_price_known,buy_price,buy_price_by_area,is_buy_price_known,house_type_id,is_renewal_needed,is_new_development,is_exterior,energy_certificate,has_parking
0,64.0,2,1.0,3,False,sale,471,False,85000,1328,True,HouseType 1: Pisos,False,False,True,D,False


# Outliers & duplicates
To avoid performance errors, we must check that there are no outliers or duplicates in the dataframe, and drop any we find

In [122]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
dataframe.drop_duplicates(keep="first", inplace=True)



Unnamed: 0,sq_mt_built,n_rooms,n_bathrooms,rent_price,buy_price,buy_price_by_area
count,12973.0,12973.0,12973.0,12973.0,12973.0,12973.0
mean,91.489478,2.527634,1.505049,1224.289447,329619.696601,3548.812148
std,33.822943,0.877536,0.588781,488.029545,192243.574438,1431.134053
min,16.0,1.0,1.0,218.0,36000.0,706.0
25%,65.0,2.0,1.0,816.0,170000.0,2349.0
50%,85.0,3.0,1.0,1135.0,275000.0,3380.0
75%,114.0,3.0,2.0,1553.0,450000.0,4524.0
max,190.0,4.0,3.0,2457.0,879000.0,7795.0


In [123]:
# Every remaining column label is handed to the project helper `drop_outliers`.
# NOTE(review): this list still contains non-numeric columns (e.g. floor, operation);
# confirm that `drop_outliers` skips or handles them, and what `percent=90` means there.
COLUMNS = dataframe.columns.tolist()
dataframe = drop_outliers(dataframe=dataframe, columns=COLUMNS, percent=90)
dataframe.describe()

Unnamed: 0,sq_mt_built,n_rooms,n_bathrooms,rent_price,buy_price,buy_price_by_area
count,12973.0,12973.0,12973.0,12973.0,12973.0,12973.0
mean,91.489478,2.527634,1.505049,1224.289447,329619.696601,3548.812148
std,33.822943,0.877536,0.588781,488.029545,192243.574438,1431.134053
min,16.0,1.0,1.0,218.0,36000.0,706.0
25%,65.0,2.0,1.0,816.0,170000.0,2349.0
50%,85.0,3.0,1.0,1135.0,275000.0,3380.0
75%,114.0,3.0,2.0,1553.0,450000.0,4524.0
max,190.0,4.0,3.0,2457.0,879000.0,7795.0
