In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import os
import plotly.express as px
from scipy import stats


In [2]:
url = "https://raw.githubusercontent.com/swetajainh/immo-eliza-MAS-FN-analysis/main/data/RAW/RawData_apartment_sale.csv"
df = pd.read_csv(url, index_col = 0)
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19860 entries, 0 to 19859
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   url                      19835 non-null  object 
 1   id                       19835 non-null  float64
 2   region                   19793 non-null  object 
 3   province                 19793 non-null  object 
 4   locality                 19835 non-null  object 
 5   zip_code                 19835 non-null  object 
 6   Longitude                16914 non-null  float64
 7   Latitude                 16914 non-null  float64
 8   property_type            19835 non-null  object 
 9   property_subtype         19835 non-null  object 
 10  price                    18058 non-null  float64
 11  number_rooms             18058 non-null  float64
 12  living_area              17156 non-null  float64
 13  kitchen                  18209 non-null  object 
 14  furnished                68

**Remove duplicates and empty rows**

In [3]:
# Remove duplicates
df.drop_duplicates(inplace=True)
# reset the index
df.reset_index(drop = True, inplace = True)
df.head()
df.shape

(18361, 29)

In [4]:
url1 = "https://raw.githubusercontent.com/swetajainh/immo-eliza-MAS-FN-analysis/main/data/Cleaned/cleaned_apartment.csv"
# Remove all the empty rows
rows_to_remove = []
# Iterate through each row in the DataFrame
for index, row in df.iterrows():
# Check if all values in the row are null
    if row.isnull().all():
        # If all values are null, mark the index for removal
        rows_to_remove.append(index)
# Remove the marked rows from the DataFrame
df = df.drop(rows_to_remove)
df.to_csv(url1)
# df.to_csv(r'data\cleaned_house_apartment.csv')

# df_clean = pd.read_csv(r'data\cleaned_house_apartment.csv', index_col=0, low_memory=False)
df_clean = pd.read_csv(url1, index_col=0)
df_clean


Unnamed: 0,url,id,region,province,locality,zip_code,Longitude,Latitude,property_type,property_subtype,...,surface_land,number_facades,swimming_pool,building_state,energy_type,EPC_score,EnergyConsumptionPerSqm,parking_outdoor,parking_indoor,price_per_sqm
0,https://www.immoweb.be/en/classified/house/for...,11128213.0,Flanders,East Flanders,Wetteren,9230.0,3.884853,51.011410,HOUSE,HOUSE,...,90.0,2.0,False,GOOD,GAS,B,193.0,,,2766.67
1,https://www.immoweb.be/en/classified/house/for...,11130920.0,Flanders,East Flanders,Ronse,9600.0,3.609445,50.740380,HOUSE,HOUSE,...,555.0,3.0,False,TO_BE_DONE_UP,FUELOIL,F,737.0,3.0,1.0,1596.15
2,https://www.immoweb.be/en/classified/house/for...,11119431.0,Flanders,East Flanders,Ronse,9600.0,3.613201,50.737229,HOUSE,HOUSE,...,196.0,2.0,False,AS_NEW,GAS,A,57.0,,,1514.55
3,https://www.immoweb.be/en/classified/house/for...,11132583.0,Wallonie,Liège,Neupré,4121.0,5.488573,50.548515,HOUSE,HOUSE,...,601.0,3.0,False,GOOD,FUELOIL,E,348.0,,1.0,1550.90
4,https://www.immoweb.be/en/classified/house/for...,10874763.0,Wallonie,Liège,Ans,4430.0,5.519029,50.660049,HOUSE,HOUSE,...,261.0,2.0,False,GOOD,GAS,G,577.0,,,1215.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17642,https://www.immoweb.be/en/classified/house/for...,10999114.0,Wallonie,Namur,Pussemange,5550.0,,,HOUSE,HOUSE,...,137.0,3.0,,AS_NEW,FUELOIL,E,354.0,,,1176.47
17643,https://www.immoweb.be/en/classified/house/for...,11151067.0,Wallonie,Hainaut,Ville-Pommeroeul,7322.0,,,HOUSE,HOUSE,...,3125.0,4.0,,unknown,FUELOIL,D,313.0,1.0,1.0,2601.63
17644,https://www.immoweb.be/en/classified/house/for...,11151064.0,Flanders,East Flanders,SINT-NIKLAAS,9100.0,4.163084,51.164513,HOUSE,HOUSE,...,178.0,2.0,,GOOD,GAS,D,366.0,,18.0,1971.83
17645,https://www.immoweb.be/en/classified/house/for...,11151063.0,Wallonie,Hainaut,Dour,7370.0,,,HOUSE,HOUSE,...,127.0,3.0,,TO_RESTORE,GAS,F,427.0,,,242.86


**Removing all rows without price as an dependent variable**

In [5]:
df_clean['price'].isna().value_counts()

# Remove rows where the "price" column is null
df_clean.dropna(subset=['price'], inplace=True)
df_clean

Unnamed: 0,url,id,region,province,locality,zip_code,Longitude,Latitude,property_type,property_subtype,...,surface_land,number_facades,swimming_pool,building_state,energy_type,EPC_score,EnergyConsumptionPerSqm,parking_outdoor,parking_indoor,price_per_sqm
0,https://www.immoweb.be/en/classified/house/for...,11128213.0,Flanders,East Flanders,Wetteren,9230.0,3.884853,51.011410,HOUSE,HOUSE,...,90.0,2.0,False,GOOD,GAS,B,193.0,,,2766.67
1,https://www.immoweb.be/en/classified/house/for...,11130920.0,Flanders,East Flanders,Ronse,9600.0,3.609445,50.740380,HOUSE,HOUSE,...,555.0,3.0,False,TO_BE_DONE_UP,FUELOIL,F,737.0,3.0,1.0,1596.15
2,https://www.immoweb.be/en/classified/house/for...,11119431.0,Flanders,East Flanders,Ronse,9600.0,3.613201,50.737229,HOUSE,HOUSE,...,196.0,2.0,False,AS_NEW,GAS,A,57.0,,,1514.55
3,https://www.immoweb.be/en/classified/house/for...,11132583.0,Wallonie,Liège,Neupré,4121.0,5.488573,50.548515,HOUSE,HOUSE,...,601.0,3.0,False,GOOD,FUELOIL,E,348.0,,1.0,1550.90
4,https://www.immoweb.be/en/classified/house/for...,10874763.0,Wallonie,Liège,Ans,4430.0,5.519029,50.660049,HOUSE,HOUSE,...,261.0,2.0,False,GOOD,GAS,G,577.0,,,1215.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17642,https://www.immoweb.be/en/classified/house/for...,10999114.0,Wallonie,Namur,Pussemange,5550.0,,,HOUSE,HOUSE,...,137.0,3.0,,AS_NEW,FUELOIL,E,354.0,,,1176.47
17643,https://www.immoweb.be/en/classified/house/for...,11151067.0,Wallonie,Hainaut,Ville-Pommeroeul,7322.0,,,HOUSE,HOUSE,...,3125.0,4.0,,unknown,FUELOIL,D,313.0,1.0,1.0,2601.63
17644,https://www.immoweb.be/en/classified/house/for...,11151064.0,Flanders,East Flanders,SINT-NIKLAAS,9100.0,4.163084,51.164513,HOUSE,HOUSE,...,178.0,2.0,,GOOD,GAS,D,366.0,,18.0,1971.83
17645,https://www.immoweb.be/en/classified/house/for...,11151063.0,Wallonie,Hainaut,Dour,7370.0,,,HOUSE,HOUSE,...,127.0,3.0,,TO_RESTORE,GAS,F,427.0,,,242.86


In [6]:
# Calculate the total number of missing values per column
missing_values = df.isnull().sum().sort_values()

# Calculate the total number of entries (rows) in the DataFrame
total_entries = len(df)

# Calculate the proportion of missing values per column
proportion_missing = (missing_values / total_entries) * 100

# Print the proportion of missing values per column
print("Proportion of missing values per column:")
print(proportion_missing)

# Create a histogram using Plotly Express
fig = px.bar(missing_values, x=missing_values.index, y=missing_values.values)
fig.update_layout(
    title='Histogram of Null Values per Column',
    xaxis_title='Column Name',
    yaxis_title='Number of Null Values'
)
fig.show()

Proportion of missing values per column:
url                         0.000000
id                          0.000000
locality                    0.000000
zip_code                    0.000000
property_type               0.000000
property_subtype            0.000000
region                      0.228758
province                    0.228758
EPC_score                   6.835512
kitchen                     8.278867
number_rooms                8.415033
price                       8.415033
EnergyConsumptionPerSqm     8.741830
living_area                13.115468
surface_land               13.415033
Latitude                   14.950980
Longitude                  14.950980
energy_type                22.178649
building_state             22.298475
number_facades             25.822440
terrace                    44.629630
terrace_area               61.269063
furnished                  65.604575
parking_indoor             66.868192
swimming_pool              67.935730
parking_outdoor            73.5784

**Exploring and Cleaning columns**

In [7]:
column_name = df_clean.columns
print(column_name)

def analyze_column(df_clean, column_name):
    column = df_clean[column_name]
    num_nan = column.isnull().sum()
    num_unique = column.nunique()
    unique_values = column.unique()
    percentage_nan = (num_nan / len(column)) * 100
    percentage_unique = (num_unique / len(column)) * 100
    print(f"Analysis for column '{column_name}':")
    print(f"Number of NaN values: {num_nan}")
    print(f"Number of unique values: {num_unique}")
    print(f"Percentage of NaN values: {percentage_nan:.2f}%")
    print(f"Percentage of unique values: {percentage_unique:.2f}%")
    print(f"Unique values: {unique_values}")


Index(['url', 'id', 'region', 'province', 'locality', 'zip_code', 'Longitude',
       'Latitude', 'property_type', 'property_subtype', 'price',
       'number_rooms', 'living_area', 'kitchen', 'furnished', 'fireplace',
       'terrace', 'terrace_area', 'garden', 'garden_area', 'surface_land',
       'number_facades', 'swimming_pool', 'building_state', 'energy_type',
       'EPC_score', 'EnergyConsumptionPerSqm', 'parking_outdoor',
       'parking_indoor', 'price_per_sqm'],
      dtype='object')


**Filling missing value with None for some columns**

In [8]:
df_clean.furnished = df_clean.furnished.fillna("None")
df_clean.fireplace = df_clean.fireplace.fillna("None")
df_clean.swimming_pool = df_clean.swimming_pool.fillna("None")
df_clean.parking_indoor = df_clean.parking_indoor.fillna("None")
df_clean.parking_outdoor = df_clean.parking_outdoor.fillna("None")
df_clean.terrace_area = df_clean.terrace_area.fillna("None")
df_clean.building_state = df_clean.building_state.fillna("None")
df_clean.Latitude = df.Latitude.fillna("None")
df_clean.Longitude = df.Longitude.fillna("None")
df_clean.kitchen = df.kitchen.fillna("None")
df_clean.surface_land = df.surface_land.fillna("None")

**Garden and Garden area** 

In [9]:
# Fill missing values in "Garden" column with False
df_clean['garden'] = df_clean['garden'].fillna(False)

# Change the data type of "Garden" to bool
df_clean['garden'] = df_clean['garden'].astype(bool)

# Fill missing values in 'Garden_area' column with 0
df_clean['garden_area'] = df_clean['garden_area'].fillna(0)

# Replace 'None' with 0 in the 'garden_area' and 'terrace_area' columns
df_clean['garden_area'].replace("None", 0, inplace=True)
df_clean['terrace_area'].replace("None", 0, inplace=True)



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





**Terrace**

In [10]:
# fill missing values in "Terrace" column with False
df_clean['terrace'] = df_clean['terrace'].fillna(False)

**Furnished**

In [11]:
furnished = analyze_column(df_clean, "furnished")

Analysis for column 'furnished':
Number of NaN values: 0
Number of unique values: 3
Percentage of NaN values: 0.00%
Percentage of unique values: 0.02%
Unique values: [0.0 'None' 1.0]


In [12]:
df_clean['furnished'] = df_clean['furnished'].replace(np.nan, False)
df_clean['furnished'] = df_clean['furnished'].replace(False, 0)
df_clean['furnished'] = df_clean['furnished'].replace(True, 1)
analyze_column(df_clean, 'furnished')

Analysis for column 'furnished':
Number of NaN values: 0
Number of unique values: 3
Percentage of NaN values: 0.00%
Percentage of unique values: 0.02%
Unique values: [0 'None' 1]


In [13]:
# Turn True/False into 1/0
for i in ["furnished", "terrace", "garden"]:
    print(df_clean[[i]].value_counts())
    df_clean[i] = df_clean[i].map({'True': 1, 'False': 0, '0':0, 0:0, 1:1})
    print(df_clean[[i]].value_counts())
#display(df_clean)

furnished
None         8433
0            5024
1             253
Name: count, dtype: int64
furnished
0.0          5024
1.0           253
Name: count, dtype: int64
terrace
1          8065
0          5645
Name: count, dtype: int64
terrace
1          8065
0          5645
Name: count, dtype: int64
garden
False     8637
True      5073
Name: count, dtype: int64
garden
0         8637
1         5073
Name: count, dtype: int64


**Number of facades**

In [14]:
df_clean["number_facades"].mode()
df_clean["number_facades"].value_counts()

# Fill missing values in 'number_facades' column with 2
df_clean['number_facades'].fillna(2, inplace=True)

df_clean.number_facades.value_counts()



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





number_facades
2.0    7206
4.0    3707
3.0    2785
1.0      10
5.0       2
Name: count, dtype: int64

**Surface of the land**

In [15]:
# Surface of the land has 8673 Nan entries
df_clean.surface_land.value_counts()

# Replace 'NaN' and 'None' with 0 in the 'surface_land' column
df_clean['surface_land'].replace({'NaN': 0, 'None': 0}, inplace=True)

df_clean.surface_land.value_counts()



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





surface_land
Nan      7689
0        1854
0         401
90         25
150        25
         ... 
1142        1
1011        1
494         1
981         1
61801       1
Name: count, Length: 1416, dtype: int64

**State of the building**

In [16]:
# State of the building data has 'Nan'
df_clean.building_state.value_counts()

building_state
GOOD              4934
unknown           2689
AS_NEW            2037
TO_RENOVATE       1850
TO_BE_DONE_UP     1456
JUST_RENOVATED     661
TO_RESTORE          83
Name: count, dtype: int64

In [17]:
# Replace 'None' and "Nan" by 'unknown'
df_clean['building_state'] = df_clean['building_state'].replace({"None": "unknown"})
df_clean['building_state'] = df_clean['building_state'].replace({"Nan": "unknown"})
df_clean.building_state.value_counts()

building_state
GOOD              4934
unknown           2689
AS_NEW            2037
TO_RENOVATE       1850
TO_BE_DONE_UP     1456
JUST_RENOVATED     661
TO_RESTORE          83
Name: count, dtype: int64

**Kitchen**

In [18]:
# Kitchen data has about 3978 '0' and 1520 "None"
df_clean.kitchen.value_counts()
df_clean['kitchen'] = df_clean['kitchen'].replace({"0": "unknown", "None": "unknown"})

**living area**

In [19]:
living_area = analyze_column(df_clean, "living_area")
df_clean["living_area"].value_counts()

Analysis for column 'living_area':
Number of NaN values: 0
Number of unique values: 675
Percentage of NaN values: 0.00%
Percentage of unique values: 4.92%
Unique values: [  90.  156.  165.  167.  213.  430.  170.  169.  110.  114.  155.  162.
  163.  175.  135.  204.  143.  339.  171.  267.  179.  262.  130.  203.
  140.  215.  360.   94.  134.  260.  384.  235.  120.  160.  166.   86.
  200.  230.  136.  100.  112.  147.  128.  142.  150.  261.  188.  220.
  207.  208.  202.  191.  394.  148.  137.  108.  180.  313.  190.   96.
 1200.  248.  132.   72.  131.  151.  177.   80.   75.   91.  269. 1350.
  158.  370.  106.  336.  300.  285.  198.  382.  480.  187.  196.  209.
  185.  194.  205.  228.  105.  420.  153.  231.  268.  250.   40.  255.
   84.  168.   83.  125.  102.  138.   81.  172.  161.  159.  154.   78.
  122.   97.  192.   45.  273.  547.  270.  149.  217.  334.  630.  223.
  304.  118.  232.  121.  227.  133.  117.   76.  333.  199.  197.  440.
  183.   93.   61. 1653.  7

living_area
150.0    248
120.0    206
200.0    203
160.0    183
180.0    182
        ... 
575.0      1
868.0      1
574.0      1
371.0      1
725.0      1
Name: count, Length: 675, dtype: int64

In [20]:
# Drop rows where the 'living_area' column contains NaN values
df_clean.dropna(subset=['living_area'], inplace=True)
df_clean.living_area.value_counts()

living_area
150.0    248
120.0    206
200.0    203
160.0    183
180.0    182
        ... 
575.0      1
868.0      1
574.0      1
371.0      1
725.0      1
Name: count, Length: 675, dtype: int64

**EPC_score**

In [21]:
EPC_score = analyze_column(df_clean, "EPC_score")
df_clean["EPC_score"].value_counts()

Analysis for column 'EPC_score':
Number of NaN values: 0
Number of unique values: 9
Percentage of NaN values: 0.00%
Percentage of unique values: 0.07%
Unique values: ['B' 'F' 'A' 'E' 'G' 'C' 'D' 'A+' 'A++']


EPC_score
C      2729
D      2522
B      2454
F      2240
E      1826
G      1033
A       847
A+       47
A++      12
Name: count, dtype: int64

In [22]:
df_clean["EPC_score"]=np.where(df_clean["EPC_score"].isin(['F_E', 'A_A+']), "Not specified", df_clean["EPC_score"])

In [23]:
df_clean.dropna(subset=['EPC_score'], inplace=True)
df_clean.EPC_score.value_counts()

EPC_score
C      2729
D      2522
B      2454
F      2240
E      1826
G      1033
A       847
A+       47
A++      12
Name: count, dtype: int64

**Energy Consumption per Sqm**

In [24]:
EnergyConsumptionPerSqm = analyze_column(df_clean, "EnergyConsumptionPerSqm")

Analysis for column 'EnergyConsumptionPerSqm':
Number of NaN values: 0
Number of unique values: 1115
Percentage of NaN values: 0.00%
Percentage of unique values: 8.13%
Unique values: [ 193.    737.     57.   ... 1006.   1045.     43.54]


In [25]:
df_clean.dropna(subset=['EnergyConsumptionPerSqm'], inplace=True)
df_clean.EnergyConsumptionPerSqm.value_counts()

EnergyConsumptionPerSqm
251.0     53
198.0     52
255.0     49
273.0     48
164.0     48
          ..
839.0      1
978.0      1
1304.0     1
1085.0     1
1356.0     1
Name: count, Length: 1115, dtype: int64

**Energy type**

In [26]:
energy_type = analyze_column(df_clean,"energy_type")
df_clean["energy_type"].value_counts()

Analysis for column 'energy_type':
Number of NaN values: 0
Number of unique values: 8
Percentage of NaN values: 0.00%
Percentage of unique values: 0.06%
Unique values: ['GAS' 'FUELOIL' 'PELLET' 'unknown' 'ELECTRIC' 'WOOD' 'CARBON' 'SOLAR']


energy_type
GAS         7436
FUELOIL     2829
unknown     2543
ELECTRIC     660
PELLET       162
WOOD          57
CARBON        18
SOLAR          5
Name: count, dtype: int64

In [27]:
df_clean['energy_type'] = df_clean['energy_type'].fillna("unknown")
df_clean['energy_type'] = df_clean['energy_type'].replace({"Nan": "unknown"})
df_clean["energy_type"].value_counts()

energy_type
GAS         7436
FUELOIL     2829
unknown     2543
ELECTRIC     660
PELLET       162
WOOD          57
CARBON        18
SOLAR          5
Name: count, dtype: int64

**Locality, province and zip codes**

In [28]:
# leaving province as it is
province = analyze_column(df_clean,"province")
df_clean["province"].value_counts()


Analysis for column 'province':
Number of NaN values: 2
Number of unique values: 11
Percentage of NaN values: 0.01%
Percentage of unique values: 0.08%
Unique values: ['East Flanders' 'Liège' 'Luxembourg' 'Hainaut' 'Walloon Brabant'
 'Antwerp' 'West Flanders' 'Brussels' 'Flemish Brabant' 'Limburg' 'Namur'
 nan]


province
Antwerp            2286
East Flanders      1999
West Flanders      1818
Liège              1544
Brussels           1298
Flemish Brabant    1279
Hainaut            1229
Walloon Brabant     797
Luxembourg          563
Namur               498
Limburg             397
Name: count, dtype: int64

In [29]:
# leaving locality as it is 
locality = analyze_column(df_clean, "locality")
df_clean["locality"].value_counts()

Analysis for column 'locality':
Number of NaN values: 0
Number of unique values: 2443
Percentage of NaN values: 0.00%
Percentage of unique values: 17.82%
Unique values: ['Wetteren' 'Ronse' 'Neupré' ... 'Eggewaartskapelle' 'Hoogstraten Meer'
 'Pussemange']


locality
Antwerp              262
Gent                 227
Uccle                176
Liège                163
Seraing              148
                    ... 
AUBY-SUR-SEMOIS        1
VISE                   1
WIDEUMONT VILLAGE      1
MEULEBEKE              1
Pussemange             1
Name: count, Length: 2443, dtype: int64

In [30]:
zip_code = analyze_column(df_clean, "zip_code")

Analysis for column 'zip_code':
Number of NaN values: 0
Number of unique values: 947
Percentage of NaN values: 0.00%
Percentage of unique values: 6.91%
Unique values: [9230. 9600. 4121. 4430. 4032. 4219. 4530. 4100. 6997. 7910. 1470. 1495.
 4671. 9000. 2100. 6940. 8510. 4257. 4540. 4260. 6780. 1000. 1800. 2600.
 9500. 1640. 4300. 9340. 4030. 1410. 1070. 1745. 6250. 1300. 1330. 8501.
 1933. 4500. 2140. 2310. 4040. 8790. 9700. 9260. 2223. 4287. 7063. 7170.
 7610. 4520. 9200. 1420. 6637. 6838. 2880. 2800. 9690. 1830. 2060. 2070.
 8940. 2018. 4470. 9100. 9050. 4800. 7521. 4122. 1190. 1731. 9031. 7500.
 8710. 9630. 9840. 8850. 4400. 1435. 2610. 3740. 9032. 1030. 4650. 9620.
 1120. 2000. 4101. 4420. 4683. 9300. 9660. 6792. 2970. 8670. 2020. 2570.
 2620. 4684. 4870. 6900. 2845. 9120. 4102. 1490. 4051. 6890. 4682. 4350.
 2910. 4690. 4900. 2240. 5000. 5100. 2870. 4841. 1180. 2980. 8490. 9310.
 2812. 9831. 6061. 7134. 2820. 1090. 4141. 5101. 1301. 5190. 8900. 1440.
 1500. 6560. 4432. 8630. 3570.

**Property type and subtype**

In [31]:
property_type = analyze_column(df_clean, "property_type")
# type of property(house/apartment)
# apartment group = 1545
# house group = 232

df_clean["property_type"].value_counts()


Analysis for column 'property_type':
Number of NaN values: 0
Number of unique values: 2
Percentage of NaN values: 0.00%
Percentage of unique values: 0.01%
Unique values: ['HOUSE' 'APARTMENT']


property_type
HOUSE        11326
APARTMENT     2384
Name: count, dtype: int64

In [32]:
property_subtype = analyze_column(df_clean, "property_subtype")
df_clean["property_subtype"].value_counts()

Analysis for column 'property_subtype':
Number of NaN values: 0
Number of unique values: 23
Percentage of NaN values: 0.00%
Percentage of unique values: 0.17%
Unique values: ['HOUSE' 'VILLA' 'MIXED_USE_BUILDING' 'APARTMENT_BLOCK'
 'EXCEPTIONAL_PROPERTY' 'BUNGALOW' 'MANSION' 'APARTMENT' 'TOWN_HOUSE'
 'COUNTRY_COTTAGE' 'PENTHOUSE' 'LOFT' 'FLAT_STUDIO' 'OTHER_PROPERTY'
 'GROUND_FLOOR' 'MANOR_HOUSE' 'DUPLEX' 'CHALET' 'SERVICE_FLAT' 'FARMHOUSE'
 'CASTLE' 'TRIPLEX' 'KOT']


property_subtype
HOUSE                   8180
APARTMENT               1866
VILLA                   1239
APARTMENT_BLOCK          519
MIXED_USE_BUILDING       492
EXCEPTIONAL_PROPERTY     198
MANSION                  185
TOWN_HOUSE               150
DUPLEX                   147
BUNGALOW                 137
GROUND_FLOOR             126
FLAT_STUDIO              109
COUNTRY_COTTAGE           80
PENTHOUSE                 78
FARMHOUSE                 55
MANOR_HOUSE               48
CHALET                    25
TRIPLEX                   18
LOFT                      18
SERVICE_FLAT              16
CASTLE                     9
OTHER_PROPERTY             9
KOT                        6
Name: count, dtype: int64

**Number of rooms**

In [33]:
number_rooms = analyze_column(df_clean, "number_rooms")
df_clean["number_rooms"].value_counts()

Analysis for column 'number_rooms':
Number of NaN values: 0
Number of unique values: 24
Percentage of NaN values: 0.00%
Percentage of unique values: 0.18%
Unique values: [ 2.  3.  4.  5.  6.  8. 28. 27.  7.  9.  1.  0. 10. 15. 17. 13. 11. 14.
 12. 25. 16. 18. 21. 19.]


number_rooms
3.0     4705
4.0     2967
2.0     2870
5.0     1324
1.0      695
6.0      503
7.0      213
0.0      172
8.0      116
9.0       43
10.0      33
12.0      16
11.0      13
13.0       9
15.0       7
14.0       5
17.0       4
28.0       3
16.0       3
18.0       3
27.0       2
19.0       2
25.0       1
21.0       1
Name: count, dtype: int64

**Price**

In [34]:
price = analyze_column(df_clean, "price")
df_clean["price"].value_counts()
df_clean["price"].describe()

Analysis for column 'price':
Number of NaN values: 0
Number of unique values: 1063
Percentage of NaN values: 0.00%
Percentage of unique values: 7.75%
Unique values: [ 249000.  249900.  259000. ... 1420000. 1645000.   42500.]


count    1.371000e+04
mean     4.841840e+05
std      5.633111e+05
min      2.300000e+04
25%      2.390000e+05
50%      3.450000e+05
75%      5.150000e+05
max      9.000000e+06
Name: price, dtype: float64

**Creating a new column called price_per_sqm**

In [35]:
# Create the new "price_per_sqm" column
df_clean['price_per_sqm'] = (df_clean['price'] / df_clean['living_area']).round(2)


**Count the missing values per column**

In [36]:
df_clean.isna().sum()
#df_clean.shape

url                           0
id                            0
region                        2
province                      2
locality                      0
zip_code                      0
Longitude                     1
Latitude                      1
property_type                 0
property_subtype              0
price                         0
number_rooms                  0
living_area                   0
kitchen                       1
furnished                  8433
fireplace                     0
terrace                       0
terrace_area                  0
garden                        0
garden_area                   0
surface_land                  1
number_facades                0
swimming_pool                 0
building_state                0
energy_type                   0
EPC_score                     0
EnergyConsumptionPerSqm       0
parking_outdoor               0
parking_indoor                0
price_per_sqm                 0
dtype: int64

In [38]:
df_clean.to_csv(url1)

**Outliers**

In [None]:
# Filter the DataFrame for rows with "apartment" type in the "property_type" column
df_apartment = df_clean[df_clean["property_type"] == "APARTMENT"]
df_apartment.shape


(9646, 32)

**Handle outliers per column**

In [None]:
def handle_outliers(df, columns_to_remove_outliers, zscore_threshold=3):
    """
    Handle outliers in specified columns of a DataFrame.
    Parameters:
        df (DataFrame): Input DataFrame.
        columns_to_remove_outliers (list): List of column names to remove outliers.
        zscore_threshold (float): Z-score threshold for identifying outliers. Default is 3.
    Returns:
        DataFrame: DataFrame with outliers removed.
    """
    # Create a copy of the DataFrame to avoid modifying the original DataFrame
    df_apartment = df.copy()

    # Iterate through each column in columns_to_remove_outliers
    for column_name in columns_to_remove_outliers:
        # Convert the column to a numeric data type, ignoring errors
        df_apartment[column_name] = pd.to_numeric(df_apartment[column_name], errors='coerce')

        # Calculate Z-scores for the specified column
        z_scores = (df_apartment[column_name] - df_apartment[column_name].mean()) / df_apartment[column_name].std()


        # Create a mask to identify outliers
        outlier_mask = np.abs(z_scores) > zscore_threshold

        # Print the columns for which outliers are being removed
        print("Removing outliers for column: " + column_name)

        # Count missing values
        missing_values_count = df_apartment[column_name].isna().sum()
        print("Number of missing values in " + column_name + ": " + str(missing_values_count))

    return df_apartment

# Call the modified function
columns_to_remove_outliers = ['price', 'number_rooms', 'living_area', 'number_facades', 'EnergyConsumptionPerSqm']
apartment_filtered = handle_outliers(df_apartment, columns_to_remove_outliers)

df_apartment.shape


Removing outliers for column: price
Number of missing values in price: 0
Removing outliers for column: number_rooms
Number of missing values in number_rooms: 0
Removing outliers for column: living_area
Number of missing values in living_area: 0
Removing outliers for column: number_facades
Number of missing values in number_facades: 202
Removing outliers for column: EnergyConsumptionPerSqm
Number of missing values in EnergyConsumptionPerSqm: 1228


(9646, 32)

In [None]:
df_apartment.dtypes

url                         object
id                         float64
region                      object
province                    object
locality                    object
zip_code                     int32
Longitude                   object
Latitude                    object
property_type               object
property_subtype            object
price                      float64
number_rooms               float64
living_area                float64
kitchen                     object
furnished                  float64
fireplace                   object
terrace                      int64
terrace_area               float64
garden                       int64
garden_area                float64
surface_land                object
number_facades              object
swimming_pool               object
building_state              object
energy_type                 object
EPC_score                   object
EnergyConsumptionPerSqm     object
parking_outdoor             object
parking_indoor      

In [None]:
df_apartment.describe()

Unnamed: 0,id,zip_code,price,number_rooms,living_area,furnished,terrace,terrace_area,garden,garden_area,price_per_sqm
count,9646.0,9646.0,9646.0,9646.0,9646.0,4191.0,9646.0,9646.0,9646.0,9646.0,9646.0
mean,11016370.0,4603.031827,420471.8,2.055774,106.765602,0.094965,0.654365,10.497719,0.07796,23.554634,4004.869388
std,239322.3,3307.408688,427587.1,1.012211,100.51075,0.293202,0.4756,70.536946,0.268122,443.777677,2850.177025
min,7858699.0,1000.0,1100.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,18.33
25%,10972980.0,1326.25,220000.0,2.0,75.0,0.0,0.0,0.0,0.0,0.0,2563.415
50%,11102710.0,3000.0,298000.0,2.0,94.0,0.0,1.0,2.0,0.0,0.0,3296.5
75%,11156160.0,8370.0,440000.0,3.0,120.0,0.0,1.0,12.0,0.0,0.0,4324.9175
max,11175490.0,9991.0,8100000.0,36.0,7819.0,1.0,1.0,6550.0,1.0,27000.0,34507.04
