In [2]:
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so pandas / scikit-learn diagnostics raised by later cells are hidden.
warnings.filterwarnings('ignore')

In [3]:
import kagglehub

# Fetch the latest version of the listings dataset; the returned value is the
# local directory that holds the downloaded files.
dataset_handle = "luvathoms/portugal-real-estate-2024"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: C:\Users\tomas\.cache\kagglehub\datasets\luvathoms\portugal-real-estate-2024\versions\6


In [4]:
import os

import pandas as pd

# Build the file path with os.path.join instead of concatenating a literal
# backslash: '\p' is an invalid escape sequence (flagged by newer Python
# versions) and a hard-coded '\' separator only works on Windows.
df = pd.read_csv(os.path.join(path, 'portugal_listinigs.csv'))

In [5]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates(keep='first')

In [6]:
# Dataframe summary

def summary(df):
    """Display a per-column overview of ``df``.

    The rendered table has one row per column of ``df`` and shows: dtype,
    missing-value count and share, the number of fully duplicated rows (a
    frame-wide figure repeated on every line), distinct and non-null counts,
    min / max / mean / std as reported by ``DataFrame.describe`` and the
    values held by the first three rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Its index does not have to start at 0 and it may
        hold fewer than three rows.

    Returns
    -------
    None
        The table is rendered with ``display``; nothing is returned.
    """
    missing = df.isna().sum()

    summ = df.dtypes.to_frame('Data Type')
    summ['Missing#'] = missing
    summ['Missing%'] = missing / len(df)
    summ['Dups'] = df.duplicated().sum()
    summ['Uniques'] = df.nunique().values
    summ['Count'] = df.count().values

    # describe() only emits min/max/mean/std when at least one numeric column
    # exists; reindexing guarantees the four columns (NaN-filled if absent).
    desc = df.describe(include='all').transpose().reindex(
        columns=['min', 'max', 'mean', 'std'])
    summ['Min'] = desc['min'].values
    summ['Max'] = desc['max'].values
    summ['Average'] = desc['mean'].values
    summ['Standard Deviation'] = desc['std'].values

    # Sample rows are taken by position (iloc), not by label, so the function
    # also works on frames whose index was filtered and not reset. Rows that
    # do not exist are reported as NaN instead of raising.
    sample_labels = ['First Value', 'Second Value', 'Third Value']
    for position, label in enumerate(sample_labels):
        if position < len(df):
            summ[label] = df.iloc[position].values
        else:
            summ[label] = float('nan')

    display(summ)

# Overview of the de-duplicated raw listings.
summary(df)

Unnamed: 0,Data Type,Missing#,Missing%,Dups,Uniques,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
Price,float64,267,0.002321,0,4474,114768,1.0,1380000000.0,370021.551309,4123539.066734,780000.0,223000.0,228000.0
District,object,0,0.0,0,25,115035,,,,,Vila Real,Faro,Faro
City,object,0,0.0,0,272,115035,,,,,Valpaços,São Brás de Alportel,São Brás de Alportel
Town,object,2,1.7e-05,0,2246,115033,,,,,Carrazedo de Montenegro e Curros,São Brás de Alportel,São Brás de Alportel
Type,object,15,0.00013,0,21,115020,,,,,Farm,Apartment,Apartment
EnergyCertificate,object,13,0.000113,0,12,115022,,,,,NC,A+,A+
GrossArea,float64,89021,0.77386,0,2267,26014,-7.0,12750000.0,2927.940647,118909.259565,200.0,,
TotalArea,float64,6834,0.059408,0,7214,108201,-7196067.0,61420071105.0,603655.699855,186774276.614159,552450.0,81.0,108.0
Parking,float64,146,0.001269,0,4,114889,0.0,3.0,0.560985,0.864944,0.0,1.0,1.0
HasParking,object,51153,0.444673,0,2,63882,,,,,False,True,True


In [7]:
# Restrict the data to four property types. The .copy() makes filtered_df an
# independent frame rather than a view/slice of df.
selected_types = ['Apartment', 'House', 'Land', 'Farm']
is_selected_type = df['Type'].isin(selected_types)
filtered_df = df.loc[is_selected_type].copy()

In [8]:
# Overview of the listings after the property-type filter.
summary(filtered_df)

Unnamed: 0,Data Type,Missing#,Missing%,Dups,Uniques,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
Price,float64,229,0.00226,0,4209,101105,1.0,1380000000.0,358327.543748,4376146.009492,780000.0,223000.0,228000.0
District,object,0,0.0,0,24,101334,,,,,Vila Real,Faro,Faro
City,object,0,0.0,0,270,101334,,,,,Valpaços,São Brás de Alportel,São Brás de Alportel
Town,object,2,2e-05,0,2233,101332,,,,,Carrazedo de Montenegro e Curros,São Brás de Alportel,São Brás de Alportel
Type,object,0,0.0,0,4,101334,,,,,Farm,Apartment,Apartment
EnergyCertificate,object,0,0.0,0,12,101334,,,,,NC,A+,A+
GrossArea,float64,78006,0.769791,0,2080,23328,-7.0,2181280.0,1626.677898,26743.570967,200.0,,
TotalArea,float64,6808,0.067184,0,6914,94526,-7196067.0,61420071105.0,668721.719379,199776237.283593,552450.0,81.0,108.0
Parking,float64,124,0.001224,0,4,101210,0.0,3.0,0.585891,0.869686,0.0,1.0,1.0
HasParking,object,44989,0.443967,0,2,56345,,,,,False,True,True


In [9]:
# Drop every row whose NumberOfBedrooms is missing; dropna returns a new frame,
# which is bound back to filtered_df.
filtered_df = filtered_df.dropna(subset=['NumberOfBedrooms'])

In [10]:
# Renumber the rows 0..n-1 (the old index is discarded), then inspect the frame.
filtered_df = filtered_df.reset_index(drop=True)
summary(filtered_df)

Unnamed: 0,Data Type,Missing#,Missing%,Dups,Uniques,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
Price,float64,102,0.003081,0,3033,32999,1.0,20000000.0,400981.360494,558113.509553,98000.0,299900.0,62000.0
District,object,0,0.0,0,24,33101,,,,,Aveiro,Aveiro,Aveiro
City,object,0,0.0,0,267,33101,,,,,Albergaria-a-Velha,Águeda,Ovar
Town,object,1,3e-05,0,2094,33100,,,,,Albergaria-a-Velha e Valmaior,Valongo do Vouga,Válega
Type,object,0,0.0,0,4,33101,,,,,Apartment,Farm,House
EnergyCertificate,object,0,0.0,0,11,33101,,,,,F,NC,F
GrossArea,float64,33101,1.0,0,0,0,,,,,,,
TotalArea,float64,177,0.005347,0,1851,32924,-271.0,920000.0,880.457842,11381.191139,81.0,446.0,336.0
Parking,float64,0,0.0,0,2,33101,0.0,1.0,0.373826,0.483825,0.0,0.0,1.0
HasParking,object,33101,1.0,0,0,0,,,,,,,


In [11]:
# Slim the frame down (originally to try ydata_profiling with a smaller data
# set): remove the listed columns, then the rows that have no Price.
columns_to_drop = [
    "GrossArea", "HasParking", "Floor", "PublishDate",
    "TotalRooms", "ConservationStatus", "LotSize", "BuiltArea",
]
filtered_df = filtered_df.drop(columns=columns_to_drop)
filtered_df = filtered_df.dropna(subset=['Price'])



In [12]:
# Renumber the rows after the removals above (old index discarded), then
# re-inspect the frame.
filtered_df = filtered_df.reset_index(drop=True)
summary(filtered_df)

Unnamed: 0,Data Type,Missing#,Missing%,Dups,Uniques,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
Price,float64,0,0.0,1058,3033,32999,1.0,20000000.0,400981.360494,558113.509553,98000.0,299900.0,62000.0
District,object,0,0.0,1058,24,32999,,,,,Aveiro,Aveiro,Aveiro
City,object,0,0.0,1058,267,32999,,,,,Albergaria-a-Velha,Águeda,Ovar
Town,object,1,3e-05,1058,2094,32998,,,,,Albergaria-a-Velha e Valmaior,Valongo do Vouga,Válega
Type,object,0,0.0,1058,4,32999,,,,,Apartment,Farm,House
EnergyCertificate,object,0,0.0,1058,11,32999,,,,,F,NC,F
TotalArea,float64,177,0.005364,1058,1849,32822,-271.0,920000.0,882.560356,11398.798125,81.0,446.0,336.0
Parking,float64,0,0.0,1058,2,32999,0.0,1.0,0.373163,0.483652,0.0,0.0,1.0
ConstructionYear,float64,4024,0.121943,1058,120,28975,1900.0,2024.0,1990.627748,28.217448,1992.0,1951.0,1950.0
EnergyEfficiencyLevel,object,0,0.0,1058,11,32999,,,,,F,NC,F


In [13]:
# Remove listings that have no ConstructionYear.
filtered_df = filtered_df.dropna(subset=['ConstructionYear'])

In [14]:
# Remove listings that have no NumberOfWC.
filtered_df = filtered_df.dropna(subset=['NumberOfWC'])

In [15]:
# Renumber the rows 0..n-1 (old index discarded), then inspect the frame.
filtered_df = filtered_df.reset_index(drop=True)
summary(filtered_df)

Unnamed: 0,Data Type,Missing#,Missing%,Dups,Uniques,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
Price,float64,0,0.0,569,2543,25864,600.0,16800000.0,399597.442487,556365.368372,98000.0,299900.0,290000.0
District,object,0,0.0,569,23,25864,,,,,Aveiro,Aveiro,Aveiro
City,object,0,0.0,569,266,25864,,,,,Albergaria-a-Velha,Águeda,Santa Maria da Feira
Town,object,0,0.0,569,2033,25864,,,,,Albergaria-a-Velha e Valmaior,Valongo do Vouga,"Canedo, Vale e Vila Maior"
Type,object,0,0.0,569,4,25864,,,,,Apartment,Farm,House
EnergyCertificate,object,0,0.0,569,11,25864,,,,,F,NC,F
TotalArea,float64,31,0.001199,569,1349,25833,-271.0,920000.0,696.878295,11270.079957,81.0,446.0,137.0
Parking,float64,0,0.0,569,2,25864,0.0,1.0,0.354624,0.478409,0.0,0.0,0.0
ConstructionYear,float64,0,0.0,569,119,25864,1900.0,2024.0,1990.704299,28.500401,1992.0,1951.0,1992.0
EnergyEfficiencyLevel,object,0,0.0,569,11,25864,,,,,F,NC,F


In [16]:
# Remove listings that have no TotalArea.
filtered_df = filtered_df.dropna(subset=['TotalArea'])

In [17]:
# Remove listings that have no LivingArea.
filtered_df = filtered_df.dropna(subset=['LivingArea'])

In [18]:
# Renumber the rows 0..n-1 (old index discarded), then inspect the frame.
filtered_df = filtered_df.reset_index(drop=True)
summary(filtered_df)

Unnamed: 0,Data Type,Missing#,Missing%,Dups,Uniques,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
Price,float64,0,0.0,567,2534,25668,600.0,16800000.0,398188.589001,551267.235558,98000.0,299900.0,290000.0
District,object,0,0.0,567,23,25668,,,,,Aveiro,Aveiro,Aveiro
City,object,0,0.0,567,265,25668,,,,,Albergaria-a-Velha,Águeda,Santa Maria da Feira
Town,object,0,0.0,567,2026,25668,,,,,Albergaria-a-Velha e Valmaior,Valongo do Vouga,"Canedo, Vale e Vila Maior"
Type,object,0,0.0,567,4,25668,,,,,Apartment,Farm,House
EnergyCertificate,object,0,0.0,567,11,25668,,,,,F,NC,F
TotalArea,float64,0,0.0,567,1321,25668,-271.0,920000.0,687.143291,11279.715196,81.0,446.0,137.0
Parking,float64,0,0.0,567,2,25668,0.0,1.0,0.355034,0.478533,0.0,0.0,0.0
ConstructionYear,float64,0,0.0,567,119,25668,1900.0,2024.0,1990.790634,28.466682,1992.0,1951.0,1992.0
EnergyEfficiencyLevel,object,0,0.0,567,11,25668,,,,,F,NC,F


In [19]:
# (rows, columns) of the cleaned frame.
filtered_df.shape

(25668, 17)

In [20]:
# Write the cleaned frame to filtered_df.csv (path relative to the working
# directory); index=False leaves the row index out of the file.
filtered_df.to_csv('filtered_df.csv', index=False)

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [28]:
# Model inputs, split into the columns treated as categorical and the columns
# treated as numerical.
categorical_features = [
    'District',
    'City',
    'Town',
    'Type',
    'EnergyCertificate',
    'EnergyEfficiencyLevel',
    'Garage',
    'Elevator',
    'ElectricCarsCharging',
]
numerical_features = [
    'TotalArea',
    'Parking',
    'ConstructionYear',
    'NumberOfBedrooms',
    'NumberOfWC',
    'LivingArea',
    'NumberOfBathrooms',
]


In [29]:
# Separating features and target.
# .copy() gives X its own data, so the column assignments below modify X only
# and are not chained assignments on a slice of filtered_df.
X = filtered_df[categorical_features + numerical_features].copy()
y = filtered_df['Price']

# 1. Label-encode categorical features (one fitted encoder per column is kept
#    in label_encoders for future use).
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# 2. Train-test split. This happens BEFORE scaling so that no statistic of the
#    test rows leaks into the preprocessing. X itself keeps the encoded,
#    unscaled values.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = X_train.copy(), X_test.copy()

# 3. Standard-scale numerical features: mean/std are learned from the training
#    rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [37]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [50, 100, 200, 300],   # Number of trees in the forest
    'max_depth': [5, 10, 20, 30],          # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
    # Number of features to consider when looking for the best split.
    # 1.0 (= all features) replaces the former 'auto' option, which meant the
    # same thing for regressors but was removed in scikit-learn 1.3; with
    # 'auto' every candidate using it fails to fit on current versions.
    'max_features': [1.0, 'sqrt', 'log2']
}

# Setting up the GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # Optimization metric
    n_jobs=-1,  # Use all available cores
    verbose=2  # Output progress
)

# Perform the grid search (cross-validated on the training split only)
grid_search.fit(X_train, y_train)

# Best hyperparameters and corresponding model (refitted on the whole training split)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)

# Evaluate the best model on the held-out test set
y_pred = best_model.predict(X_test)

# Metrics for the best model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Model Mean Squared Error:", mse)
print("Best Model R2 Score:", r2)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best Hyperparameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Model Mean Squared Error: 124840177611.6476
Best Model R2 Score: 0.6041779544020307


In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least-squares baseline: create the estimator and fit it on the
# training split in one step (fit returns the estimator itself).
lin_regressor = LinearRegression().fit(X_train, y_train)

# Score the held-out split with the same two metrics used for the other models.
y_pred = lin_regressor.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 213660199065.48862
R^2 Score: 0.32256250611837867


In [32]:
from sklearn.tree import DecisionTreeRegressor, plot_tree

# Single-tree baseline; max_depth limits how deep the tree may grow and can be
# adjusted as needed. fit returns the estimator itself.
dt_regressor = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = dt_regressor.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Optional tree plot, left disabled (plt is not imported in the cells shown here):
#plt.figure(figsize=(20, 10))
#plot_tree(dt_regressor, feature_names=X.columns, filled=True, rounded=True, fontsize=12)
#plt.title("Decision Tree Visualization", fontsize=16)
#plt.show()

Mean Squared Error: 181434122425.35922
R^2 Score: 0.4247394800808286
