In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

pd.set_option('display.max_columns', None) #Removes upper limit on the number of columns to display

In [73]:
#Load data
data = pd.read_csv('house_train.csv')
test_data = pd.read_csv('house_test.csv')
#Save transaction id for testing before we start feature engineering
transaction_id_test = test_data['TRANSACTION_ID']
#Transform price to log price. This makes it approximately follow a Gaussian distribution.
data['PRICE'] = np.log(data['PRICE'])
test_data['PRICE'] = np.log(test_data['PRICE'])

data.head()
test_data.head()

Unnamed: 0,TRANSACTION_ID,BUILDING_ID,UNIT_ID,FLOOR,MUNICIPALITY_CODE,ZIP_CODE,STREET_CODE,TRADE_DATE,PRICE,SQM_PRICE,CONSTRUCTION_YEAR,REBUILDING_YEAR,AREA_TINGLYST,AREA_RESIDENTIAL,AREA_OTHER,AREA_COMMON_ACCESS_SHARE,AREA_CLOSED_COVER_OUTHOUSE,AREA_OPEN_BALCONY_ROOFTOP,NUMBER_ROOMS,FACILITIES_TOILET,FACILITIES_SHOWER,FACILITIES_KITCHEN,HAS_ELEVATOR,LNG,LAT,DISTANCE_LAKE,DISTANCE_HARBOUR,DISTANCE_COAST
0,f22f66a6-eda2-d049-ea24-09df907e8f38,cf96cec5-f3c6-1b52-6784-c3a25647f8b2,24e6b3bb-29b2-3a1e-7548-622f9c086a08,2,265,4000.0,2656573.0,2024-10-06,,,1962.0,1998.0,71,76,,,,,3,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,False,12.094847,55.627949,1415.2,2613.7,2612.8
1,c8cb2732-203f-3b00-4b2b-78ea45ebb5b2,4495a41f-561b-06db-b33c-384a1a83b158,41c02eab-2b4a-3a43-1c92-7dca8398cf54,2,187,2625.0,187106.0,2024-10-04,,,1964.0,,63,68,,,,,2,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,False,12.354648,55.652842,871.7,4984.9,4886.3
2,c8f1227e-762c-995d-1315-89bc66f6a9d3,d1a02511-53e8-98e0-55a6-db15d7a71745,3e16a170-88da-794c-d4e6-fe3a218cc33e,3,101,2150.0,1011882.0,2024-10-04,,,2017.0,,69,83,,,,13.0,3,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,True,12.598294,55.708251,637.8,23.0,70.3
3,504eb7fd-becf-b3c3-9cc9-53492c7b4bcc,ac0d66d6-3a0a-1cf8-3075-7f9f0dd3e275,c00f9c8a-6cf2-63a9-6c42-a9370bc963ef,3,101,2200.0,101508.0,2024-10-03,,,1892.0,,45,52,,,,5.0,2,T: Vandskyllende toilet i enheden,C: Adgang til badeværelse,E: Eget køkken med afløb,False,12.558023,55.690497,453.1,1935.4,2573.5
4,a1a89478-d718-4df0-cc6c-bec4ef109e19,0e3f916c-85b8-c9bb-5517-18814f1efd8d,366333f0-a55f-4efd-1a7f-90c234b52d5a,2,153,2605.0,15371.0,2024-10-03,,,1967.0,,43,46,,,,,2,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,False,12.409968,55.657729,1569.5,4906.3,4983.6


## Feature Engineering

In [74]:
#Calculates mean sqm price for municipality.
data_mean_sqm_price = data.groupby("MUNICIPALITY_CODE")["SQM_PRICE"].mean()
data["MUNICIPALITY_MEAN_SQM_PRICE"] = data.apply(
    lambda row: data_mean_sqm_price[row["MUNICIPALITY_CODE"]],
    axis=1
)
test_data["MUNICIPALITY_MEAN_SQM_PRICE"] = test_data.apply(
    lambda row: data_mean_sqm_price[row["MUNICIPALITY_CODE"]],
    axis=1
)

#Calculates mean sqm price for street.
street_mean_sqm_price = data.groupby("STREET_CODE")["SQM_PRICE"].mean()

data["STREET_CODE_MEAN_SQM_PRICE"] = data["STREET_CODE"].map(street_mean_sqm_price)
test_data["STREET_CODE_MEAN_SQM_PRICE"] = test_data["STREET_CODE"].map(street_mean_sqm_price)

data["STREET_CODE_MEAN_SQM_PRICE"].fillna(data["SQM_PRICE"].mean(), inplace=True)
test_data["STREET_CODE_MEAN_SQM_PRICE"].fillna(data["SQM_PRICE"].mean(), inplace=True)

data.head()
test_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["STREET_CODE_MEAN_SQM_PRICE"].fillna(data["SQM_PRICE"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data["STREET_CODE_MEAN_SQM_PRICE"].fillna(data["SQM_PRICE"].mean(), inplace=True)


Unnamed: 0,TRANSACTION_ID,BUILDING_ID,UNIT_ID,FLOOR,MUNICIPALITY_CODE,ZIP_CODE,STREET_CODE,TRADE_DATE,PRICE,SQM_PRICE,CONSTRUCTION_YEAR,REBUILDING_YEAR,AREA_TINGLYST,AREA_RESIDENTIAL,AREA_OTHER,AREA_COMMON_ACCESS_SHARE,AREA_CLOSED_COVER_OUTHOUSE,AREA_OPEN_BALCONY_ROOFTOP,NUMBER_ROOMS,FACILITIES_TOILET,FACILITIES_SHOWER,FACILITIES_KITCHEN,HAS_ELEVATOR,LNG,LAT,DISTANCE_LAKE,DISTANCE_HARBOUR,DISTANCE_COAST,MUNICIPALITY_MEAN_SQM_PRICE,STREET_CODE_MEAN_SQM_PRICE
0,f22f66a6-eda2-d049-ea24-09df907e8f38,cf96cec5-f3c6-1b52-6784-c3a25647f8b2,24e6b3bb-29b2-3a1e-7548-622f9c086a08,2,265,4000.0,2656573.0,2024-10-06,,,1962.0,1998.0,71,76,,,,,3,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,False,12.094847,55.627949,1415.2,2613.7,2612.8,26822.223099,25159.54142
1,c8cb2732-203f-3b00-4b2b-78ea45ebb5b2,4495a41f-561b-06db-b33c-384a1a83b158,41c02eab-2b4a-3a43-1c92-7dca8398cf54,2,187,2625.0,187106.0,2024-10-04,,,1964.0,,63,68,,,,,2,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,False,12.354648,55.652842,871.7,4984.9,4886.3,22081.724044,18654.296053
2,c8f1227e-762c-995d-1315-89bc66f6a9d3,d1a02511-53e8-98e0-55a6-db15d7a71745,3e16a170-88da-794c-d4e6-fe3a218cc33e,3,101,2150.0,1011882.0,2024-10-04,,,2017.0,,69,83,,,,13.0,3,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,True,12.598294,55.708251,637.8,23.0,70.3,43109.941311,59125.474359
3,504eb7fd-becf-b3c3-9cc9-53492c7b4bcc,ac0d66d6-3a0a-1cf8-3075-7f9f0dd3e275,c00f9c8a-6cf2-63a9-6c42-a9370bc963ef,3,101,2200.0,101508.0,2024-10-03,,,1892.0,,45,52,,,,5.0,2,T: Vandskyllende toilet i enheden,C: Adgang til badeværelse,E: Eget køkken med afløb,False,12.558023,55.690497,453.1,1935.4,2573.5,43109.941311,55779.191489
4,a1a89478-d718-4df0-cc6c-bec4ef109e19,0e3f916c-85b8-c9bb-5517-18814f1efd8d,366333f0-a55f-4efd-1a7f-90c234b52d5a,2,153,2605.0,15371.0,2024-10-03,,,1967.0,,43,46,,,,,2,T: Vandskyllende toilet i enheden,V: Badeværelser i enheden,E: Eget køkken med afløb,False,12.409968,55.657729,1569.5,4906.3,4983.6,24220.261411,22869.171717


In [75]:
#Generates a report for exploratory data analysis and saves it to the folder containing this script.
import os
from ydata_profiling import ProfileReport
if not os.path.isfile("eda_report.html"):
    profile_report = ProfileReport(df=data)
    profile_report.to_file("eda_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [76]:
def clean_data(data):
    #Drop unneeded columns.
    columns_to_drop = ["TRANSACTION_ID", "BUILDING_ID", "UNIT_ID", "SQM_PRICE", "MUNICIPALITY_CODE"]
    try: 
        data = data.drop(columns=columns_to_drop)
    except KeyError:
        pass #Error occurs if columns have already been dropped.
    
    #For the columns with missing values, we replace the empty values by the median of the column.
    columns_to_fill_missing_values = ["REBUILDING_YEAR", "AREA_OTHER", "AREA_COMMON_ACCESS_SHARE", "AREA_CLOSED_COVER_OUTHOUSE", "AREA_OPEN_BALCONY_ROOFTOP", "ZIP_CODE", "STREET_CODE", "CONSTRUCTION_YEAR"]
    for column in columns_to_fill_missing_values:
        data[column] = data[column].fillna(data[column].median())

    #Floor column is not numeric, so we replace the empty rows with a string instead.
    data["FLOOR"] = data["FLOOR"].fillna("na")

    #Convert datetime to year and month.
    data['TRADE_DATE'] = pd.to_datetime(data['TRADE_DATE'])
    data['YEAR'] = data['TRADE_DATE'].dt.year
    data['MONTH'] = data['TRADE_DATE'].dt.month
    
    #Calculate time since construction as a feature
    data['TIME_SINCE_CONSTRUCTION'] = data['TRADE_DATE'].dt.year - data['CONSTRUCTION_YEAR']

    #Drop datetime since it is no longer needed
    data = data.drop(columns=['TRADE_DATE'])
    return data

data = clean_data(data)
test_data = clean_data(test_data)

In [77]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

data.head()
def scale_data(train_data, test_data):
    #we scale all non-binary features to improve model performance
    #we don't normalize price, since that's our y to predict
    standard_scaler = StandardScaler()
    columns_to_feature_scale = ["YEAR", "MONTH", "STREET_CODE_MEAN_SQM_PRICE", "TIME_SINCE_CONSTRUCTION", "MUNICIPALITY_MEAN_SQM_PRICE", "CONSTRUCTION_YEAR", "REBUILDING_YEAR", "AREA_TINGLYST", "AREA_RESIDENTIAL", "AREA_OTHER", "AREA_COMMON_ACCESS_SHARE", "AREA_CLOSED_COVER_OUTHOUSE", "AREA_OPEN_BALCONY_ROOFTOP", "NUMBER_ROOMS", "DISTANCE_LAKE", "DISTANCE_HARBOUR", "DISTANCE_COAST"]

    train_data[columns_to_feature_scale] = standard_scaler.fit_transform(train_data[columns_to_feature_scale])
    test_data[columns_to_feature_scale] = standard_scaler.transform(test_data[columns_to_feature_scale])
    
    train_data['FLOOR'] = train_data['FLOOR'].apply(lambda x: '3' if x.isdigit() and int(x) > 3 else x)
    test_data['FLOOR'] = test_data['FLOOR'].apply(lambda x: '3' if x.isdigit() and int(x) > 3 else x)

    encoder = OneHotEncoder(drop='first', sparse_output=False)
    train_encoded = encoder.fit_transform(train_data[['FLOOR', 'FACILITIES_TOILET', 'FACILITIES_SHOWER', 'FACILITIES_KITCHEN']])
    test_encoded = encoder.transform(test_data[['FLOOR', 'FACILITIES_TOILET', 'FACILITIES_SHOWER', 'FACILITIES_KITCHEN']])

    train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out())
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out())

    train_data = pd.concat([train_data.reset_index(drop=True), train_encoded_df.reset_index(drop=True)], axis=1)
    test_data = pd.concat([test_data.reset_index(drop=True), test_encoded_df.reset_index(drop=True)], axis=1)

    train_data = train_data.drop(columns=['FLOOR', 'FACILITIES_TOILET', 'FACILITIES_SHOWER', 'FACILITIES_KITCHEN'])
    test_data = test_data.drop(columns=['FLOOR', 'FACILITIES_TOILET', 'FACILITIES_SHOWER', 'FACILITIES_KITCHEN'])
    
    train_data = train_data.drop(columns=['ZIP_CODE', 'STREET_CODE',], errors='ignore')
    test_data = test_data.drop(columns=['ZIP_CODE', 'STREET_CODE'], errors='ignore')

    return train_data, test_data

data, test_data = scale_data(data, test_data)

#show top 5 rows
data.head()


Unnamed: 0,PRICE,CONSTRUCTION_YEAR,REBUILDING_YEAR,AREA_TINGLYST,AREA_RESIDENTIAL,AREA_OTHER,AREA_COMMON_ACCESS_SHARE,AREA_CLOSED_COVER_OUTHOUSE,AREA_OPEN_BALCONY_ROOFTOP,NUMBER_ROOMS,HAS_ELEVATOR,LNG,LAT,DISTANCE_LAKE,DISTANCE_HARBOUR,DISTANCE_COAST,MUNICIPALITY_MEAN_SQM_PRICE,STREET_CODE_MEAN_SQM_PRICE,YEAR,MONTH,TIME_SINCE_CONSTRUCTION,FLOOR_2,FLOOR_3,FLOOR_kl,FLOOR_na,FLOOR_st,FACILITIES_TOILET_B: Intet vandskyllende toilet,FACILITIES_TOILET_T: Vandskyllende toilet i enheden,FACILITIES_SHOWER_D: Hverken badeværelse eller adgang til badeværelse,FACILITIES_SHOWER_V: Badeværelser i enheden,FACILITIES_KITCHEN_F: Adgang til fælles køkken,FACILITIES_KITCHEN_G: Fast kogeinstallation i værelse eller på gang,FACILITIES_KITCHEN_H: Ingen fast kogeinstallation
0,15.250595,0.321784,0.069108,0.532912,0.615859,-0.055133,-0.033663,-0.005372,-0.2702,0.237836,False,12.522463,55.692823,0.272304,0.36547,0.222343,1.265248,0.297865,1.849094,-0.08305,-0.215575,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,15.046612,1.416627,0.069108,-0.722708,-0.532459,-0.055133,-0.033663,-0.005372,-0.128133,0.237836,True,12.556716,55.65207,-0.355159,-0.663611,-0.628456,0.925935,1.598902,1.849094,-0.08305,-1.309531,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,15.237413,-0.472514,-3.205841,-0.244377,-0.222103,-0.055133,-0.033663,-0.005372,-0.128133,0.237836,False,12.529022,55.691044,0.192585,0.242152,0.161167,1.265248,0.883969,1.849094,-0.08305,0.57808,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,14.601693,0.514992,0.069108,0.712286,0.460681,-0.055133,-0.033663,-0.005372,-0.128133,1.222516,False,12.492201,55.911679,0.615676,0.303946,-0.298599,-1.415407,-1.234327,1.849094,-0.08305,-0.408626,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,14.253765,-1.009201,-1.624831,0.383433,-0.09796,-0.055133,0.58321,-0.005372,-0.128133,0.237836,False,10.371957,55.39595,1.80775,-0.329146,-0.342923,-1.103059,-0.956726,1.849094,-0.08305,1.114333,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [78]:
#Setup x and y.
y_train = data["PRICE"]
X_train = data.drop(["PRICE"], axis=1)
y_test = test_data["PRICE"]
X_test = test_data.drop(["PRICE"], axis=1)

## Comparing Algorithms

In [83]:
#This section is comparing multiple algorithms:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate
import pandas as pd

#Number of rows to train on.
n_samples = 50000
random_indices = np.random.choice(X_train.shape[0], size=n_samples, replace=False)

X_train_reset = X_train.reset_index(drop=True)
y_train_reset = y_train.reset_index(drop=True)

X_train_subset = X_train_reset.iloc[random_indices]
y_train_subset = y_train_reset.iloc[random_indices]

#Defines the models.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(),
    'HistGradientBoosting': HistGradientBoostingRegressor(),
    'XGBoost': XGBRegressor(eval_metric='rmse'),
}

results = {}

#Trains and cross-validates each model.
for model_name, model in models.items():
    print(f"Training {model_name}")
    cv_results = cross_validate(
        model, X_train_subset, y_train_subset,
        scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'],
        cv=3, 
        return_train_score=False,
        n_jobs=-1
    )
    
    results[model_name] = {
        'MAE': -cv_results['test_neg_mean_absolute_error'],
        'MSE': -cv_results['test_neg_mean_squared_error'],
        'R-squared': cv_results['test_r2']
    }

#Summarizes the results.
results_summary = pd.DataFrame({
    model_name: {
        'MAE Mean': values['MAE'].mean(),
        'MAE Std': values['MAE'].std(),
        'MSE Mean': values['MSE'].mean(),
        'MSE Std': values['MSE'].std(),
        'R-squared Mean': values['R-squared'].mean(),
        'R-squared Std': values['R-squared'].std()
    } for model_name, values in results.items()
}).T

print(results_summary)


Training Linear Regression
Training Ridge Regression
Training Lasso Regression
Training Random Forest


KeyboardInterrupt: 

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

#First model: Linear regression. Chosen due to its simplicity.
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

pred = lin_model.predict(X_test)
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
cross_validate(lin_model, X_train, y_train, scoring=scoring, cv=5, return_train_score=False)

{'fit_time': array([0.29627419, 0.23209643, 0.18280506, 0.14664054, 0.12791252]),
 'score_time': array([0.01104259, 0.0092659 , 0.00745106, 0.00700116, 0.00800085]),
 'test_neg_mean_absolute_error': array([-0.16867294, -0.16392723, -0.14722774, -0.15476017, -0.17025426]),
 'test_neg_mean_squared_error': array([-0.05791642, -0.04518976, -0.04117433, -0.04393354, -0.05423865]),
 'test_r2': array([0.86090316, 0.88986935, 0.88626109, 0.8782168 , 0.84331204])}

## Random Forest

In [None]:
#Second model: Random forest. Chosen because it was the best performing in initial testing.
from sklearn.ensemble import RandomForestRegressor

random_forest_model = RandomForestRegressor(n_estimators=400,
                                            max_depth=25,
                                            n_jobs=-1)
random_forest_model.fit(X_train, y_train)

pred = random_forest_model.predict(X_test)

## Neural Network

In [85]:
#Third model: Sequential neural network. It did not manage to outperform random forest.
from keras import layers, models
from keras import callbacks
from keras import optimizers

early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

#Defines the model
model = models.Sequential([
    layers.Dense(256*2, activation="relu", input_shape=(X_train.shape[1],)),
    layers.Dense(128*2, activation="relu"),    
    layers.Dense(64*2, activation="relu"),
    layers.Dense(1, activation="linear")
])

model.compile(optimizer=optimizers.Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2, callbacks=[early_stopping])
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}')

#Graph training.
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
pred = model.predict(X_test)
model.save('price_prediction_model.h5')



Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 12.7440 - mae: 0.6698 - val_loss: 0.0488 - val_mae: 0.1565
Epoch 2/100
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0698 - mae: 0.1717 - val_loss: 0.0410 - val_mae: 0.1452
Epoch 3/100
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0863 - mae: 0.1774 - val_loss: 0.0523 - val_mae: 0.1772
Epoch 4/100
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.2037 - mae: 0.2103 - val_loss: 0.0585 - val_mae: 0.1759
Epoch 5/100
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0447 - mae: 0.1577 - val_loss: 0.0393 - val_mae: 0.1412
Epoch 6/100
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0461 - mae: 0.1618 - val_loss: 0.0498 - val_mae: 0.1648
Epoch 7/100
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2

  plt.show()


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step




## XGBoost

In [88]:
#Fourth model: XGBoost. Chosen because it started to outperform random forest after additional feature engineering.
from xgboost import XGBRegressor

xgboost_model = XGBRegressor(max_depth=10,
    learning_rate=0.08,            #Learning rate of 0.08 with 840 estimators gave the best 20% performance.
    n_estimators=840,              #Learning rate of 0.05 with 1200 estimators gave the best 5% performance.
    objective='reg:squarederror', 
    subsample=0.8,                
    colsample_bytree=0.8,         
    n_jobs=-1,
    eval_metric="rmse",
)
eval_set = [(X_train, y_train)]
    
xgboost_model.fit(X_train, y_train, eval_set=eval_set)
pred = xgboost_model.predict(X_test)

scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

results = xgboost_model.evals_result()

[0]	validation_0-rmse:0.58259
[1]	validation_0-rmse:0.54130
[2]	validation_0-rmse:0.50377
[3]	validation_0-rmse:0.46816
[4]	validation_0-rmse:0.43519
[5]	validation_0-rmse:0.40675
[6]	validation_0-rmse:0.38280
[7]	validation_0-rmse:0.35868
[8]	validation_0-rmse:0.33532
[9]	validation_0-rmse:0.31500
[10]	validation_0-rmse:0.29563
[11]	validation_0-rmse:0.27905
[12]	validation_0-rmse:0.26304
[13]	validation_0-rmse:0.24857
[14]	validation_0-rmse:0.23627
[15]	validation_0-rmse:0.22519
[16]	validation_0-rmse:0.21449
[17]	validation_0-rmse:0.20492
[18]	validation_0-rmse:0.19642
[19]	validation_0-rmse:0.18881
[20]	validation_0-rmse:0.18233
[21]	validation_0-rmse:0.17615
[22]	validation_0-rmse:0.17100
[23]	validation_0-rmse:0.16612
[24]	validation_0-rmse:0.16177
[25]	validation_0-rmse:0.15791
[26]	validation_0-rmse:0.15454
[27]	validation_0-rmse:0.15150
[28]	validation_0-rmse:0.14880
[29]	validation_0-rmse:0.14643
[30]	validation_0-rmse:0.14431
[31]	validation_0-rmse:0.14239
[32]	validation_0-

All of the best performing models stopped training around a validation MSE of 0.67-0.73. This appears to be ideal - if we go lower, we start to overfit, and if we go higher, we start to underfit.

## Hyperparameter Tuning

In [None]:
#Hyperparameter tuning for XGBoost.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

param_grid = {
    'max_depth': [10],
    'objective': ['reg:squarederror'],
    'learning_rate': [ 0.05, 0.1], 
    'n_estimators': [500],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

#Initializes the model.
xgboost_model = XGBRegressor(max_depth=10, objective='reg:squarederror', n_jobs=-1)

#Sets up grid search.
grid_search = GridSearchCV(
    estimator=xgboost_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Use an appropriate scoring metric
    cv=2,
    verbose=1,
    n_jobs=-1
)

#Runs the grid search.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 500, 'objective': 'reg:squarederror', 'subsample': 0.9}
Best Score: 0.0417844388155762


                              MAE Mean       MAE Std      MSE Mean  \
Linear Regression         8.529019e+05    607.830480  1.798478e+12   
Ridge Regression          8.529758e+05    590.852778  1.797454e+12   
Lasso Regression          8.529018e+05    607.904049  1.798473e+12   
Random Forest             5.576356e+05   9971.338426  1.053942e+12   
Support Vector Regressor  1.345384e+06   6188.680430  4.800320e+12   
HistGradientBoosting      5.899489e+05  12916.581742  1.065085e+12   
XGBoost                   5.609291e+05   9393.500434  9.874340e+11   

                               MSE Std  R-squared Mean  R-squared Std  
Linear Regression         5.291035e+10        0.601023       0.001420  
Ridge Regression          5.403395e+10        0.601257       0.001617  
Lasso Regression          5.292829e+10        0.601024       0.001424  
Random Forest             9.001448e+10        0.766490       0.014345  
Support Vector Regressor  1.357655e+11       -0.064953       0.005290  
HistGradientBoosting      1.048520e+11        0.764013       0.018443  
XGBoost                   3.081381e+10        0.780952       0.001763  

## Identifying Bad Features

In [None]:
#This section is for testing which features contribute the least to accurately predicting prices.
from sklearn.feature_selection import RFE

#Specifies what percentage of features we want to be selected as the best features.
n_features_to_select = 0.5

rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)

#Fits RFE.
rfe.fit(data, y_train.fillna(y_train.median()))

selected_features = data.columns[rfe.support_]
print("Selected features:", selected_features)
print(data.columns)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Selected features: Index(['PRICE', 'AREA_TINGLYST', 'AREA_RESIDENTIAL',
       'AREA_COMMON_ACCESS_SHARE', 'AREA_OPEN_BALCONY_ROOFTOP', 'NUMBER_ROOMS',
       'LNG', 'LAT', 'DISTANCE_HARBOUR', 'MUNICIPALITY_MEAN_SQM_PRICE',
       'STREET_CODE_MEAN_SQM_PRICE', 'YEAR', 'MONTH',
       'TIME_SINCE_CONSTRUCTION', 'FLOOR_2', 'FLOOR_3'],
      dtype='object')
Index(['PRICE', 'CONSTRUCTION_YEAR', 'REBUILDING_YEAR', 'AREA_TINGLYST',
       'AREA_RESIDENTIAL', 'AREA_OTHER', 'AREA_COMMON_ACCESS_SHARE',
       'AREA_CLOSED_COVER_OUTHOUSE', 'AREA_OPEN_BALCONY_ROOFTOP',
       'NUMBER_ROOMS', 'HAS_ELEVATOR', 'LNG', 'LAT', 'DISTANCE_LAKE',
       'DISTANCE_HARBOUR', 'DISTANCE_COAST', 'MUNICIPALITY_MEAN_SQM_PRICE',
       'STREET_CODE_MEAN_SQM_PRICE', 'YEAR', 'MONTH',
       'TIME_SINCE_CONSTRUCTION', 'FLOOR_2', 'FLOOR_3', 'FLOOR_kl', 'FLOOR_na',
       'FLOOR_st', 'FACILITIES_TOILET_B: Intet vandskyllende toilet',
       'FACILITIES_TOILET_T: Vandskyllende toilet i enheden',
       'FACILITIES_SHOW

## Prediction Normalization

In [89]:
#Call exp on our predicted log prices to convert them back to normal prices.
test_data["PRICE"] = np.exp(pred)
test_data["TRANSACTION_ID"] = transaction_id_test

predictions = test_data[["TRANSACTION_ID", "PRICE"]].to_dict("records")
predictions[:5] #Print the first 5 predictions. Models tend to perform well when the very first price is close to 2,100,000.

[{'TRANSACTION_ID': 'f22f66a6-eda2-d049-ea24-09df907e8f38',
  'PRICE': 2126397.0},
 {'TRANSACTION_ID': 'c8cb2732-203f-3b00-4b2b-78ea45ebb5b2',
  'PRICE': 1329640.375},
 {'TRANSACTION_ID': 'c8f1227e-762c-995d-1315-89bc66f6a9d3',
  'PRICE': 6051086.0},
 {'TRANSACTION_ID': '504eb7fd-becf-b3c3-9cc9-53492c7b4bcc',
  'PRICE': 3785958.25},
 {'TRANSACTION_ID': 'a1a89478-d718-4df0-cc6c-bec4ef109e19',
  'PRICE': 1322990.875}]

## Post Results to API

In [90]:
#Post to Resights API.
import requests
r  = requests.post(
    url="https://api.resights.dk/hackathon/avm/ejerlejligheder/v1",
    json={
        "name": "Benjamin Waziri",
        "email": "202005735@post.au.dk",
        "predictions": predictions,
    },
)
print(r.json())

{'accuracy_5_perc': 0.4128, 'accuracy_10_perc': 0.6828, 'accuracy_20_perc': 0.9148}


## Result Log

Massive unreadable dump of results in chronological order (starting with my first attempts):

linear model performance: {'accuracy_5_perc': 0.1118, 'accuracy_10_perc': 0.2238, 'accuracy_20_perc': 0.4444}
random forest performance: {'accuracy_5_perc': 0.3182, 'accuracy_10_perc': 0.562, 'accuracy_20_perc': 0.8054}
random forest performance (200n): {'accuracy_5_perc': 0.3194, 'accuracy_10_perc': 0.5628, 'accuracy_20_perc': 0.8116}
from now on with lat, lng:
random forest performance (200n): {'accuracy_5_perc': 0.3624, 'accuracy_10_perc': 0.6194, 'accuracy_20_perc': 0.8764}
random forest performance (200n, 30 depth): {'accuracy_5_perc': 0.3606, 'accuracy_10_perc': 0.6216, 'accuracy_20_perc': 0.8786}
Now with month:
random forest performance (200n, 30 depth) {'accuracy_5_perc': 0.3588, 'accuracy_10_perc': 0.624, 'accuracy_20_perc': 0.8808}
random forest performance (500n, 30 depth) {'accuracy_5_perc': 0.3612, 'accuracy_10_perc': 0.6246, 'accuracy_20_perc': 0.8808}
now with region:
random forest performance (500n, 30 depth) {'accuracy_5_perc': 0.3608, 'accuracy_10_perc': 0.6302, 'accuracy_20_perc': 0.8832}
now without region, but with mean sqm per municipality:
random forest performance (500n, 30 depth) {'accuracy_5_perc': 0.3624, 'accuracy_10_perc': 0.623, 'accuracy_20_perc': 0.8844}
best achieved with hist: {'accuracy_5_perc': 0.3162, 'accuracy_10_perc': 0.5886, 'accuracy_20_perc': 0.8526}
With street:
random forest performance (200n, 15 depth) {'accuracy_5_perc': 0.3324, 'accuracy_10_perc': 0.6116, 'accuracy_20_perc': 0.8866}
random forest performance (300n, 20 depth) {'accuracy_5_perc': 0.3564, 'accuracy_10_perc': 0.6344, 'accuracy_20_perc': 0.8934}
random forest performance (500n, 30 depth) {'accuracy_5_perc': 0.3566, 'accuracy_10_perc': 0.6414, 'accuracy_20_perc': 0.8928}
random forest performance (400n, 25 depth) {'accuracy_5_perc': 0.358, 'accuracy_10_perc': 0.639, 'accuracy_20_perc': 0.893}
Decreasing number of floor options:
random forest performance (400n, 25 depth) {'accuracy_5_perc': 0.3614, 'accuracy_10_perc': 0.6456, 'accuracy_20_perc': 0.8948}
Adding years since construction:
random forest performance (400n, 25 depth) {'accuracy_5_perc': 0.3676, 'accuracy_10_perc': 0.6436, 'accuracy_20_perc': 0.8956}
stacking: {'accuracy_5_perc': 0.3364, 'accuracy_10_perc': 0.6248, 'accuracy_20_perc': 0.8868}
Now trying xgboost with log:
xgboost performance (500n, No depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.3592, 'accuracy_10_perc': 0.6518, 'accuracy_20_perc': 0.8966}
xgboost performance (500n, 10 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.403, 'accuracy_10_perc': 0.6768, 'accuracy_20_perc': 0.9124}
xgboost performance (500n, 20 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.377, 'accuracy_10_perc': 0.6322, 'accuracy_20_perc': 0.8908} WORSE
xgboost performance (500n, 8 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.3962, 'accuracy_10_perc': 0.6682, 'accuracy_20_perc': 0.9106} WORSE
xgboost performance (500n, 12 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.3988, 'accuracy_10_perc': 0.666, 'accuracy_20_perc': 0.9078}WORSE
xgboost performance (500n, 10 depth, learning rate 0.05, squarederror, subsample 0.9, colsample 0.8) {'accuracy_5_perc': 0.391, 'accuracy_10_perc': 0.676, 'accuracy_20_perc': 0.911}WORSE

xgboost performance (500n, 10 depth, learning rate 0.05, squarederror, subsample 0.9, colsample 0.8){'accuracy_5_perc': 0.3964, 'accuracy_10_perc': 0.6812, 'accuracy_20_perc': 0.9112}
xgboost performance (700n, 10 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4082, 'accuracy_10_perc': 0.68, 'accuracy_20_perc': 0.9128}
xgboost performance (800n, 10 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.411, 'accuracy_10_perc': 0.6774, 'accuracy_20_perc': 0.912}
xgboost performance (900n, 10 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4118, 'accuracy_10_perc': 0.6768, 'accuracy_20_perc': 0.9124}
xgboost performance (1000n, 10 depth, learning rate 0.1, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4076, 'accuracy_10_perc': 0.6736, 'accuracy_20_perc': 0.9114}
ensemble: {'accuracy_5_perc': 0.4092, 'accuracy_10_perc': 0.6794, 'accuracy_20_perc': 0.9124}
xgboost performance (900n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4102, 'accuracy_10_perc': 0.6798, 'accuracy_20_perc': 0.9148} 
xgboost performance (1000n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4116, 'accuracy_10_perc': 0.6812, 'accuracy_20_perc': 0.9128}
xgboost performance (1000n, 10 depth, learning rate 0.07, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.3996, 'accuracy_10_perc': 0.676, 'accuracy_20_perc': 0.914}
xgboost performance (900n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4102, 'accuracy_10_perc': 0.6798, 'accuracy_20_perc': 0.9148}
xgboost performance (920n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.41, 'accuracy_10_perc': 0.6784, 'accuracy_20_perc': 0.9152} 
xgboost performance (880n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4118, 'accuracy_10_perc': 0.681, 'accuracy_20_perc': 0.914} 
xgboost performance (860n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4128, 'accuracy_10_perc': 0.6834, 'accuracy_20_perc': 0.9146} 
xgboost performance (840n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4128, 'accuracy_10_perc': 0.6828, 'accuracy_20_perc': 0.9148} BEST 5%
xgboost performance (850n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4126, 'accuracy_10_perc': 0.684, 'accuracy_20_perc': 0.9146}
xgboost performance (840n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.75) {'accuracy_5_perc': 0.3978, 'accuracy_10_perc': 0.674, 'accuracy_20_perc': 0.9152}
xgboost performance (840n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.85) {'accuracy_5_perc': 0.4048, 'accuracy_10_perc': 0.6816, 'accuracy_20_perc': 0.9138}
xgboost performance (840n, 10 depth, learning rate 0.08, squarederror, subsample 0.85, colsample 0.8) {'accuracy_5_perc': 0.4004, 'accuracy_10_perc': 0.677, 'accuracy_20_perc': 0.9132}
xgboost performance (840n, 10 depth, learning rate 0.08, squarederror, subsample 0.75, colsample 0.8) {'accuracy_5_perc': 0.4002, 'accuracy_10_perc': 0.6708, 'accuracy_20_perc': 0.9104}
xgboost performance (840n, 10 depth, learning rate 0.075, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4008, 'accuracy_10_perc': 0.6882, 'accuracy_20_perc': 0.914} BEST 10%
xgboost performance (860n, 10 depth, learning rate 0.075, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.3996, 'accuracy_10_perc': 0.6868, 'accuracy_20_perc': 0.9138}
xgboost performance (1200n, 10 depth, learning rate 0.05, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.404, 'accuracy_10_perc': 0.6808, 'accuracy_20_perc': 0.9182} BEST 20%
xgboost performance (1300n, 10 depth, learning rate 0.05, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4034, 'accuracy_10_perc': 0.681, 'accuracy_20_perc': 0.9168}

xgboost performance (1400n, 10 depth, learning rate 0.05, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4024, 'accuracy_10_perc': 0.6824, 'accuracy_20_perc': 0.9168}
xgboost performance (1500n, 10 depth, learning rate 0.04, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.4036, 'accuracy_10_perc': 0.6846, 'accuracy_20_perc': 0.9172}
xgboost performance (1600n, 10 depth, learning rate 0.04, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.406, 'accuracy_10_perc': 0.685, 'accuracy_20_perc': 0.9178}
xgboost performance (2200n, 10 depth, learning rate 0.03, squarederror, subsample 0.8, colsample 0.8) {'accuracy_5_perc': 0.41, 'accuracy_10_perc': 0.6852, 'accuracy_20_perc': 0.9154}

ALPHA, LAMBDA EXPERIMENTATION:
xgboost performance (1200n, 10 depth, learning rate 0.05, squarederror, subsample 0.8, colsample 0.8, alpha 0.5) {'accuracy_5_perc': 0.4028, 'accuracy_10_perc': 0.6848, 'accuracy_20_perc': 0.9128}
xgboost performance (1200n, 10 depth, learning rate 0.05, squarederror, subsample 0.8, colsample 0.8, alpha 0.0, lambda 1.5) {'accuracy_5_perc': 0.3956, 'accuracy_10_perc': 0.6872, 'accuracy_20_perc': 0.9142}
xgboost performance (1200n, 10 depth, learning rate 0.05, squarederror, subsample 0.8, colsample 0.8, alpha 0.0, lambda 0.5) {'accuracy_5_perc': 0.4094, 'accuracy_10_perc': 0.6836, 'accuracy_20_perc': 0.9156}
xgboost performance (840n, 10 depth, learning rate 0.08, squarederror, subsample 0.8, colsample 0.8, alpha 0.0, lambda 1.1) {'accuracy_5_perc': 0.404, 'accuracy_10_perc': 0.6794, 'accuracy_20_perc': 0.914} (lambda 0.9 similar, so 1.0 seems to be optimal for lambda)






