# Machine Learning 1 - Regression on appartments dataset
### Zofia Broszczak

# Packages and data import

In [44]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

import pickle

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

from scipy.stats import chi2_contingency

from sklearn.preprocessing import StandardScaler

from statsmodels.api import OLS, add_constant
from statsmodels.tools.eval_measures import aic, bic
from sklearn.linear_model import LinearRegression

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer

from google.colab import files

In [45]:
from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/My Drive/ML1_2024_2025/_assessment_project'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1eN-piwiant12j137Bx8TLRcvv0ejIF_w/ML1_2024_2025/_assessment_project


In [46]:
appartments_train = pd.read_csv("data/appartments_train.csv")
appartments_test = pd.read_csv("data/appartments_test.csv")

# Data preparation

## Replacing missing values

In [47]:
appartments_train['obj_type'] = appartments_train['obj_type'].fillna('None')
appartments_test['obj_type'] = appartments_test['obj_type'].fillna('None')

appartments_train['build_mat'] = appartments_train['build_mat'].fillna('None') # material is missing in 40% of observations
appartments_test['build_mat'] = appartments_test['build_mat'].fillna('None')

In [48]:
appartments_train = appartments_train.drop(columns=["cond_class"]) # cond_class is missing in 75% of observations
appartments_test = appartments_test.drop(columns=["cond_class"])


appartments_train = appartments_train.drop(columns=["unit_id"]) # unit_id is unhelpful when modelling

unit_ids = appartments_test['unit_id'].copy() #copying for predictions
appartments_test = appartments_test.drop(columns=["unit_id"])


In [49]:
appartments_train['floor_max'] = appartments_train['floor_max'].fillna(appartments_train['floor_max'].median())
appartments_test['floor_max'] = appartments_test['floor_max'].fillna(appartments_test['floor_max'].median())

appartments_train['dist_sch'] = appartments_train['dist_sch'].fillna(appartments_train['dist_sch'].median())
appartments_test['dist_sch'] = appartments_test['dist_sch'].fillna(appartments_test['dist_sch'].median())

appartments_train['dist_clinic'] = appartments_train['dist_clinic'].fillna(appartments_train['dist_clinic'].median())
appartments_test['dist_clinic'] = appartments_test['dist_clinic'].fillna(appartments_test['dist_clinic'].median())

appartments_train['dist_post'] = appartments_train['dist_post'].fillna(appartments_train['dist_post'].median())
appartments_test['dist_post'] = appartments_test['dist_post'].fillna(appartments_test['dist_post'].median())

appartments_train['dist_kind'] = appartments_train['dist_kind'].fillna(appartments_train['dist_kind'].median())
appartments_test['dist_kind'] = appartments_test['dist_kind'].fillna(appartments_test['dist_kind'].median())

appartments_train['dist_rest'] = appartments_train['dist_rest'].fillna(appartments_train['dist_rest'].median())
appartments_test['dist_rest'] = appartments_test['dist_rest'].fillna(appartments_test['dist_rest'].median())

appartments_train['dist_pharma'] = appartments_train['dist_pharma'].fillna(appartments_train['dist_pharma'].median())
appartments_test['dist_pharma'] = appartments_test['dist_pharma'].fillna(appartments_test['dist_pharma'].median())

In [50]:
lift_mode = appartments_train['has_lift'].mode().iloc[0]

appartments_train['has_lift'] = appartments_train['has_lift'].fillna(lift_mode)
appartments_test['has_lift'] = appartments_test['has_lift'].fillna(lift_mode)

In [51]:
bins = [0, 5, 10, 15, 20, 30, 100]
labels = ['0-5', '6-10', '11-15', '16-20', '21-30', '31+']

appartments_train['floor_max_bin'] = pd.cut(appartments_train['floor_max'], bins=bins, labels=labels)

floor_no_by_bin = appartments_train.groupby('floor_max_bin', observed=True)['floor_no'].median()

appartments_train['floor_no'] = appartments_train.apply(lambda row: floor_no_by_bin[row['floor_max_bin']] if pd.isna(row['floor_no']) else row['floor_no'], axis=1)
appartments_train.drop(columns=['floor_max_bin'], inplace=True)

appartments_test['floor_max_bin'] = pd.cut(appartments_test['floor_max'], bins=bins, labels=labels)
appartments_test['floor_no'] = appartments_test.apply(lambda row: floor_no_by_bin[row['floor_max_bin']] if pd.isna(row['floor_no']) else row['floor_no'], axis=1)
appartments_test.drop(columns=['floor_max_bin'], inplace=True)

In [52]:
year_built_by_loc = appartments_train.groupby('loc_code')['year_built'].median()
year_median = appartments_train['year_built'].median()

appartments_train['year_built'] = appartments_train['year_built'].fillna(appartments_train['loc_code'].map(year_built_by_loc)).fillna(year_median)
appartments_test['year_built'] = appartments_test['year_built'].fillna(appartments_test['loc_code'].map(year_built_by_loc)).fillna(year_median)

dist_uni_by_loc = appartments_train.groupby('loc_code')['dist_uni'].median()
global_dist_uni_median = appartments_train['dist_uni'].median()

appartments_train['dist_uni'] = appartments_train['dist_uni'].fillna(appartments_train['loc_code'].map(dist_uni_by_loc)).fillna(global_dist_uni_median)
appartments_test['dist_uni'] = appartments_test['dist_uni'].fillna(appartments_test['loc_code'].map(dist_uni_by_loc)).fillna(global_dist_uni_median)

infra_by_year = appartments_train.groupby('year_built')['infrastructure_quality'].median()
global_infra_median = appartments_train['infrastructure_quality'].median()

appartments_train['infrastructure_quality'] = appartments_train['infrastructure_quality'].fillna(appartments_train['year_built'].map(infra_by_year)).fillna(global_infra_median)
appartments_test['infrastructure_quality'] = appartments_test['infrastructure_quality'].fillna(appartments_test['year_built'].map(infra_by_year)).fillna(global_infra_median)

## Distribution of nominal variables (changing rare categories to "other")
 we will replace all the levels which have no more than 1565 observations (ca. 1% of the total sample) with the label "other".

In [53]:
nominal_variables = [
    "obj_type",
    "own_type",
    "build_mat",
    "has_park",
    "has_balcony",
    "has_lift",
    "has_sec",
    "has_store",
    "loc_code"
]

rare_threshold = 1565 # < 1% od all observations

value_counts = appartments_train['loc_code'].value_counts()
rare_levels = value_counts[value_counts <= rare_threshold].index

appartments_train['loc_code'] = appartments_train['loc_code'].replace(rare_levels, 'Other')
appartments_test['loc_code'] = appartments_test['loc_code'].replace(rare_levels, 'Other')

In [54]:
appartments_train['price_z_log'] = np.log1p(appartments_train['price_z'])

## Log transforming some numeric variables

In [55]:
numeric_variables = appartments_train.select_dtypes(include=[np.number]).columns

In [56]:
log_features = ["market_volatility", "n_rooms", "dim_m2", "estimated_maintenance_cost", "n_poi","dist_centre", "dist_rest", "dist_clinic", 'infrastructure_quality']

for col in log_features:
    appartments_train[col] = np.log1p(appartments_train[col])
    appartments_test[col] = np.log1p(appartments_test[col])


# Binning selected numeric variables

In [57]:
appartments_train['floor_no_binned'] = pd.cut(appartments_train['floor_no'], bins=[0, 3, 7, 15, 100],labels=["Low", "Mid", "High", "Very High"])
appartments_test['floor_no_binned'] = pd.cut(appartments_test['floor_no'], bins=[0, 3, 7, 15, 100],labels=["Low", "Mid", "High", "Very High"])

appartments_train = appartments_train.drop(columns=["floor_no"])
appartments_test = appartments_test.drop(columns=["floor_no"])

In [58]:
appartments_train['floor_max_binned'] = pd.cut(appartments_train['floor_max'], bins=[0, 4, 10, 100],labels=["Low-rise", "Mid-rise", "High-rise"])
appartments_test['floor_max_binned'] = pd.cut(appartments_test['floor_max'], bins=[0, 4, 10, 100],labels=["Low-rise", "Mid-rise", "High-rise"])


appartments_train = appartments_train.drop(columns=["floor_max"])
appartments_test = appartments_test.drop(columns=["floor_max"])

In [59]:
appartments_train['year_binned'] = pd.cut(appartments_train['year_built'], bins=[1800, 1950, 1980, 2000, 2015, 2024],labels=["<1950", "1950–1980", "1980–2000", "2000–2015", "2015–2024"])
appartments_test['year_binned'] = pd.cut(appartments_test['year_built'], bins=[1800, 1950, 1980, 2000, 2015, 2024],labels=["<1950", "1950–1980", "1980–2000", "2000–2015", "2015–2024"])


appartments_train = appartments_train.drop(columns=["year_built"])
appartments_test = appartments_test.drop(columns=["year_built"])

# Standardizing continuous numerical features​


In [60]:
# Scaling only float features
float_cols = appartments_train.select_dtypes(include='float64').columns
float_cols = [col for col in float_cols if col != 'price_z' and col != 'price_z_log']

scaler = StandardScaler()
appartments_train.loc[:, float_cols] = scaler.fit_transform(appartments_train[float_cols])
appartments_test.loc[:, float_cols] = scaler.transform(appartments_test[float_cols])

# Encoding ordinal variables





In [61]:
appartments_train_encoded = appartments_train.copy()
appartments_test_encoded = appartments_test.copy()


ordinal_categories = [["Low", "Mid", "High", "Very High"], ["Low-rise", "Mid-rise", "High-rise"], ["<1950", "1950–1980", "1980–2000", "2000–2015", "2015–2024"]]

ordinal_variables_to_transform = ["floor_no_binned", "floor_max_binned", "year_binned"]

encoder_ord = OrdinalEncoder(categories=ordinal_categories)

appartments_train_encoded[ordinal_variables_to_transform] = encoder_ord.fit_transform(appartments_train[ordinal_variables_to_transform])

appartments_test_encoded[ordinal_variables_to_transform] = encoder_ord.transform(appartments_test[ordinal_variables_to_transform])

month_mapping = {'2023-08': 1, '2023-09': 2, '2023-10': 3, '2023-11': 4, '2023-12': 5, '2024-01': 6, '2024-02': 7, '2024-03': 8, '2024-04': 9, '2024-05': 10, '2024-06': 11}

appartments_train_encoded['src_month'] = appartments_train['src_month'].map(month_mapping)
appartments_test_encoded['src_month'] = appartments_test['src_month'].map(month_mapping)


#Encoding nominal variables

In [62]:
nominal_variables_left = [var for var in nominal_variables]

In [63]:
appartments_train_encoded = pd.get_dummies(appartments_train_encoded, columns = nominal_variables_left, drop_first = True, dtype = int)
appartments_test_encoded = pd.get_dummies(appartments_test_encoded, columns = nominal_variables_left, drop_first = True, dtype = int)

appartments_test_encoded = appartments_test_encoded.reindex(columns=appartments_train_encoded.columns,fill_value=0) # to ensure that the one-hot encoded test dataset has the exact same columns as the training dataset.

# Feature selection

## Quantitative explanatory variables - correlations

In [64]:
appartments_numeric_columns = appartments_train.select_dtypes(include=[np.number]).columns

appartments_correlations = appartments_train[appartments_numeric_columns].corr(method = 'pearson')

correlation_with_price_z = appartments_correlations['price_z_log'].sort_values(ascending = False)

In [65]:
appartments_selected_numeric_vars = correlation_with_price_z[correlation_with_price_z.abs() >= 0.03].index.tolist()

if 'price_z' in appartments_selected_numeric_vars:
    appartments_selected_numeric_vars.remove('price_z')
if 'price_z_log' in appartments_selected_numeric_vars:
    appartments_selected_numeric_vars.remove('price_z_log')

## Qualitative (categorical) variables - correlations (nominal + ordinal)

In [66]:
appartments_categorical_variables = appartments_train.select_dtypes(include=["category", "object"]).columns
appartments_selected_categorical_vars = appartments_categorical_variables.to_list()
appartments_selected_categorical_vars.remove('has_lift')

In [67]:
encoded_cols = appartments_train_encoded.columns

dummy_cols = []
for cat_var in appartments_selected_categorical_vars:
    matched = [col for col in encoded_cols if col.startswith(f"{cat_var}_")]
    dummy_cols.extend(matched)
appartments_selected_vars = appartments_selected_numeric_vars + dummy_cols

# Linear regression


In [68]:
y_train = appartments_train_encoded['price_z_log']
X_train = sm.add_constant(appartments_train_encoded[appartments_selected_vars])
X_test = sm.add_constant(appartments_test_encoded[appartments_selected_vars])

In [69]:
 appartments_model_full = sm.OLS(y_train, X_train).fit()
 print(appartments_model_full.summary())

                            OLS Regression Results                            
Dep. Variable:            price_z_log   R-squared:                       0.955
Model:                            OLS   Adj. R-squared:                  0.955
Method:                 Least Squares   F-statistic:                 9.546e+04
Date:                Sat, 31 May 2025   Prob (F-statistic):               0.00
Time:                        19:52:23   Log-Likelihood:             1.2875e+05
No. Observations:              156454   AIC:                        -2.574e+05
Df Residuals:                  156418   BIC:                        -2.571e+05
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

## Evaluating models using cross-validation.

In [70]:
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_squared_error
# import numpy as np

# def cross_val_ols(X, y, k=10):
#     """
#     Perform k-fold cross-validation using OLS from statsmodels.
#     Assumes X already includes a constant.

#     Parameters:
#         X (DataFrame): Feature matrix with constant included.
#         y (Series): Target vector.
#         k (int): Number of folds (default: 5)

#     Returns:
#         float: Mean RMSE across all folds
#     """
#     kf = KFold(n_splits=k, shuffle=True, random_state=42)
#     rmse_scores = []

#     for train_idx, val_idx in kf.split(X):
#         X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
#         y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]

#         model = sm.OLS(y_train_cv, X_train_cv).fit()
#         log_preds = model.predict(X_val_cv)

#         # Back-transform both predictions and targets
#         preds = np.exp(log_preds) - 1
#         true_vals = np.exp(y_val_cv) - 1

#         rmse = np.sqrt(mean_squared_error(true_vals, preds))
#         rmse_scores.append(rmse)

#     return np.mean(rmse_scores)

In [71]:
# # Full model CV
# rmse_cv_full = cross_val_ols(X_train, y_train)
# print("CV RMSE (Full):", rmse_cv_full)

CV RMSE (Full): 95975.33495059048


#Predictions

In [72]:
y_pred_log = appartments_model_full.predict(X_test)
y_pred_real = np.expm1(y_pred_log)

predictions_csv = pd.DataFrame({
    'unit_id': unit_ids.values,
    'prediction': y_pred_real
})

csv_path = "/content/ols_predictions_final.csv"
predictions_csv.to_csv(csv_path, index=False)
print("Predictions saved to:", csv_path)

from google.colab import files
files.download(csv_path)


Predictions saved to: /content/ols_predictions_final.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>