<a href="https://colab.research.google.com/github/viktorjovev/conformal_prediction_regression_modeling/blob/main/conf_pred_real_estate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
# Fetch the Kaggle dataset through kagglehub and keep the returned location;
# a later cell walks this location with os.walk to list the files in it.
quantbruce_real_estate_price_prediction_path = kagglehub.dataset_download(
    'quantbruce/real-estate-price-prediction'
)

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd


import os
# Print the full path of every file found under the downloaded dataset folder.
dataset_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk(quantbruce_real_estate_price_prediction_path)
    for file_name in file_names
)
for file_path in dataset_file_paths:
    print(file_path)


In [None]:
!pip install mapie

In [None]:
def fast_mwis_score(y_true, lower, upper, alpha):
    """Score prediction intervals with the mean Winkler interval score (MWIS).

    For every sample the score is the interval width plus a penalty of
    ``2 / alpha`` times the distance by which the true value falls outside
    the interval; the per-sample scores are then averaged.

    Parameters
    ----------
    y_true : pandas Series or 1D array-like
        Observed target values.
    lower, upper : pandas Series or 1D array-like
        Lower and upper interval bounds, one per observation, in the same
        order as ``y_true``.
    alpha : float
        Miscoverage level used to scale the penalty term.

    Returns
    -------
    tuple
        ``(mwis, below, coverage, above)`` where ``below`` / ``above`` are the
        fractions of observations under the lower / over the upper bound and
        ``coverage`` is the fraction inside the closed interval.

    Raises
    ------
    AssertionError
        If an input is not one-dimensional, the shapes differ, ``alpha`` is
        not a float, or any lower bound exceeds its upper bound.
    """
    # Work on plain NumPy arrays so that every operation is positional.
    # pandas Series carrying different indexes would otherwise be aligned by
    # label, which yields NaNs or a "identically-labeled" comparison error.
    y_true = np.asarray(y_true)
    lower = np.asarray(lower)
    upper = np.asarray(upper)

    assert y_true.ndim == 1, "y_true: pandas Series or 1D array expected"
    assert lower.ndim == 1, "lower: pandas Series or 1D array expected"
    assert upper.ndim == 1, "upper: pandas Series or 1D array expected"
    assert y_true.shape == lower.shape == upper.shape, (
        "y_true, lower and upper must have the same length",
        y_true.shape, lower.shape, upper.shape)
    assert isinstance(alpha, float), "alpha: float expected"
    assert (lower <= upper).all(), ("lower must be <= upper",
                                    lower[lower > upper],
                                    upper[lower > upper])

    is_above = y_true > upper
    is_below = y_true < lower

    total_interval_width = upper.sum() - lower.sum()
    # Only observations outside the interval contribute to the penalty.
    error_above = (y_true - upper)[is_above].sum()
    error_below = (lower - y_true)[is_below].sum()
    total_error = error_above + error_below
    mwis = (total_interval_width + total_error * 2 / alpha) / len(y_true)

    below = is_below.mean()
    above = is_above.mean()
    coverage = ((lower <= y_true) & (y_true <= upper)).mean()
    return mwis, below, coverage, above

In [None]:
# Build the CSV location from the path kagglehub returned instead of a
# hard-coded /root/.cache/... string, so the cell keeps working for other
# users, other cache locations and newer dataset versions.
real_estate_csv = os.path.join(quantbruce_real_estate_price_prediction_path, 'Real estate.csv')
df = pd.read_csv(real_estate_csv, index_col="No")
df.head()

In [None]:
# Keep only rows whose unit-area price is strictly below 115
# (presumably an outlier cut-off -- confirm the threshold with the author).
# `.copy()` makes the result an independent frame, so adding columns to `df`
# in later cells does not raise pandas' SettingWithCopyWarning.
df = df[df['Y house price of unit area'] < 115].copy()

In [None]:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG (kept from the original cell; nothing below draws
# random numbers, but the global state is still set for later cells).
np.random.seed(42)

# Histogram of the target variable over 30 equally spaced bin edges.
x = df['Y house price of unit area'].dropna()
bins = np.linspace(x.min(), x.max(), 30)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(x, bins=bins, alpha=0.5, color='blue')
ax.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray')

ax.set_xlabel('Цена на куќа по единица површина', fontsize=12)
ax.set_ylabel('Број', fontsize=12)
# No ax.legend() here: the histogram carries no label, so the call only
# produced a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Scatter plots of the target against three features: two panels on the top
# row and one panel spanning the whole bottom row.
fig = plt.figure(figsize=(16, 12))
gs = gridspec.GridSpec(2, 2, figure=fig)

target_column = 'Y house price of unit area'

# (grid cell, feature column, x-axis label, colour, y-axis label or None)
scatter_panels = [
    (gs[0, 0], 'X2 house age', 'X2 - Старост на куќа', 'blue',
     'Y - Цена на куќа по единица површина'),
    (gs[0, 1], 'X3 distance to the nearest MRT station',
     'X3 - Најмало растојание до транспортен систем', 'green', None),
    (gs[1, :], 'X4 number of convenience stores',
     'X4 - Број на погодни продавници', 'red', None),
]

for grid_cell, feature_column, x_label, color, y_label in scatter_panels:
    ax = fig.add_subplot(grid_cell)
    ax.scatter(df[feature_column], df[target_column], color=color, alpha=0.7)
    ax.set_xlabel(x_label, fontsize=16)
    if y_label is not None:
        # Only the first panel carries the y-axis label, as in the original layout.
        ax.set_ylabel(y_label, fontsize=16)
    ax.tick_params(axis='both', labelsize=14)
    # No ax.legend(): the scatter artists have no labels, so the call only
    # emitted a "no artists with labels" warning.
    ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
# Split the target's value range into four equal-width bins and store the
# bin name of every row in the "класа" column (used as hue here and below).
labels = ["ниска", "средна", "висока", "луксузна"]
df["класа"] = pd.cut(df["Y house price of unit area"], bins=4, labels=labels)

# Plot the two coordinate columns against each other, coloured by price class.
class_map_ax = sns.scatterplot(data=df, x="X5 latitude", y="X6 longitude", hue="класа", s=100)
class_map_ax.set_xlabel("X5 - географска ширина")
class_map_ax.set_ylabel("X6 - географска должина")
class_map_ax.grid()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

# Line plots of the target against three features, one line per price class:
# two panels on the top row and one panel spanning the whole bottom row.
fig = plt.figure(figsize=(16, 12))
gs = gridspec.GridSpec(2, 2, figure=fig)

# (grid cell, x column, x-axis label, y-axis label). An empty y-axis label
# blanks the label seaborn would otherwise derive from the column name.
line_panels = [
    (gs[0, 0], "X3 distance to the nearest MRT station",
     "X3 - Најмало растојание до транспортен систем",
     "Y - Цена на куќа по единица површина"),
    (gs[0, 1], "X4 number of convenience stores",
     "X4 - број на погодни продавници", ""),
    (gs[1, :], "X1 transaction date",
     "X1 - Датум на трансакција", ""),
]

for grid_cell, x_column, x_label, y_label in line_panels:
    ax = fig.add_subplot(grid_cell)
    sns.lineplot(x=x_column, y="Y house price of unit area", data=df, hue="класа", ax=ax)
    ax.set_xlabel(x_label, fontsize=16)
    ax.set_ylabel(y_label, fontsize=16)
    ax.tick_params(axis='both', labelsize=14)
    # The hue grouping supplies labelled artists, so the legend is meaningful here.
    ax.legend(fontsize=14)
    ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from mapie.regression import MapieRegressor

from sklearn.model_selection import KFold

# Shuffled 5-fold splitter that is handed to MAPIE as its `cv` argument.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

# Model inputs: every X column except the transaction date; target: unit-area price.
feature_columns = [
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
X = df[feature_columns]
y = df['Y house price of unit area']

# 70 % / 30 % train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Plain KNN regressor fitted on the training part; it is used on its own for
# point-prediction metrics and also passed to MAPIE as the base estimator.
knn = KNeighborsRegressor(n_neighbors=6, p=1, metric='manhattan')
knn.fit(X_train, y_train)

# NOTE(review): method="naive" is understood to calibrate the intervals on
# training-set residuals -- confirm this is the intended MAPIE strategy.
mapie = MapieRegressor(estimator=knn, cv=skf, method="naive")
mapie.fit(X_train, y_train)


In [None]:
# Point-prediction quality of the plain KNN model on the training rows.
y_pred = knn.predict(X_train)

mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
r2 = r2_score(y_true=y_train, y_pred=y_pred)

print(
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)

In [None]:
# Point predictions and prediction intervals for the training rows at alpha=0.1.
y_pred, y_pis = mapie.predict(X_train, alpha=0.1)

# Show the first 5 % of the rows; in each row's interval entry, index 0 holds
# the value printed as the lower bound and index 1 the upper bound.
preview_count = int(len(y_pred) * 0.05)
for point, bounds in zip(y_pred[:preview_count], y_pis[:preview_count]):
    lower_value = bounds[0][0]
    upper_value = bounds[1][0]
    print(f"Prediction: {point:.2f}, Lower Bound: {lower_value:.2f}, Upper Bound: {upper_value:.2f}")

In [None]:
# Flatten the interval bounds into 1D arrays with a vectorised slice instead of
# a per-row Python loop plus np.concatenate (the former `array[:]` was a no-op).
# Assumes y_pis is a NumPy array indexed [sample, bound, alpha], as returned by
# MapieRegressor.predict -- TODO confirm if the estimator is swapped.
lower_bounds = y_pis[:, 0, :].ravel()
upper_bounds = y_pis[:, 1, :].ravel()

# Interval score, share below, coverage and share above on the training rows.
fast_mwis_score(y_train, lower_bounds, upper_bounds, 0.1)

In [None]:
import matplotlib.pyplot as plt

# True vs. predicted values on the training rows. A single axes is used:
# the former `plt.subplot(1, 2, 1)` on a 12x6 figure left the right half empty.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_train, y_pred, color="blue", alpha=0.6)

# Dashed identity line: points on it are predicted exactly.
value_min, value_max = y_train.min(), y_train.max()
ax.plot([value_min, value_max], [value_min, value_max], color="red", linestyle="--")

# Label typo fixed: "редности" -> "вредности".
ax.set_xlabel("Вистинските вредности")
ax.set_ylabel("Предвидените вредности")

plt.tight_layout()
plt.show()


In [None]:
# Point-prediction quality of the plain KNN model on the held-out test rows.
y_pred = knn.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)

In [None]:
# Point predictions and prediction intervals for the test rows at alpha=0.1.
y_pred, y_pis = mapie.predict(X_test, alpha=0.1)

# Show the first 10 % of the rows; in each row's interval entry, index 0 holds
# the value printed as the lower bound and index 1 the upper bound.
preview_count = int(len(y_pred) * 0.1)
for point, bounds in zip(y_pred[:preview_count], y_pis[:preview_count]):
    lower_value = bounds[0][0]
    upper_value = bounds[1][0]
    print(f"Prediction: {point:.2f}, Lower Bound: {lower_value:.2f}, Upper Bound: {upper_value:.2f}")

In [None]:
# Flatten the interval bounds into 1D arrays with a vectorised slice instead of
# a per-row Python loop plus np.concatenate (the former `array[:]` was a no-op).
# Assumes y_pis is a NumPy array indexed [sample, bound, alpha], as returned by
# MapieRegressor.predict -- TODO confirm if the estimator is swapped.
lower_bounds = y_pis[:, 0, :].ravel()
upper_bounds = y_pis[:, 1, :].ravel()

# Interval score, share below, coverage and share above on the test rows.
fast_mwis_score(y_test, lower_bounds, upper_bounds, 0.1)

In [None]:
import matplotlib.pyplot as plt

# True vs. predicted values on the test rows. A single axes is used:
# the former `plt.subplot(1, 2, 1)` on a 12x6 figure left the right half empty.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, color="blue", alpha=0.6)

# Dashed identity line: points on it are predicted exactly.
value_min, value_max = y_test.min(), y_test.max()
ax.plot([value_min, value_max], [value_min, value_max], color="red", linestyle="--")

# Label typo fixed: "редности" -> "вредности".
ax.set_xlabel("Вистинските вредности")
ax.set_ylabel("Предвидените вредности")

plt.tight_layout()
plt.show()