In [1]:
import sys
import os

sys.path.append(os.path.abspath(".."))
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Convert dataset with OneHotEncoder


In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


def prepare_features(x_data_frame: pd.DataFrame):
    """
    Fit a scaling + one-hot-encoding pipeline on a feature frame and apply it.

    Numeric columns (any numeric dtype, not only int64/float64) are
    standardised with ``StandardScaler``; ``object`` and ``category`` columns
    are one-hot encoded.  Because the encoder uses ``handle_unknown="ignore"``,
    categories that were not seen while fitting do not raise when the returned
    pipeline is later applied to other frames.  Columns of any other dtype are
    not passed to either transformer and therefore do not appear in the output.

    The target is NOT handled here: pass features only and keep ``y`` separate.

    Parameters
    ----------
    x_data_frame : pd.DataFrame
        Input features.  The frame is copied, never modified.

    Returns
    -------
    X_transformed_df : pd.DataFrame
        Transformed features on the original index.  Numeric columns come
        first under their original names, followed by the one-hot columns.
    pipeline : sklearn.pipeline.Pipeline
        The fitted pipeline; call ``pipeline.transform(...)`` to encode
        validation / test frames the same way.
    """
    X = x_data_frame.copy()

    # 1. Split the columns by dtype.  "number" also catches int32/float32 etc.,
    #    which the previous int64/float64 filter silently dropped.
    numerical_cols = X.select_dtypes(include="number").columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # 2. One transformer per column group.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numerical_cols),
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                categorical_cols,
            ),
        ]
    )

    # 3. Wrapped in a Pipeline so further steps can be appended later.
    pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

    # 4. Fit on the given frame and transform it in one go.
    X_transformed = pipeline.fit_transform(X)

    # 5. Rebuild column names in the order the ColumnTransformer emits them:
    #    numeric block first, then the expanded categorical block.
    new_columns = list(numerical_cols)
    if categorical_cols:
        # The "cat" transformer is only looked up when it actually had columns.
        encoder = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
        new_columns += encoder.get_feature_names_out(categorical_cols).tolist()

    X_transformed_df = pd.DataFrame(X_transformed, columns=new_columns, index=X.index)

    return X_transformed_df, pipeline  # encoded features, fitted pipeline

In [3]:
path_train_data = "data_set/train.csv"
origin_data_df = utils.load_csv_data(path_train_data)

# Report what the loader handed back before anything else relies on it.
# NOTE(review): the second line prints the object's type under the
# "dimension" label; kept unchanged so the recorded cell output still matches.
if not isinstance(origin_data_df, pd.DataFrame):
    print("train_data is not a DataFrame")
else:
    print(f"dimension train data: {origin_data_df.shape}")
    print(f"dimension train data: {type(origin_data_df)}")

Successfully loaded data from data_set/train.csv
dimension train data: (1460, 81)
dimension train data: <class 'pandas.core.frame.DataFrame'>


create train target data


In [None]:
# Target: the SalePrice column of the loaded training frame.
train_target_df = origin_data_df["SalePrice"]

# Features: everything except the identifier and the target, then passed
# through the project's column-cleaning helper.
train_data_df = utils.auto_clean_columns(
    origin_data_df.drop(columns=["Id", "SalePrice"])
)

# Types first, shapes second (same order as before).
for label, frame in (
    ("train_data_df", train_data_df),
    ("train_target_df", train_target_df),
):
    print(f"{label} dataType:{type(frame)}")

for label, frame in (
    ("train_data_df", train_data_df),
    ("train_target_df", train_target_df),
):
    print(f"{label} shape: {frame.shape}")

- X_train = vstupní featury (normalizované, zakódované, připravené)
- y_train = cílová hodnota (v tomto notebooku SalePrice)
- transformer = pipeline pro použití na testovacích datech


## Prepare train


In [None]:
# Fit the preprocessing pipeline on the training features and keep the fitted
# pipeline (`transformer`) so that other frames can be encoded with it.
train_data_OneHotEnc, transformer = prepare_features(train_data_df)
# print(transformer)
print(f"x_train_OneHotEnc shape: {train_data_OneHotEnc.shape}")
print(type(train_data_OneHotEnc))

In [None]:
# Column names of the cleaned feature frame before encoding ...
print(list(train_data_df.columns))
print("-------------")
# ... and of the frame returned by prepare_features.
print(list(train_data_OneHotEnc.columns))

## Prepare test data with transformer


In [None]:
path_test_data = "data_set/test.csv"

test_data = utils.load_csv_data(path_test_data)

# Report what the loader handed back.  The message used to say "train data"
# although this cell loads the test set.
if isinstance(test_data, pd.DataFrame):
    print(f"dimension test data: {test_data.shape}")
else:
    print("test_data is not a DataFrame")
print(type(test_data))

In [None]:
# Drop the `Id` column (as was done for the training features) and run the
# project's column-cleaning helper on the result.
# NOTE(review): this cell re-binds `test_data`, so running it a second time
# without reloading raises KeyError because "Id" is already gone.
test_data = test_data.drop(columns=["Id"])
print(test_data.shape)
test_data = utils.auto_clean_columns(test_data)
# test_data = test_data.replace("NA", np.nan)
print(type(test_data))

In [None]:
# Encode the test features with the already-fitted pipeline (transform only,
# no re-fit).
# NOTE(review): `transformer` is re-bound further down when the
# train/validation split is prepared; x_target keeps the encoding of whichever
# pipeline was bound when this cell ran.
x_target = transformer.transform(test_data)
print(type(x_target))

prepare test targets


In [None]:
path_target_data = "data_set/sample_submission.csv"

target_data = utils.load_csv_data(path_target_data)

# Report what the loader handed back.  Both messages used to name the wrong
# frame ("train data" / "test_data") although this cell loads target_data.
if isinstance(target_data, pd.DataFrame):
    print(f"dimension target data: {target_data.shape}")
else:
    print("target_data is not a DataFrame")

In [None]:
# Keep everything from the sample-submission frame except the `Id` column.
# The result is still a DataFrame, not a 1-D array.
y_target = target_data.drop(columns=["Id"])

In [None]:
# Shapes and container types of the encoded test features and their targets;
# the row counts of the two should agree.
print(x_target.shape)
print(y_target.shape)
print(type(x_target))
print(type(y_target))

In [None]:
# Disabled snippet: rebuilds x_target as a DataFrame with named columns.
# # With the column names taken from the fitted transformation
# num_cols = transformer.named_steps["preprocessor"].transformers_[0][2]
# cat_encoder = transformer.named_steps["preprocessor"].named_transformers_["cat"]
# cat_cols = cat_encoder.get_feature_names_out(
#     transformer.named_steps["preprocessor"].transformers_[1][2]
# )

# all_cols = list(num_cols) + list(cat_cols)

# # Conversion to a DataFrame
# import pandas as pd

# x_target = pd.DataFrame(x_target, columns=all_cols, index=test_data.index)
# print(type(y_target))

# Create and Train model


## K Fold validation


In [None]:
import keras
import time

# Seed Python, NumPy and the Keras backend so that repeated runs are comparable.
keras.utils.set_random_seed(42)

start = time.time()


def build_model(input_parameters):
    """
    Build and compile the dense regression network used for cross-validation.

    Parameters
    ----------
    input_parameters : int
        Number of input features (columns of the encoded feature matrix).

    Returns
    -------
    keras.Model
        Compiled model: Dense(838, relu) -> BatchNormalization -> Dropout(0.3)
        -> Dense(1600, relu) -> Dense(1); Adam optimiser, MSE loss, MAE metric.
    """
    model = keras.models.Sequential()
    # An explicit Input layer replaces the `input_shape=` layer argument,
    # which Keras 3 warns about.
    model.add(keras.Input(shape=(input_parameters,)))
    model.add(keras.layers.Dense(838, activation="relu"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dropout(0.3))
    model.add(keras.layers.Dense(1600, activation="relu"))
    model.add(keras.layers.Dense(1))  # single linear unit: regression output
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model


# At least 2 folds are required: with k = 1 the only fold is the validation
# set and no rows are left to train on.
k = 4
num_val_samples = len(train_data_df) // k
num_epochs = 10
all_scores = []
all_histories = []

# This cell used to read x_train / y_train, which are only created further
# down, so it failed on a fresh "Restart & Run All".  It now works from the
# cleaned feature frame and target column prepared above.
# NOTE(review): targets are raw sale prices here, whereas the final model below
# is trained on log1p(SalePrice); consider aligning the two.
cv_targets = np.asarray(train_target_df, dtype="float64")
assert len(cv_targets) == len(train_data_df), "features and targets are misaligned"

for i in range(k):
    print(f"processing fold # {i}")

    # Rows [i * n, (i + 1) * n) validate; every other row trains.
    in_val = np.zeros(len(train_data_df), dtype=bool)
    in_val[i * num_val_samples : (i + 1) * num_val_samples] = True

    # Fit the preprocessing on the training part of the fold only, so scaling
    # statistics and category lists never see the validation rows.
    partial_train_frame, fold_transformer = prepare_features(
        train_data_df.loc[~in_val]
    )
    partial_train_data = partial_train_frame.to_numpy()
    fold_val_data = fold_transformer.transform(train_data_df.loc[in_val])
    partial_train_targets = cv_targets[~in_val]
    fold_val_targets = cv_targets[in_val]

    # The encoded width can differ between folds, so size the model per fold.
    model = build_model(partial_train_data.shape[1])
    print("- 🧠 Spouštím trénování...")
    history = model.fit(
        partial_train_data,
        partial_train_targets,
        validation_data=(fold_val_data, fold_val_targets),
        epochs=num_epochs,
        batch_size=64,
        verbose=0,
    )

    val_mse, val_mae = model.evaluate(fold_val_data, fold_val_targets, verbose=0)
    print(f"- ✅ Fold {i + 1} finished: val_mae = {val_mae:.2f}")
    _ = utils.plot_history(history.history, metric="mae")
    all_scores.append(val_mae)
    all_histories.append(history)

print(all_scores)
print(f"🕒 Celkový čas trénování: {time.time() - start:.2f} s")

## Train on whole data


In [None]:
# Clean the full labelled frame once, then hold out 20 % of the rows for
# validation (fixed random_state, so the split is repeatable).
origin_data_df_clean = utils.auto_clean_columns(origin_data_df)
train_df, val_df = train_test_split(
    origin_data_df_clean, test_size=0.2, random_state=42
)

# Targets, both raw and as log(1 + y).
y_train, y_val = (
    utils.to_numpy(part["SalePrice"]) for part in (train_df, val_df)
)
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

# Features: everything except the identifier and the target.
non_feature_cols = ["Id", "SalePrice"]
train_data = train_df.drop(columns=non_feature_cols)
val_data = val_df.drop(columns=non_feature_cols)

# Fit the preprocessing on the training rows only; the validation rows are
# merely transformed with the fitted pipeline.
x_train_frame, transformer = prepare_features(train_data)
x_train = utils.to_numpy(x_train_frame)
x_val = utils.to_numpy(transformer.transform(val_data))

for name, var in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_val", x_val),
    ("y_val", y_val),
):
    print(f"{name}:")
    print(f"  type: {type(var)}")
    print(f"  shape: {var.shape}")

# NaN counts for validation, then training features; both must be 0.
for encoded in (x_val, x_train):
    print(np.isnan(encoded).sum())

In [None]:
import keras
import time

from sklearn.metrics import mean_absolute_error

# Seed Python, NumPy and the Keras backend so that repeated runs are comparable.
keras.utils.set_random_seed(42)

start = time.time()


def build_model(input_parameters):
    """
    Build and compile the compact dense regression network.

    Parameters
    ----------
    input_parameters : int
        Number of input features (columns of the encoded feature matrix).

    Returns
    -------
    keras.Model
        Compiled model: Dense(128, relu) -> BatchNormalization ->
        Dense(64, relu) -> Dense(1); Adam optimiser, MSE loss, MAE metric.
    """
    model = keras.models.Sequential()
    # An explicit Input layer replaces the `input_shape=` layer argument,
    # which Keras 3 warns about.
    model.add(keras.Input(shape=(input_parameters,)))
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.BatchNormalization())
    # model.add(keras.layers.Dropout(0.3))
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dense(1))  # single linear unit: regression output
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model


num_epochs = 10

# The model is trained on log1p(SalePrice), so loss and MAE reported by Keras
# are in log space.
model = build_model(x_train.shape[1])
history = model.fit(
    x_train,
    y_train_log,
    epochs=num_epochs,
    batch_size=64,
    validation_data=(x_val, y_val_log),
    verbose=0,
)


val_mse, val_mae = model.evaluate(x_val, y_val_log, verbose=0)
print(f"- ✅ finished: val_mae = {val_mae:.2f}")
_ = utils.plot_history(history.history, metric="mae")
# _ = utils.plot_history(history.history, metric="loss")

print(val_mae)
# NOTE(review): expm1 of a log-space MAE is not an error in USD; it reads
# roughly as a typical relative error.  The USD figure is computed below.
print(np.expm1(val_mae))
print(f"🕒 Celkový čas trénování: {time.time() - start:.2f} s")

# Predict in log space, then map back to prices.  ravel() turns the (n, 1)
# network output into a 1-D vector that lines up with y_val.
y_val_pred_log = model.predict(x_val)
y_val_pred = np.expm1(y_val_pred_log).ravel()
y_val_true = y_val

# MAE in the original units
mae_usd = mean_absolute_error(y_val_true, y_val_pred)
print(f"🎯 Skutečný MAE v USD: {mae_usd:,.0f} USD")

In [None]:
# Persist the trained model to a `.keras` file in the working directory.
# NOTE(review): "oneSot" in the file name looks like a typo for "oneHot"; left
# unchanged because other code may load the file by this exact name.
model.save("house_prices_oneSotEncoder.keras")

Test on train data


In [None]:
# Training vs. validation loss of the last five epochs, to eyeball convergence
# and over-fitting.
print(history.history["loss"][-5:])
print(history.history["val_loss"][-5:])

In [None]:
def predict_debug(index: int):
    """
    Print the true and the predicted price for one row of the training split.

    Reads the notebook globals ``train_data`` (un-encoded training features),
    ``y_train`` (raw prices), ``transformer`` (fitted pipeline) and ``model``.

    Parameters
    ----------
    index : int
        Positional index of the row in ``train_data`` / ``y_train``.

    Returns
    -------
    None
        The comparison is printed, not returned.
    """
    # 1. Original, un-encoded inputs; double brackets keep a one-row DataFrame.
    raw_input = train_data.iloc[[index]]
    true_price = y_train[index]

    # 2. Encode the row with the fitted pipeline (StandardScaler + OneHotEncoder).
    x_input = transformer.transform(raw_input)

    # 3. The model predicts log1p(price); expm1 maps it back to a price.
    predicted_log_price = model.predict(x_input, verbose=0)[0][0]
    predicted_price = np.expm1(predicted_log_price)

    # 4. Report.
    print(f"🔍 Index: {index}")
    print(f"🎯 Skutečná cena: {true_price:,.0f} USD")
    print(f"🤖 Predikovaná cena: {predicted_price:,.0f} USD")
    print(f"📉 Rozdíl: {true_price - predicted_price:,.0f} USD")

    # Optional: dump the input features
    # print("\n🧾 Vstupní featury:")
    # display(raw_input.T)  # when running inside a notebook


# predict_debug prints its own report and returns None, so it is called
# directly; wrapping it in print() added a stray "None" after every row.
for i in [0, 20, 40, 60, 70, 80]:
    predict_debug(i)

In [None]:
import matplotlib.pyplot as plt

print(f"Min: {y_train.min():,.0f} USD")
print(f"Max: {y_train.max():,.0f} USD")
print(f"Mean: {y_train.mean():,.0f} USD")

count = 10  # or x_train.shape[0] to cover every training row

# One batched call for the first `count` rows.  The model predicts
# log1p(price), so expm1 brings the values back to USD before comparing them
# with the raw targets.  (The old loop passed a 1-D row to model.predict and
# compared log-space predictions against USD prices.)
preds = np.expm1(model.predict(x_train[:count], verbose=0)).ravel()
trues = np.asarray(y_train[:count]).ravel()

# Both extremes start at 0: max_diff only tracks under-predictions,
# min_diff only over-predictions.
max_diff = 0
min_diff = 0

for true_value, prediction in zip(trues, preds):
    diff = int(true_value - prediction)
    pred_int = int(prediction)
    true_int = int(true_value)

    max_diff = max(max_diff, diff)
    min_diff = min(min_diff, diff)

    print(
        f"diff = {format(diff, ',')} USD | "
        f"prediction = {format(pred_int, ',')} USD | "
        f"target = {format(true_int, ',')} USD"
    )

print(
    f"\nmin_diff = {format(min_diff, ',')} USD |"
    f"\nmax_diff = {format(max_diff, ',')} USD |"
)

# Plot true vs. predicted prices for the same rows.
plt.figure(figsize=(10, 6))
plt.plot(trues, marker="o", label="Skutečné ceny")
plt.plot(preds, marker="o", label="Predikované ceny")
plt.title("Porovnání predikce vs. skutečnost (trénovací data)")
plt.xlabel("Index")
plt.ylabel("Cena (v USD)")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Peek at the first five rows of the encoded training matrix.
print(x_train[:5])

In [None]:
print(np.std(x_train, axis=0))  # standard deviation per column

In [None]:
# Per-column minimum, maximum, mean and standard deviation of the encoded
# training matrix (axis=0 reduces over rows).
print(np.min(x_train, axis=0))
print(np.max(x_train, axis=0))
print(np.mean(x_train, axis=0))
print(np.std(x_train, axis=0))

# Run model on target data


In [None]:
# x_train / y_train are produced by the train/validation split cell.  This cell
# used to read `x_train_df` / `y_train_df`, which are not defined anywhere in
# the notebook, so it raised NameError on a fresh "Restart & Run All".  It now
# re-checks the existing objects instead.
# NOTE(review): assumes utils.to_numpy returns array input as an array (it is
# already applied to the output of transformer.transform earlier) — confirm.
print(type(x_train))
print(type(y_train))
print("--- convert ---")
x_train = utils.to_numpy(x_train)
y_train = utils.to_numpy(y_train)
print(type(x_train))
print(type(y_train))
print(x_train.shape)
print(y_train.shape)

In [None]:
import matplotlib.pyplot as plt

# Encode the test features with the pipeline that is bound to `transformer`
# now, i.e. the one the model was trained with.  The earlier `x_target` was
# produced by a pipeline fitted on a different set of rows, so its scaling
# (and possibly its column count) does not match the model's inputs.
x_eval = np.asarray(transformer.transform(test_data))

# y_target is a DataFrame; flatten it to a 1-D float vector so that integer
# indexing and number formatting work.
# NOTE(review): assumes exactly one value column is left after dropping "Id".
# NOTE(review): the values come from sample_submission.csv — confirm they are
# meant to serve as reference prices.
y_eval = np.asarray(y_target, dtype="float64").ravel()
assert len(y_eval) == len(x_eval), "features and targets are misaligned"

print(f"Min: {y_eval.min():,.0f} USD")
print(f"Max: {y_eval.max():,.0f} USD")
print(f"Mean: {y_eval.mean():,.0f} USD")

count = 10  # or x_eval.shape[0] to cover every row

# One batched call; the model predicts log1p(price), expm1 maps back to USD.
preds = np.expm1(model.predict(x_eval[:count], verbose=0)).ravel()
trues = y_eval[:count]

# Both extremes start at 0: max_diff only tracks under-predictions,
# min_diff only over-predictions.
max_diff = 0
min_diff = 0

for target_value, prediction in zip(trues, preds):
    diff = int(target_value - prediction)
    pred_int = int(prediction)
    target_int = int(target_value)

    max_diff = max(max_diff, diff)
    min_diff = min(min_diff, diff)

    print(
        f"diff = {format(diff, ',')} USD | "
        f"prediction = {format(pred_int, ',')} USD | "
        f"target = {format(target_int, ',')} USD"
    )

print(
    f"\nmin_diff = {format(min_diff, ',')} USD |"
    f"\nmax_diff = {format(max_diff, ',')} USD |"
)

# Plot reference vs. predicted prices for the same rows.
plt.figure(figsize=(10, 6))
plt.plot(trues, marker="o", label="Skutečné ceny")
plt.plot(preds, marker="o", label="Predikované ceny")
plt.title("Porovnání predikce vs. skutečnost")
plt.xlabel("Index")
plt.ylabel("Cena (v USD)")
plt.legend()
plt.grid(True)
plt.show()

# other


In [None]:
# Per-fold validation MAE collected by the cross-validation loop.
print("\n" + "=" * 50)
print("📊 Výsledky cross-validace:")
for fold_number, score in enumerate(all_scores, start=1):
    print(f"Fold {fold_number}: MAE = {score:.2f}")

average = np.mean(all_scores)
print("-" * 50)
print(f"📈 Průměrná MAE přes {k} foldů: {average:.2f}")

# For every epoch, gather the training MAE of each fold and reduce it to a
# mean and a standard deviation across folds.
average_mae_history = []
std_mae_history = []
for epoch in range(num_epochs):
    fold_values = [
        fold_history.history["mae"][epoch] for fold_history in all_histories
    ]
    average_mae_history.append(np.mean(fold_values))
    std_mae_history.append(np.std(fold_values))

epoch_axis = range(1, num_epochs + 1)
mean_curve = np.array(average_mae_history)
std_curve = np.array(std_mae_history)

# Mean curve with a +/- 1 standard deviation band.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epoch_axis, average_mae_history, marker="o", label="Průměrná MAE")
ax.fill_between(
    epoch_axis,
    mean_curve - std_curve,
    mean_curve + std_curve,
    alpha=0.2,
    label="± 1 std",
)
ax.set_xlabel("Epoch")
ax.set_ylabel("Mean Absolute Error (MAE)")
ax.set_title("📈 Průměrná MAE s rozptylem (všechny foldy)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Encode the first ten test rows with the pipeline bound to `transformer` now
# (the one the model was trained with).  `x_target` is a NumPy array, so the
# previous `.iloc` access failed, and it was encoded by an earlier pipeline.
sample_features = np.asarray(transformer.transform(test_data.iloc[:10]))

# y_target is a DataFrame; flatten it to a 1-D float vector.
# NOTE(review): assumes exactly one value column is left after dropping "Id".
sample_targets = np.asarray(y_target, dtype="float64").ravel()[:10]

# The model predicts log1p(price); expm1 maps the output back to USD.
sample_preds = np.expm1(model.predict(sample_features, verbose=0)).ravel()

for target_value, predicted_value in zip(sample_targets, sample_preds):
    diff = int(target_value - predicted_value)
    pred = int(predicted_value)
    target = int(target_value)

    # Thousands are separated by spaces instead of commas.
    print(
        f"diff = {format(diff, ',').replace(',', ' ')} USD | "
        f"prediction = {format(pred, ',').replace(',', ' ')} USD | "
        f"target = {format(target, ',').replace(',', ' ')} USD"
    )