In [1]:
import utils_io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
path_train_data = "data_set/train.csv"

train_data = utils_io.load_csv_data(path_train_data)
# Debugging: Check the type of train_data

# Print the loaded data
if isinstance(train_data, pd.DataFrame):
    print(f"dimension train data: {train_data.shape}")
else:
    print("train_data is not a DataFrame")

Successfully loaded data from data_set/train.csv
dimension train data: (1460, 81)


In [3]:
printdata = train_data.iloc[:10]
print(train_data.shape)
print(printdata)

(1460, 81)
   Id  MSSubClass MSZoning LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL          65     8450   Pave    NA      Reg   
1   2          20       RL          80     9600   Pave    NA      Reg   
2   3          60       RL          68    11250   Pave    NA      IR1   
3   4          70       RL          60     9550   Pave    NA      IR1   
4   5          60       RL          84    14260   Pave    NA      IR1   
5   6          50       RL          85    14115   Pave    NA      IR1   
6   7          20       RL          75    10084   Pave    NA      Reg   
7   8          60       RL          NA    10382   Pave    NA      IR1   
8   9          50       RM          51     6120   Pave    NA      Reg   
9  10         190       RL          50     7420   Pave    NA      Reg   

  LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0     NA     NA          NA       0   
1         Lvl    AllPub  ...       

In [4]:
# Get target data
target_data = train_data["SalePrice"]
print(target_data.shape)


# Drop the Id and sales price column
train_data = train_data.drop(columns=["Id"])
# train_data = train_data.drop(columns=["SalePrice"])
print(train_data.shape)

(1460,)
(1460, 80)


# Normalization


Use OneHotEncoder


In [5]:
import pandas as pd
import numpy as np


def auto_clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    df_cleaned = df.copy()

    for column in df_cleaned.columns:
        values = df_cleaned[column].dropna().unique()

        can_be_numeric = True
        for v in values:
            if isinstance(v, str) and v.strip().upper() == "NA":
                continue
            try:
                float(v)
            except:
                can_be_numeric = False
                break

        if can_be_numeric:
            # Převést "NA" na np.nan a sloupec na float
            df_cleaned[column] = df_cleaned[column].replace("NA", np.nan)
            df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors="coerce")
        else:
            # ponecháme jako kategorii se stringy
            df_cleaned[column] = df_cleaned[column].astype(str).str.strip().str.upper()

    return df_cleaned


train_data = auto_clean_columns(train_data)
print(train_data)

      MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0             60       RL         65.0     8450   PAVE    NA      REG   
1             20       RL         80.0     9600   PAVE    NA      REG   
2             60       RL         68.0    11250   PAVE    NA      IR1   
3             70       RL         60.0     9550   PAVE    NA      IR1   
4             60       RL         84.0    14260   PAVE    NA      IR1   
...          ...      ...          ...      ...    ...   ...      ...   
1455          60       RL         62.0     7917   PAVE    NA      REG   
1456          20       RL         85.0    13175   PAVE    NA      REG   
1457          70       RL         66.0     9042   PAVE    NA      REG   
1458          20       RL         68.0     9717   PAVE    NA      REG   
1459          20       RL         75.0     9937   PAVE    NA      REG   

     LandContour Utilities LotConfig  ... PoolArea PoolQC  Fence MiscFeature  \
0            LVL    ALLPUB    INSIDE  ...  

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd


def prepare_features(df: pd.DataFrame, target_column: str):
    """
    X_train = vstupní featury (normalizované, zakódované, připravené)
    y_train = cílová hodnota (ve tvém případě MSSubClass)
    transformer = pipeline pro použití na testovacích datech
    """
    df = df.copy()

    # 1. Rozdělení cílové proměnné a vstupů
    y = df[target_column]
    X = df.drop(columns=[target_column])

    # 2. Rozdělení sloupců
    numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

    # 3. Tvorba transformací
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numerical_cols),
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                categorical_cols,
            ),
        ]
    )

    # 4. Pipeline (můžeš použít i s modely jako XGBoost, NN atd.)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

    # 5. Fit + transform vstupní data
    X_transformed = pipeline.fit_transform(X)

    # 6. Získání názvů nových sloupců (volitelné)
    new_columns = []

    if numerical_cols:
        new_columns += numerical_cols

    if categorical_cols:
        encoder = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
        new_columns += encoder.get_feature_names_out(categorical_cols).tolist()

    X_transformed_df = pd.DataFrame(X_transformed, columns=new_columns, index=X.index)

    return X_transformed_df, y, pipeline  # X, y, transformer pro testovací data

- X_train = vstupní featury (normalizované, zakódované, připravené)
- y_train = cílová hodnota (ve tvém případě MSSubClass)
- transformer = pipeline pro použití na testovacích datech


In [7]:
X_train, y_train, transformer = prepare_features(train_data, "SalePrice")
print(transformer)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'Be...
                                              

### prepare test data with transformer from test data


In [8]:
path_test_data = "data_set/test.csv"

test_data = utils_io.load_csv_data(path_test_data)
# Debugging: Check the type of test_data

# Print the loaded data
if isinstance(test_data, pd.DataFrame):
    print(f"dimension train data: {test_data.shape}")
else:
    print("test_data is not a DataFrame")

Successfully loaded data from data_set/test.csv
dimension train data: (1459, 80)


In [9]:
test_data = test_data.drop(columns=["Id"])
print(test_data.shape)

(1459, 79)


In [10]:
test_data = auto_clean_columns(test_data)

(1459, 79)


In [11]:
# Konverze zpět z pandas "string" typu na obyčejný Python string (pro OneHotEncoder)
for col in test_data.select_dtypes(include=["string"]).columns:
    test_data[col] = test_data[col].astype("object")

# Potom transformace
X_test = transformer.transform(test_data)

# OLD


replace NA as np.nan in numbers / replace NA witrh "NA" oin string


In [None]:
import pandas as pd
import numpy as np


def auto_clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    df_cleaned = df.copy()

    for column in df_cleaned.columns:
        values = df_cleaned[column].dropna().unique()

        can_be_numeric = True
        for v in values:
            if isinstance(v, str) and v.strip().upper() == "NA":
                continue
            try:
                float(v)
            except:
                can_be_numeric = False
                break

        if can_be_numeric:
            # Převést "NA" na np.nan a sloupec na float
            df_cleaned[column] = df_cleaned[column].replace("NA", np.nan)
            df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors="coerce")
        else:
            # ponecháme jako kategorii se stringy
            df_cleaned[column] = df_cleaned[column].astype(str).str.strip().str.upper()

    return df_cleaned


train_data = auto_clean_columns(train_data)
print(train_data)

Generate .json


In [None]:
import pandas as pd
import json


def auto_generate_description_dict(train_data: pd.DataFrame) -> dict:
    result = {}

    for column in train_data.columns:
        result[column] = {"description": "", "items": {}}

        id_counter = 0
        seen = {}

        for value in train_data[column]:
            if isinstance(value, str):
                # Add NA for first every column
                if "NA" not in seen:
                    result[column]["items"]["NA"] = {"id": id_counter}
                    seen["NA"] = True
                    id_counter += 1

                key = value.strip().upper()
                if key not in seen:
                    result[column]["items"][key] = {"id": id_counter}
                    seen[key] = True
                    id_counter += 1
            # else:
            # print(column)

    return result


#  Create descriptions from train data (DO NOT DO IT FROM TEST DATA)
description_dictionary = auto_generate_description_dict(train_data)
print(description_dictionary)

with open("data_set/generated_description.json", "w", encoding="utf-8") as f:
    json.dump(description_dictionary, f, indent=2, ensure_ascii=False)

print("✅ JSON dictionary generated.")

In [None]:
print(train_data.iloc[:10])

Replace string with values


In [None]:
import json
import re

txt_path = "data_set/data_description.txt"
json_path = "data_set/generated_description.json"


# Použití
# utils_io.parse_description_txt_to_json(txt_path, json_path)

generated_description = json.load(open(json_path))


for column in train_data.columns:
    for index, value in train_data[column].items():
        # if value == "C (all)":
        #     print(column, index, value)
        if isinstance(value, str):
            try:
                train_data.at[index, column] = generated_description[column]["items"][
                    value.strip().upper()
                ]["id"]
            except:
                print(column, index, value, isinstance(value, str))

Dobrý den,
Prosím bylo by možné zamluvit chatu v osazení
9 dospělích 2 děti do 10 let, 2 děti do 2 let,
v označeném termínu 17-20.4.


In [None]:
print(train_data.iloc[:10])

calc mean and std


In [None]:
mean = train_data.mean(axis=0)
# print(mean)
std = train_data.std(axis=0)
# print(std)

train_data = train_data - mean
train_data = train_data / std

In [None]:
column = 7
print(max(train_data.iloc[:, column]))
print(min(train_data.iloc[:, column]))

# Create NN neural network


### OLD


In [12]:
print(max(target_data))
print(min(target_data))
print(np.mean(target_data))
print(max(target_data) - min(target_data))

755000
34900
180921.19589041095
720100


In [13]:
print(type(train_data))

<class 'pandas.core.frame.DataFrame'>


In [None]:
train_data = train_data.astype("float32")
target_data = target_data.astype("float32")

# NEW


X_train, y_train, transformer


In [14]:
i = 0

k = 4
num_val_samples = len(train_data) // k
print(num_val_samples)
print(f"processing fold # {i}")
print(target_data.shape)
val_data = train_data[i * num_val_samples : (i + 1) * num_val_samples]
val_targets = target_data[i * num_val_samples : (i + 1) * num_val_samples]
print(val_data.shape)
print(val_targets.shape)

partial_train_data = pd.concat(
    [train_data[: i * num_val_samples], train_data[(i + 1) * num_val_samples :]]
)
partial_train_targets = pd.concat(
    [target_data[: i * num_val_samples], target_data[(i + 1) * num_val_samples :]]
)
print(partial_train_data.shape)
print(partial_train_targets.shape)
# print(partial_train_data.shape)
print(train_data.shape[1])

365
processing fold # 0
(1460,)
(365, 80)
(365,)
(1095, 80)
(1095,)
80


In [None]:
baseline_prediction = np.full_like(target_data, np.mean(target_data))
baseline_mae = np.mean(np.abs(target_data - baseline_prediction))
print(baseline_mae)

In [None]:
import keras
import time

start = time.time()


def build_model():
    input_shape = (79,)  # 79 parameters of house
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(128, activation="relu", input_shape=input_shape))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dropout(0.3))
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dense(1))  # regresní výstup
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model


# def build_model():
#     model = keras.models.Sequential()
#     input_shape = (79,)  # 79 parameters of house
#     model.add(
#         keras.layers.Dense(units=256, activation="relu"),
#     )
#     model.add(keras.layers.Dense(units=256, activation="relu"))
#     model.add(keras.layers.Dense(units=256, activation="relu"))
#     model.add(keras.layers.Dense(units=1))  # output layer price of house
#     model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
#     return model


k = 5
num_val_samples = len(train_data) // k
num_epochs = 100
all_scores = []
all_histories = []

for i in range(k):
    print(f"processing fold # {i}")
    val_data = train_data[i * num_val_samples : (i + 1) * num_val_samples]
    val_targets = target_data[i * num_val_samples : (i + 1) * num_val_samples]
    partial_train_data = pd.concat(
        [train_data[: i * num_val_samples], train_data[(i + 1) * num_val_samples :]]
    )
    partial_train_targets = pd.concat(
        [target_data[: i * num_val_samples], target_data[(i + 1) * num_val_samples :]]
    )

    model = build_model()
    print("- 🧠 Spouštím trénování...")
    history = model.fit(
        partial_train_data,
        partial_train_targets,
        epochs=num_epochs,
        batch_size=64,
        verbose=0,
    )

    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    print(f"- ✅ Fold {i + 1} finished: val_mae = {val_mae:.2f}")
    all_scores.append(val_mae)
    all_histories.append(history)

print(all_scores)
print(f"🕒 Celkový čas trénování: {time.time() - start:.2f} s")

In [None]:
model.save("house_prices_0.keras")

In [None]:
print("\n" + "=" * 50)
print("📊 Výsledky cross-validace:")
for i, score in enumerate(all_scores):
    print(f"Fold {i + 1}: MAE = {score:.2f}")

average = np.mean(all_scores)
print("-" * 50)
print(f"📈 Průměrná MAE přes {k} foldů: {average:.2f}")

In [None]:
epoch_maes = [
    [history.history["mae"][epoch] for history in all_histories]
    for epoch in range(num_epochs)
]
average_mae_history = [np.mean(epoch) for epoch in epoch_maes]
std_mae_history = [np.std(epoch) for epoch in epoch_maes]

plt.figure(figsize=(10, 6))
plt.plot(
    range(1, num_epochs + 1), average_mae_history, marker="o", label="Průměrná MAE"
)
plt.fill_between(
    range(1, num_epochs + 1),
    np.array(average_mae_history) - np.array(std_mae_history),
    np.array(average_mae_history) + np.array(std_mae_history),
    alpha=0.2,
    label="± 1 std",
)
plt.xlabel("Epoch")
plt.ylabel("Mean Absolute Error (MAE)")
plt.title("📈 Průměrná MAE s rozptylem (všechny foldy)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# Test model on test data


In [15]:
import tensorflow as tf

print("GPU available:", tf.config.list_physical_devices("GPU"))

GPU available: []


In [16]:
with tf.device("/GPU:0"):
    print("Running on GPU ✅")

Running on GPU ✅


In [17]:
from tensorflow.python.platform import build_info as tf_build_info

print("CUDA version:", tf_build_info.cuda_version_number)
print("cuDNN version:", tf_build_info.cudnn_version_number)

AttributeError: module 'tensorflow.python.platform.build_info' has no attribute 'cuda_version_number'