In [3]:
# 1. Load libraries
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
# 2. Load dataset
data = pd.read_csv("train.csv")
# 3. Feature selection (6 features)
features = [
    "OverallQual",
    "GrLivArea",
    "TotalBsmtSF",
    "GarageCars",
    "YearBuilt",
    "Neighborhood"
]

target = "SalePrice"

X = data[features]
y = data[target]
# 4. Handling missing values
X["TotalBsmtSF"].fillna(0, inplace=True)
X["GarageCars"].fillna(0, inplace=True)
# 5. Define numerical and categorical columns
numerical_features = [
    "OverallQual",
    "GrLivArea",
    "TotalBsmtSF",
    "GarageCars",
    "YearBuilt"
]

categorical_features = ["Neighborhood"]
# 6. Preprocessing
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", pd.get_dummies)
])
# 7. Encode categorical feature manually
X_encoded = pd.get_dummies(X, columns=categorical_features)
# 8. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)
# 9. Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 10. Model implementation
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

model.fit(X_train_scaled, y_train)
# 11. Model evaluation
y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r2)
# 12. Save model and scaler
joblib.dump(
    {
        "model": model,
        "scaler": scaler,
        "columns": X_encoded.columns
    },
    "house_price_model.pkl"
)
# 13. Reload model (no retraining)
loaded_data = joblib.load("house_price_model.pkl")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["TotalBsmtSF"].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["TotalBsmtSF"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to 

MAE: 18398.24546355186
MSE: 798964437.476091
RMSE: 28265.95898737722
R2 Score: 0.8958369675689336
