In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# ===============================
# 1. IMPORT LIBRARIES
# ===============================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# ===============================
# 2. LOAD DATA
# ===============================
df = pd.read_csv("/kaggle/input/ames-housing-dataset/AmesHousing.csv")
print(df.shape)
df.head()


(2930, 82)


Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [2]:
# Remove useless ID column if present
df = df.drop(columns=["Order"], errors='ignore')
df.head()


Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,215000
1,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,244000
4,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [4]:
# Create TotalBsmtSF since dataset does not have it directly
df["TotalBsmtSF"] = (
    df["BsmtFin SF 1"] +
    df["BsmtFin SF 2"] +
    df["Bsmt Unf SF"]
)

# Create TotalSF = Total basement + 1st floor + 2nd floor
df["TotalSF"] = (
    df["TotalBsmtSF"] +
    df["1st Flr SF"] +
    df["2nd Flr SF"]
)


In [6]:
# Fill NA for numerical values where NA = no feature
num_fill_zero = [
    "Garage Yr Blt", "Garage Cars", "Garage Area",
    "BsmtFin SF 1", "BsmtFin SF 2", "Bsmt Unf SF",
    "Bsmt Full Bath", "Bsmt Half Bath", "TotalBsmtSF"
]

for col in num_fill_zero:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Fill remaining numerical columns with median
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with mode
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [7]:
ordinal_cols = {
    "Exter Qual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "Exter Cond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "Bsmt Qual": ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    "Bsmt Cond": ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    "Heating QC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "Kitchen Qual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "Fireplace Qu": ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    "Garage Qual": ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    "Garage Cond": ["None", "Po", "Fa", "TA", "Gd", "Ex"]
}


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
import numpy as np

# Separate target and predictors
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Ordinal columns
ordinal_features = list(ordinal_cols.keys())
ordinal_categories = list(ordinal_cols.values())

# One-Hot categorical features (exclude ordinal)
oh_features = list(set(X.select_dtypes(include=['object']).columns) - set(ordinal_features))

# Numeric features (exclude any categorical)
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()

# Remove target column if accidentally present
numeric_features = [col for col in numeric_features if col != "SalePrice"]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("ordinal", OrdinalEncoder(categories=ordinal_categories), ordinal_features),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), oh_features),
        ("scale", StandardScaler(), numeric_features)
    ],
    remainder="drop"   # Drop remaining columns if any
)


In [10]:
# ---------------------------------------------------
# 8. Log-transform target (SalePrice)
# ---------------------------------------------------
y_log = np.log1p(y)   # log(1 + SalePrice)


# ---------------------------------------------------
# 9. Train-test split
# ---------------------------------------------------
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)


# ---------------------------------------------------
# 10. Model Pipeline (Linear Regression)
# ---------------------------------------------------
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

model = Pipeline(steps=[
    ("preprocess", preprocessor),   # your corrected transformer
    ("model", LinearRegression())
])

# Train
model.fit(X_train, y_train)

# Predict
preds = model.predict(X_test)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE:", rmse)


RMSE: 80162057.02462673
