In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Read the house-price table from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer="House Price Prediction Dataset.csv")

In [3]:
# Separate the regression target from its predictors.
# 'Price' is the target; 'Id' and 'Price' are both removed from the features.
y = df['Price']
X = df.drop(columns=['Id', 'Price'])

In [4]:
# Columns that will be one-hot encoded; every other feature column of X
# is collected into numerical_cols, in X's original column order.
categorical_cols = ['Location', 'Condition', 'Garage']
numerical_cols = list(filter(lambda name: name not in categorical_cols, X.columns))

In [5]:
# Preprocessing: one-hot encode the categorical columns and pass the
# numerical columns through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # drop='first' removes one indicator column per categorical feature.
        # handle_unknown='ignore' keeps transform/predict from raising when a
        # row contains a category that was absent from the training split;
        # such a value is encoded as all zeros (same as the dropped category).
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
        # List the numeric columns explicitly rather than relying on
        # remainder='passthrough'. The output column order is the same:
        # encoded categoricals first, then the numeric columns in X's order.
        ('num', 'passthrough', numerical_cols),
    ]
)

In [6]:
# Chain preprocessing and the linear regressor into a single estimator, so
# that fit/predict run the column transformer before the regression step.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit the whole pipeline (preprocessing + regression) on the training split.
# fit() returns the pipeline, so it is displayed as the cell's output.
model.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [9]:
# Generate price predictions for the held-out rows.
y_pred = model.predict(X=X_test)

In [10]:
# Score the held-out predictions: mean squared error and the coefficient of
# determination (R²).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [11]:
# Print the held-out metrics: MSE with 2 decimals, R² with 4 decimals.
print(f"📉 Mean Squared Error: {mse:.2f}")
print(f"📈 R² Score: {r2:.4f}")

📉 Mean Squared Error: 78321466146.03
📈 R² Score: -0.0067
