# House Price Prediction in Canada


In [1]:

# ## Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


## Load Dataset

In [2]:

data = pd.read_csv("/Users/manish/Documents/Adobe/ca_real_state.csv")


## Explore Dataset

In [3]:

print(data.head())
print(data.info())
print(data.describe())

# Check for missing values
print(data.isnull().sum())


    Price  Bedrooms  Bathrooms  SqFt       City Province  Year_Built  \
0  873630         5          2  1010   Montreal       BC        1960   
1  377869         2          2  3591    Toronto       ON        1958   
2  128030         4          1  3823   Montreal       ON        2002   
3  117730         3          2  2848   Montreal       QC        1975   
4  292476         4          1  3659  Vancouver       QC        2018   

        Type  Garage  Lot_Area  
0      Condo       1      7919  
1      House       1      7304  
2      House       0      4548  
3  Apartment       1      3374  
4      Condo       1      1281  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Price       5000 non-null   int64 
 1   Bedrooms    5000 non-null   int64 
 2   Bathrooms   5000 non-null   int64 
 3   SqFt        5000 non-null   int64 
 4   City        5000 

 ## Data Preprocessing

In [9]:

# Fill missing values (if any) or drop rows/columns
data.fillna(method='ffill', inplace=True)

# Separate features and target
X = data.drop("Price", axis=1)
y = data["Price"]

# Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=["object"]).columns
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns


  data.fillna(method='ffill', inplace=True)


In [11]:

# ## Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Preprocessing Pipelines

In [13]:

# Pipeline for numerical features
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Pipeline for categorical features
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)


## Modeling Pipeline

In [15]:

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])


## Train the Model

In [17]:

model_pipeline.fit(X_train, y_train)


## Evaluate the Model

In [19]:

# Predictions
y_pred_train = model_pipeline.predict(X_train)
y_pred_test = model_pipeline.predict(X_test)

# Metrics
print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Train RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))


Train MAE: 84106.2384775
Test MAE: 229121.26236000002
Train RMSE: 98626.57871390971
Test RMSE: 267560.5637258383


## Feature Importance (for RandomForest)

In [41]:

model = model_pipeline.named_steps['model']
if hasattr(model, 'feature_importances_'):
    feature_names = numerical_features.tolist() + list(
        model_pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features))
    feature_importances = pd.DataFrame(
        {'Feature': feature_names, 'Importance': model.feature_importances_}
    ).sort_values(by='Importance', ascending=False)
    print(feature_importances)


           Feature  Importance
5         Lot_Area    0.242047
2             SqFt    0.236246
3       Year_Built    0.180294
0         Bedrooms    0.063004
1        Bathrooms    0.043596
4           Garage    0.025536
13     Province_ON    0.020594
6     City_Calgary    0.019927
9     City_Toronto    0.018827
10  City_Vancouver    0.018479
8      City_Ottawa    0.017810
7    City_Montreal    0.017524
12     Province_BC    0.017238
16      Type_Condo    0.016389
17      Type_House    0.016012
15  Type_Apartment    0.015978
11     Province_AB    0.015767
14     Province_QC    0.014732


## Save the Model

In [None]:
import joblib
joblib.dump(model_pipeline, 'house_price_model.pkl')


 ## Load and Use the Model

In [31]:
loaded_model = joblib.load('house_price_model.pkl')
# Adjust sample data to include all columns expected by the model
sample_data = pd.DataFrame({
    'Province': ['Manitoba'],  
    'City': ['Winnipeg'],      
    'Type': ['Detached'],      
    'SqFt': [1500],
    'Bedrooms': [3],
    'Bathrooms': [2],
    'Garage': [1],            
    'Year_Built': [2005],     
    'Lot_Area': [5000]         
})
 # Replace with actual features
sample_prediction = loaded_model.predict(sample_data)
print("Sample Prediction:", sample_prediction)


Sample Prediction: [622458.91]
