In [0]:
import os
import pandas as pd


# Build an absolute path to the dataset, starting from the current working directory
relative_csv = "../../../data/house_price_prediction/House Price Prediction Dataset.csv"
current_path = os.getcwd()
target_path = os.path.abspath(os.path.join(current_path, relative_csv))

# Load the CSV file into a DataFrame
df = pd.read_csv(target_path)

# Preview the first rows (head() returns five rows by default)
df.head()


Id: A unique identifier assigned to each house record in the dataset.

Area: The total living or built-up space of the house, usually measured in square feet or square meters.

Bedrooms: The number of bedrooms in the house.

Bathrooms: The number of bathrooms in the house.

Floors: The total number of floors or stories in the house.

YearBuilt: The year the house was originally constructed.

Location: The geographic area or neighborhood where the house is located.

Condition: The overall physical state or quality rating of the house.

Garage: Information about whether the house has a garage and possibly its capacity.

Price: The market or sale price of the house.

In [0]:
# Count the missing (null) values in each column
df.isnull().sum()

So there are no missing values.

In [0]:
# Count fully duplicated rows. The count is printed explicitly because a
# notebook cell only displays its LAST expression, so a bare
# `df.duplicated().sum()` in the middle of the cell would be discarded.
n_duplicates = df.duplicated().sum()
print(f"Duplicated rows: {n_duplicates}")

# Drop the duplicated rows (the first occurrence of each is kept)
df = df.drop_duplicates()


In [0]:
# Number of rows for each distinct Location value
df["Location"].value_counts()

In [0]:
# Distinct values that appear in the Condition column
df["Condition"].unique()


In [0]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

In [0]:
# Replace the construction year with the house's age, which is easier to
# interpret as a feature. Ages are measured relative to a fixed reference year.
REFERENCE_YEAR = 2025
df['Age'] = REFERENCE_YEAR - df['YearBuilt']

# Quick look at the first three computed ages
df['Age'].head(3)

In [0]:
# Column names after the Age column has been added
df.columns

In [0]:
# Id is only a row identifier and YearBuilt is already represented by Age,
# so neither column is kept as a feature
columns_to_remove = ['Id', 'YearBuilt']
df = df.drop(columns=columns_to_remove)
df.columns

In [0]:
# Preview the first rows after the column changes above
df.head()

In [0]:
import matplotlib.pyplot as plt

# Work with explicit figure/axes handles rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(8, 5))

# 50-bin histogram of Price: soft green bars, white edges, slight transparency
ax.hist(df["Price"], bins=50, color='#69b3a2', edgecolor='white', linewidth=1, alpha=0.85)

# Title and axis labels
ax.set_title("Price Distribution", fontsize=16, weight='bold')
ax.set_xlabel("Price", fontsize=12)
ax.set_ylabel("Count", fontsize=12)

# Show x-axis tick values as integers with thousands separators
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{int(value):,}'))

# Light dashed horizontal grid lines for readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Avoid clipped or overlapping labels
fig.tight_layout()

plt.show()


In [0]:
# Box plot of Price for each Condition category
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="Condition", y="Price", palette="Pastel1", ax=ax)
ax.set_title("Price by Condition")

# Show y-axis tick values as integers with thousands separators
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{int(value):,}'))

plt.show()


In [0]:
# Interactive scatter plot of Price against Age:
# marker colour encodes Garage and marker size encodes Area
import plotly.express as px

scatter_options = dict(
    x="Age",
    y="Price",
    color="Garage",
    size="Area",
    size_max=10,
    opacity=0.6,
    title="Price vs Age (interactive)",
)
fig = px.scatter(df, **scatter_options)
fig.show()


In [0]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_columns = ["Area", "Bedrooms", "Bathrooms", "Floors", "Price", "Age"]
correlations = df[numeric_columns].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlations, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [0]:
import matplotlib.pyplot as plt

# Smaller figure than the price histogram
fig, ax = plt.subplots(figsize=(6, 4))

# rwidth=0.9 draws each bar at 90% of its bin width, leaving a thin gap between bars
ax.hist(
    df["Bathrooms"],
    bins=30,
    color='#ffa07a',
    edgecolor='white',
    linewidth=1,
    alpha=0.85,
    rwidth=0.9,
)

# Title and axis labels
ax.set_title("Bathrooms Distribution", fontsize=14, weight='bold')
ax.set_xlabel("Bathrooms", fontsize=11)
ax.set_ylabel("Count", fontsize=11)

# Show x-axis tick values as integers
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{int(value):,}'))

# Light dashed horizontal grid lines for readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()


In [0]:
# Pie Chart: Distribution of Garage Types
import matplotlib.pyplot as plt

# Number of houses for each Garage value
garage_counts = df['Garage'].value_counts()

colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']  # custom color palette
slice_offsets = [0.05] * len(garage_counts)  # pull every slice slightly outwards

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    garage_counts,
    labels=garage_counts.index,
    autopct="%1.1f%%",
    startangle=120,
    colors=colors,
    shadow=True,
    explode=slice_offsets,
)

ax.set_title("Distribution of Garage Types", fontsize=14, weight='bold')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()



In [0]:
# Boxplots: Price by Location, Condition, and Garage
fig, axes = plt.subplots(3, 1, figsize=(12, 10))

# One stacked panel per categorical column, each drawn with its own palette
panels = [("Location", "Blues"), ("Condition", "Greens"), ("Garage", "Oranges")]

for ax, (column, palette) in zip(axes, panels):
    sns.boxplot(x=column, y='Price', data=df, palette=palette, ax=ax)
    ax.set_title(f"Price by {column}", fontsize=12, weight='bold')
    # Show y-axis tick values as integers with thousands separators
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{int(value):,}'))

fig.tight_layout()
plt.show()


In [0]:
# Remove the 'Age_jitter' / 'Price_jitter' helper columns if they are present.
# No visible cell in this notebook creates them, so an unconditional drop
# raises KeyError on a fresh kernel (Restart & Run All). errors="ignore" turns
# the drop into a no-op when the columns are absent and makes the cell safe
# to re-run.
df = df.drop(columns=['Age_jitter', 'Price_jitter'], errors="ignore")
df.columns


In [0]:
# Separate the target (Price) from the predictor columns
target_column = "Price"
y = df[target_column]
X = df.drop(columns=target_column)
X.head()

In [0]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [0]:
# Columns passed through to the models unchanged
numeric_features = [
    "Area",
    "Bedrooms",
    "Bathrooms",
    "Floors",
    "Age",
]

# Columns that are one-hot encoded before modelling
categorical_features = [
    "Location",
    "Condition",
    "Garage",
]


In [0]:
# Preprocessing step: one-hot encode the categorical columns and pass the
# numeric columns through untouched

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# handle_unknown="ignore": categories not seen during fit do not raise at predict time
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

transformer_steps = [
    ("cat", categorical_encoder, categorical_features),
    ("num", "passthrough", numeric_features),
]
preprocessor = ColumnTransformer(transformers=transformer_steps)



In [0]:
# Candidate regressors: two tree ensembles and a linear baseline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Both ensembles use the same number of trees and the same seed
ensemble_settings = dict(n_estimators=200, random_state=42)

rf_model = RandomForestRegressor(**ensemble_settings)
lr_model = LinearRegression()
gb_model = GradientBoostingRegressor(learning_rate=0.1, **ensemble_settings)


In [0]:
# Create one preprocessing + regressor pipeline per model.
#
# Each pipeline receives its own unfitted copy of the preprocessor via clone().
# Passing the same `preprocessor` object to all three would make them share a
# single transformer instance, so fitting (or re-fitting) one pipeline would
# silently replace the fitted encoder used by the other two.
from sklearn.base import clone
from sklearn.pipeline import Pipeline

rf_pipeline = Pipeline([("preprocessor", clone(preprocessor)), ("regressor", rf_model)])
lr_pipeline = Pipeline([("preprocessor", clone(preprocessor)), ("regressor", lr_model)])
gb_pipeline = Pipeline([("preprocessor", clone(preprocessor)), ("regressor", gb_model)])


In [0]:
# Fit every pipeline on the training split
for fitted_pipeline in (rf_pipeline, lr_pipeline, gb_pipeline):
    fitted_pipeline.fit(X_train, y_train)

# Echo the last fitted pipeline as the cell output (fit() returns the pipeline itself)
gb_pipeline


In [0]:
# Score each fitted pipeline on the held-out test split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

pipelines = {
    "Random Forest": rf_pipeline,
    "Linear Regression": lr_pipeline,
    "Gradient Boosting": gb_pipeline,
}

# (label, metric function) pairs, listed in the order they are printed
metric_functions = (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R²:", r2_score),
)

for name, pipe in pipelines.items():
    y_pred = pipe.predict(X_test)
    print(f"--- {name} ---")
    for label, metric in metric_functions:
        print(label, metric(y_test, y_pred))
    print()


In [0]:
import matplotlib.pyplot as plt

# Linear Regression predictions for the held-out test split
y_pred_lr = lr_pipeline.predict(X_test)

# End points of the y = x reference line; a perfect prediction lies on it
price_range = [y_test.min(), y_test.max()]

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, y_pred_lr, color='blue', alpha=0.6)
ax.plot(price_range, price_range, 'r--')
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Linear Regression: Actual vs Predicted")
plt.show()


In [0]:
# Predict the test-split prices with each fitted pipeline
y_pred_rf = rf_pipeline.predict(X_test)
y_pred_lr = lr_pipeline.predict(X_test)
y_pred_gb = gb_pipeline.predict(X_test)

# Print the first five predictions of every model
labelled_predictions = (
    ("Random Forest", y_pred_rf),
    ("Linear Regression", y_pred_lr),
    ("Gradient Boosting", y_pred_gb),
)
for label, predictions in labelled_predictions:
    print(f"{label} Predictions:", predictions[:5])


In [0]:
# Predict the price of a single hand-written example house
import pandas as pd

new_house = dict(
    Area=3000,
    Bedrooms=3,
    Bathrooms=2,
    Floors=2,
    Location="Downtown",
    Condition="Good",
    Garage="Yes",
    Age=30,
)

# One-row frame whose column names match the feature lists used by the preprocessor
new_df = pd.DataFrame([new_house])

# predict() returns an array; take the single value for this one row
pred_rf, pred_lr, pred_gb = (
    pipeline.predict(new_df)[0]
    for pipeline in (rf_pipeline, lr_pipeline, gb_pipeline)
)

print(f"Predicted Price (Random Forest): {pred_rf}")
print(f"Predicted Price (Linear Regression): {pred_lr}")
print(f"Predicted Price (Gradient Boosting): {pred_gb}")


# Conclusion
The machine learning models—Random Forest, Linear Regression, and Gradient Boosting—provide different perspectives on predicting house prices. Tree-based models like Random Forest and Gradient Boosting capture complex nonlinear relationships between features such as Area, Bedrooms, Age, and Condition, and often provide higher accuracy and lower prediction errors compared to linear models. Linear Regression, while simpler, gives clear insights into which numerical features most influence price through its coefficients.

From the feature importance of tree-based models, we can identify which attributes most impact the prediction, for instance, Area and Age are likely the strongest predictors. Examining the distribution of Price helps understand skewness and potential outliers, which explains variations in model performance.

By comparing evaluation metrics (MAE, MSE, R²), we can determine the most reliable model for predicting house prices. Further improvements can be achieved by adding derived features (e.g., TotalRooms, IsMultiFloor), collecting more diverse data for categorical variables, handling outliers, and tuning hyperparameters using GridSearchCV or RandomizedSearchCV.

These insights help stakeholders make informed decisions about housing valuation and reveal which property characteristics most strongly influence market price.

In [0]:
# -----------------------------
# Expected Insights Extraction Guidelines
# -----------------------------

# 1️⃣ Feature Importance:
# - For tree-based models (Random Forest, Gradient Boosting):
#   - Use the feature_importances_ attribute to see which features most influence the prediction.
#   - Example (read from the fitted pipeline; `rf_model` is the bare regressor and has no
#     named_steps, and the importances refer to the encoded feature names rather than
#     to X_train.columns):
#       importance = rf_pipeline.named_steps['regressor'].feature_importances_
#       feature_names = rf_pipeline.named_steps['preprocessor'].get_feature_names_out()
#       print(sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True))
# - For linear models (Linear, Ridge, Lasso):
#   - Look at the coefficients (coef_) to identify influential numeric features.

# 2️⃣ Price Distribution:
# - Visualize the target variable to understand skewness or outliers:
#   - Use seaborn.histplot(df['Price'], kde=True)
#   - Apply log transformation if Price is highly skewed to improve regression performance.
# - This insight helps explain why some models may underperform.

# 3️⃣ Model Comparison:
# - After evaluating multiple models (Linear, Ridge, Lasso, Random Forest, Gradient Boosting):
#   - Collect metrics: MAE, MSE, R2 Score
#   - Compare them in a table or DataFrame for clarity.
#   - Example (`results` is a list of per-model metric dicts that you build yourself):
#       results_df = pd.DataFrame(results).sort_values(by='R2 Score', ascending=False)
# - This helps determine which model predicts house prices most accurately.

# 4️⃣ Recommendations:
# - Suggest ways to improve model performance based on observed results:
#   - Add new features (e.g., HouseAge, TotalRooms, IsMultiFloor)
#   - Collect more diverse data for Location, Condition, YearBuilt
#   - Handle outliers or skewed distributions
#   - Tune model hyperparameters (GridSearchCV / RandomizedSearchCV)
# - These recommendations are drawn from observed model weaknesses and dataset characteristics.
