In [0]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Build the absolute path to the dataset, starting from the directory the
# notebook is executed in.
current_path = os.getcwd()
relative_csv = "../../../data/house_price_prediction/House Price Prediction Dataset.csv"
joined_path = os.path.join(current_path, relative_csv)
target_path = os.path.abspath(joined_path)

# Read the CSV file into a DataFrame.
df = pd.read_csv(target_path)

# Preview the first two rows.
df.head(2)


In [0]:
# Display the column labels of the loaded DataFrame.
df.columns

**Id**: A unique identifier assigned to each house record in the dataset.

**Area**: The total living or built-up space of the house, usually measured in square feet or square meters.

**Bedrooms**: The number of bedrooms in the house.

**Bathrooms**: The number of bathrooms in the house.

**Floors**: The total number of floors or stories in the house.

**YearBuilt**: The year the house was originally constructed.

**Location**: The geographic area or neighborhood where the house is located.

**Condition**: The overall physical state or quality rating of the house.

**Garage**: Information about whether the house has a garage and possibly its capacity.

**Price**: The market or sale price of the house.

In [0]:
# Report the number of missing values per column. The ":\n" separator keeps the
# label from being fused onto the first line of the printed Series.
print(f"missing value in data:\n{df.isnull().sum()}")
# Report the number of fully duplicated rows.
print(f"Duplicated value ={df.duplicated().sum()}")

In [0]:
# Remove the record identifier column, which is not a feature.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it once 'Id' is already gone no longer raises KeyError.
df = df.drop(columns=['Id'], errors='ignore')


In [0]:
# ---- Histograms: Price and Bathrooms Distribution ----

plt.style.use('dark_background')
sns.set_style("darkgrid")

# One figure, two side-by-side panels addressed through explicit Axes objects.
fig, (price_ax, bath_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: price histogram with a KDE overlay.
sns.histplot(df['Price'], kde=True, color='cyan', ax=price_ax)
price_ax.set_title('Price Distribution')

# Right panel: bathroom-count histogram.
sns.histplot(df['Bathrooms'], color='lightgreen', ax=bath_ax)
bath_ax.set_title('Bathrooms Distribution')

fig.tight_layout()
plt.show()


In [0]:
# Pie charts: percentage share of each category, one figure per categorical column.
#
# The original cell was an orphaned loop body: its lines were indented without a
# `for` header (IndentationError) and neither `col` nor `counts` was defined.
# The loop and the value counts are restored here.
# NOTE(review): the original used subplot(1, 2, 2), which suggests a left-hand
# panel that is no longer in the notebook; only the pie is drawn here.
categorical_cols = ['Location', 'Condition', 'Garage']

for col in categorical_cols:
    # Frequency of each category in this column.
    counts = df[col].value_counts()

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.pie(counts, labels=counts.index, autopct="%1.1f%%", startangle=90)
    ax.set_title(f"Percentage of {col}")
    ax.axis('equal')  # keep the pie circular
    fig.tight_layout()
    plt.show()

In [0]:
# Correlation matrix of the numerical columns.
# `numerical` was never defined in the notebook, so this cell raised NameError
# on a fresh kernel; the numeric columns are listed explicitly here.
numerical = ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Price']
correlation_mat = df[numerical].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_mat, annot=True, fmt=".2f")
plt.title('Correlation Matrix of Numerical Feature')
plt.show()

In [0]:
# ---- Scatter Plot: Price vs Area ----

# seaborn returns the Axes it drew on, so the title is set on it directly.
area_price_ax = sns.scatterplot(data=df, x='Area', y='Price', color='magenta')
area_price_ax.set_title('Price vs Area')
plt.show()


In [0]:
# ---- Heatmap: Correlation Between Numerical Variables ----

numeric_cols = ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Price']

# Compute the pairwise correlations first, then render them as an annotated heatmap.
pairwise_corr = df[numeric_cols].corr()
heatmap_ax = sns.heatmap(pairwise_corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()


In [0]:
# ---- Boxplots: Price by Location, Condition, and Garage ----

# Three stacked panels, one per grouping column, drawn in a single loop.
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

for panel, group_col in zip(axes, ('Location', 'Condition', 'Garage')):
    sns.boxplot(x=group_col, y='Price', data=df, ax=panel)
    panel.set_title(f'Price by {group_col}')

fig.tight_layout()
plt.show()


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Select the features and the target.
# 'Id' has already been removed from df by an earlier cell, so listing it
# unguarded raised KeyError on Restart & Run All. errors='ignore' tolerates its
# absence while still dropping it when this cell runs on the untouched frame.
# (A missing 'Price' still fails loudly on the next line.)
X = df.drop(columns=['Id', 'Price'], errors='ignore')
y = df['Price']

# Group the features by type.
numeric_features = ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt']
categorical_features = ['Location', 'Condition', 'Garage']

# Split the data into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [0]:
# Preprocessing for the numeric and categorical columns.
# Numeric columns are standardised; categorical columns are one-hot encoded.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [0]:
# Build the model as a pipeline: preprocessing followed by a random forest regressor.
forest = RandomForestRegressor(n_estimators=200, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Train the model on the training split.
model.fit(X_train, y_train)


In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on the held-out split and report each regression metric in turn.
y_pred = model.predict(X_test)

for label, metric in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2 Score:", r2_score),
):
    print(label, metric(y_test, y_pred))


In [0]:
# ---- Linear Regression ----

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split



# Set up preprocessing: scale numeric columns, one-hot encode categorical ones.
scaler_step = ('num', StandardScaler(), numeric_features)
encoder_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[scaler_step, encoder_step])

# Create and train the model.
lr_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
lr_model.fit(X_train, y_train)

# Predict on the test split and evaluate.
y_pred = lr_model.predict(X_test)
print("Linear Regression:")
for label, metric in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2 Score:", r2_score),
):
    print(label, metric(y_test, y_pred))


In [0]:
# Plot the regression scatter plot.
#
# The original call referenced three names that exist nowhere in the notebook:
# the helper `plot_linear_regression`, `model_lr` (the fitted pipeline is named
# `lr_model`) and `info_dict`. The helper is defined here and called with the
# pipeline that was actually trained.
# NOTE(review): the original helper and the schema of `info_dict` are not
# visible; the label keys used below are an assumption to confirm.
def plot_linear_regression(model, X, y, info_dict=None):
    """Scatter predicted against actual target values for a fitted regressor.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X : feature rows passed to ``model.predict``.
    y : true target values aligned with ``X``.
    info_dict : optional mapping with ``title``, ``xlabel`` and ``ylabel``
        entries; any key left out falls back to a default label.
    """
    labels = {
        "title": "Actual vs Predicted",
        "xlabel": "Actual",
        "ylabel": "Predicted",
    }
    if info_dict:
        labels.update(info_dict)

    predicted = model.predict(X)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y, predicted, alpha=0.5)

    # Diagonal reference line: points on it are perfect predictions.
    low = min(y.min(), predicted.min())
    high = max(y.max(), predicted.max())
    ax.plot([low, high], [low, high], linestyle="--", color="red")

    ax.set_title(labels["title"])
    ax.set_xlabel(labels["xlabel"])
    ax.set_ylabel(labels["ylabel"])
    fig.tight_layout()
    plt.show()


info_dict = {
    "title": "Linear Regression: Actual vs Predicted Price",
    "xlabel": "Actual Price",
    "ylabel": "Predicted Price",
}
plot_linear_regression(lr_model, X_test, y_test, info_dict)

In [0]:
# ---- Ridge Regression ----

from sklearn.linear_model import Ridge

# Assemble and fit the ridge pipeline (shared preprocessor + Ridge regressor).
ridge_steps = [
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0)),
]
ridge_model = Pipeline(steps=ridge_steps)
ridge_model.fit(X_train, y_train)

# Predict on the test split and report the metrics.
y_pred = ridge_model.predict(X_test)
print("Ridge Regression:")
for label, metric in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2 Score:", r2_score),
):
    print(label, metric(y_test, y_pred))


In [0]:
# ---- Gradient Boosting Regression ----

from sklearn.ensemble import GradientBoostingRegressor

# Assemble and fit the gradient boosting pipeline (shared preprocessor + regressor).
booster = GradientBoostingRegressor(n_estimators=200, random_state=42)
gb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', booster),
])
gb_model.fit(X_train, y_train)

# Predict on the test split and report the metrics.
y_pred = gb_model.predict(X_test)
print("Gradient Boosting Regression:")
for label, metric in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2 Score:", r2_score),
):
    print(label, metric(y_test, y_pred))


In [0]:
# -----------------------------
# Expected Insights Extraction Guidelines
# -----------------------------

# 1️⃣ Feature Importance:
# - For tree-based models (Random Forest, Gradient Boosting):
#   - Use the feature_importances_ attribute to see which features most influence the prediction.
#   - Example:
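#       # `model` is the Random Forest pipeline fitted above.
#       importance = model.named_steps['regressor'].feature_importances_
#       # Use the post-encoding feature names: one-hot encoding expands the
#       # categorical columns, so X_train.columns would not line up.
#       feature_names = model.named_steps['preprocessor'].get_feature_names_out()
#       print(sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True))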
# - For linear models (Linear, Ridge, Lasso):
#   - Look at the coefficients (coef_) to identify influential numeric features.

# 2️⃣ Price Distribution:
# - Visualize the target variable to understand skewness or outliers:
#   - Use seaborn.histplot(df['Price'], kde=True)
#   - Apply log transformation if Price is highly skewed to improve regression performance.
# - This insight helps explain why some models may underperform.

# 3️⃣ Model Comparison:
# - After evaluating multiple models (Linear, Ridge, Lasso, Random Forest, Gradient Boosting):
#   - Collect metrics: MAE, MSE, R2 Score
#   - Compare them in a table or DataFrame for clarity.
#   - Example:
#       results_df = pd.DataFrame(results).sort_values(by='R2 Score', ascending=False)
# - This helps determine which model predicts house prices most accurately.

# 4️⃣ Recommendations:
# - Suggest ways to improve model performance based on observed results:
#   - Add new features (e.g., HouseAge, TotalRooms, IsMultiFloor)
#   - Collect more diverse data for Location, Condition, YearBuilt
#   - Handle outliers or skewed distributions
#   - Tune model hyperparameters (GridSearchCV / RandomizedSearchCV)
# - These recommendations are drawn from observed model weaknesses and dataset characteristics.
