In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, expr
from pyspark.sql.functions import sum as pyspark_sum
import os


In [5]:
import shutil

# Path to the folder you want to delete
folder_path = '/content/cleaned_valuation_data.csv'

# Check if the file exists before deleting
if os.path.exists(folder_path):
       shutil.rmtree(folder_path)  # Use shutil.rmtree to delete directories

In [6]:
# pre processing step using Py Spark
# Create Spark session
spark = SparkSession.builder \
    .appName("Data Preprocessing with PySpark") \
    .getOrCreate()

# Step 1: Load the CSV File
df = spark.read.csv("Valuation.csv", header=True, inferSchema=True)
df.show(5)
df.printSchema()

#Find and remove the uncessary arabic columns
# Identify Arabic columns (those ending with '_ar')
arabic_columns = [col_name for col_name in df.columns if col_name.endswith("_ar")]

# Drop Arabic columns
df_no_ar = df.drop(*arabic_columns)

# Step 2: Check for Missing Values
df.select([pyspark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()

# Step 3: Remove Rows with Missing Values in 'actual_worth'
df_clean = df_no_ar.filter(df_no_ar.actual_worth.isNotNull())

#print the dataframe showing no arabic columns
print("Dataframe with Arabic columns dropped")
df_clean.show(5)
# Step 4: Convert Columns to Correct Data Types
df_clean = df_clean.withColumn("instance_date", to_date(col("instance_date"), "dd-MM-yyyy")) \
                   .withColumn("procedure_area", col("procedure_area").cast("double")) \
                   .withColumn("actual_area", col("actual_area").cast("double")) \
                   .withColumn("property_sub_type_id", col("property_sub_type_id").cast("int")) \
                   .withColumn("area_id", col("area_id").cast("int"))

# Step 5: Outlier Detection and Removal using IQR Method
q1, q3 = df_clean.approxQuantile("actual_worth", [0.25, 0.75], 0.01)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

df_clean = df_clean.filter((col("actual_worth") >= lower_bound) & (col("actual_worth") <= upper_bound))

# Step 6: Fill Missing Values
# Fill missing 'procedure_area' with its median
median_value = df_clean.approxQuantile("procedure_area", [0.5], 0.01)[0]
df_clean = df_clean.fillna({"procedure_area": median_value})

# Fill missing 'instance_date' with the most common date
most_common_date = df_clean.groupBy("instance_date").count().orderBy("count", ascending=False).first()[0]
df_clean = df_clean.withColumn("instance_date", expr(f"coalesce(instance_date, '{most_common_date}')"))

# Step 7: Replace Unknowns with Most Common Sub-Type
# Find the most common value in 'property_sub_type_en'
mode_value = df_clean.groupBy("property_sub_type_en").count().orderBy("count", ascending=False).first()[0]

df_clean = df_clean.withColumn("property_sub_type_en", expr(f"coalesce(nullif(property_sub_type_en, 'Unknown'), '{mode_value}')"))

# Step 8: Save the Cleaned Data to a New CSV File
df_clean.write.csv("cleaned_valuation_data.csv", header=True)

# End Spark session
spark.stop()


+------------+-----------------+-------------------+--------------+----------------+-------------+------------+---------------+--------------+----------------+----------------+----------------+--------------------+--------------------+--------------------+-------+--------------------+--------------------+-----------+--------------------+
|procedure_id|procedure_name_ar|  procedure_name_en|procedure_year|procedure_number|instance_date|actual_worth|row_status_code|procedure_area|property_type_id|property_type_ar|property_type_en|property_sub_type_id|property_sub_type_ar|property_sub_type_en|area_id|        area_name_ar|        area_name_en|actual_area|property_total_value|
+------------+-----------------+-------------------+--------------+----------------+-------------+------------+---------------+--------------+----------------+----------------+----------------+--------------------+--------------------+--------------------+-------+--------------------+--------------------+-----------+--

In [None]:
# --- Feature Engineering ---
file_path = "/content/cleaned_valuation_data.csv"
df = pd.read_csv(file_path)
df = df.drop(columns=["procedure_id", "procedure_name_en", "procedure_number", "property_type_id","instance_date"])
categorical_cols = ["property_type_en", "property_sub_type_en", "area_name_en"]
numerical_cols = ["actual_worth", "actual_area", "property_total_value"]
label_encoder = LabelEncoder()
categorical_columns = ["row_status_code", "property_type_en", "area_name_en"]
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])
if all(col in df.columns for col in ["property_type_en", "property_sub_type_en", "area_name_en"]):
    df.fillna({"property_type_en": "Unknown", "property_sub_type_en": "Unknown", "area_name_en": "Unknown"}, inplace=True)
    df[["property_type_en", "property_sub_type_en", "area_name_en"]] = df[["property_type_en", "property_sub_type_en", "area_name_en"]].astype(str)
df = pd.get_dummies(df, columns=["property_type_en", "property_sub_type_en", "area_name_en"], drop_first=True)


In [None]:
df.head()

Unnamed: 0,procedure_year,actual_worth,row_status_code,procedure_area,property_sub_type_id,area_id,actual_area,property_total_value,property_type_en_1,property_type_en_2,...,area_name_en_90,area_name_en_91,area_name_en_92,area_name_en_93,area_name_en_94,area_name_en_95,area_name_en_96,area_name_en_97,area_name_en_98,area_name_en_99
0,2005,9750000.0,1,696.77,62,239,696.77,9750000.0,True,False,...,False,False,False,False,False,False,False,False,False,False
1,2009,39883500.0,1,2470.2,62,368,2470.2,39883500.0,True,False,...,False,False,False,False,False,False,False,False,False,False
2,2004,4000000.0,1,4455.35,11,319,4455.35,4000000.0,True,False,...,False,False,False,False,False,False,False,False,False,False
3,2006,3000000.0,1,1393.55,63,369,1393.55,3000000.0,True,False,...,False,False,False,False,False,False,False,False,False,False
4,2005,26208000.0,1,4869.6,65,378,4869.6,26208000.0,True,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
df.shape

(70740, 280)

In [None]:

average_property_value = df['property_total_value'].mean()
print(f"Average property_total_value: {average_property_value}")


Average property_total_value: 8766339.30091264


In [None]:
#scaling

numerical_cols = ["actual_worth", "actual_area", "property_total_value"]
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# View scaled data
print(df[numerical_cols].head())  # View first 5 rows
# Get descriptive statistics



   actual_worth  actual_area  property_total_value
0      0.095771    -0.118303              0.091028
1      2.958169     0.035429              2.879571
2     -0.450424     0.207514             -0.441075
3     -0.545415    -0.057902             -0.533615
4      1.659126     0.243423              1.614045


In [None]:
# --- Data Splitting --- # 80% train, 20% test
X = df.drop(columns=['actual_worth'])
y = df['actual_worth']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# --- Preprocessing Validation ---
print("X_train data types:\n", X_train.dtypes)  # Check data types after preprocessing
print("\nX_train head:\n", X_train.head())  # Check first few rows of train set

X_train data types:
 procedure_year            int64
row_status_code           int64
procedure_area          float64
property_sub_type_id      int64
area_id                   int64
                         ...   
area_name_en_95            bool
area_name_en_96            bool
area_name_en_97            bool
area_name_en_98            bool
area_name_en_99            bool
Length: 279, dtype: object

X_train head:
        procedure_year  row_status_code  procedure_area  property_sub_type_id  \
32520            2017                1         1025.25                    63   
51652            2021                1          174.77                    42   
16394            2014                1           45.00                    60   
20727            2016                1           61.97                    60   
60960            2023                1          334.45                    63   

       area_id  actual_area  property_total_value  property_type_en_1  \
32520      266    -0.089829   

# Boosting model explanation

## Why is boosting used

Boosting is a an ensemble learing technique where multiple weak learners are combined to create a stronger, more accurate model. It is typically performed during the training phase of the machine learning model

Boosting primarily aims to improve the accuracy and performance of a machine learning model by reducing the bias, improving generalization (by creating a model that is less prone to overfitting) and handling complex relationships by iteratively refining the model's predictions


## Code explanation

### Model
The model is designed to mimic the behavior of XGBoost Regression from scratch without the use of the predefined library. The class uses a collection of weak learners to create a strong predictive model

### Initialization

```
def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=10):
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.trees = []
    self.base_pred = 0
```

- n_estimators: The number of decision trees to be used in the model
- learning_rate: controls the step size during model updates, smaller values make the model learn more slowly but can improve generalization
- max_depth: The maximum depth of each decision tree. This parameter limits the complexity of individual trees.
- min_samples_split: The minimum number of samples required to split an internal node of a tree.
trees: A list to store the trained decision trees.
- base_pred: The initial prediction value, set to the average of the target variable.

### Gradient calculation
```
def mean_squared_error_grad(self, y_true, y_pred):
    """Gradient of MSE Loss: ∂L/∂y_pred = 2 * (y_pred - y_true)"""
    return 2 * (y_pred - y_true)
```

This function calculates the gradient of the Mean Squared Error (MSE) loss function. The gradient is used to guide the model updates during training.

### Model training
Define a function for training the defined regressor model from scratch

```
def fit(self, X, y):
    """Train the XGBoost regressor from scratch"""
    # Initialize prediction with the mean of target variable
    self.base_pred = np.mean(y)
    y_pred = np.full(y.shape, self.base_pred)

```
Initializes the prediction with the average of the target variable.

```
    for _ in range(self.n_estimators):
        # Compute negative gradients (residuals = errors made by the previous trees in predicting the target values)
        residuals = -self.mean_squared_error_grad(y, y_pred)

        # Fit weak learner (decision tree) to residuals
        tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
        tree.fit(X, residuals)

        # Get tree predictions and update overall prediction
        update = tree.predict(X)
        y_pred += self.learning_rate * update

        # Store trained tree
        self.trees.append(tree)
```
- Iterates through the specified number of estimators (n_estimators).
  - Calculates the residuals (negative gradients) based on the current prediction and actual target values.
  - Trains a decision tree on the residuals to capture the patterns in the errors.
  - Updates the overall prediction by adding the weighted prediction of the current tree.
  - Stores the trained tree in the trees list

### Prediction

```
def predict(self, X):
    """Make predictions using the trained model"""
    y_pred = np.full(X.shape[0], self.base_pred)  # Start with base prediction
    for tree in self.trees:
        y_pred += self.learning_rate * tree.predict(X)  # Add weak learner's contribution
    return y_pred
```
- Initializes the prediction with the base prediction.
- Iterates through the trained trees and adds their weighted predictions to the overall prediction.
- Returns the final prediction.

## Algorithm Explanation
XGBoost Algorithm

1.  Initialize the Model
The first prediction is the mean of the target variable (for regression) or log-odds (for classification).
Example: If predicting property_total_value, the first prediction is

  𝑦
^
=
mean
(
𝑦
)


2.  Compute Residuals (Negative Gradients)
For each sample, compute the residuals (errors) from the previous prediction.

  Residual
=
𝑦
true
−
𝑦
^


  These residuals act as the new target for the next tree.

3.  Fit a Decision Tree on Residuals
A small tree (weak learner) is trained to predict these residuals.
The tree learns which features contribute most to reducing errors.

4.  Compute the Output Value for Each Leaf
Unlike traditional boosting, XGBoost doesn't directly predict residuals.
Instead, it computes an optimal output value for each leaf of the tree, based on the residuals and second-order gradients (Hessian).

  𝑤
𝑗
=
−
(∑
Gradients /
∑
Hessians + 𝜆)


  - Gradients: First derivative of the loss function (indicates direction of error reduction).
  - Hessians: Second derivative (indicates confidence in the gradient).

  - λ: Regularization term to prevent overfitting.

5.  Update Predictions
Update the model by adding the new tree’s weighted predictions:
𝑦
^
=
𝑦
^
(previous) + 𝜂 ⋅ 𝑤 𝑗



- η: (learning rate) controls how much each tree contributes.
- Small η values require more trees but improve generalization.
6.  Repeat Steps 2–5 for Multiple Trees
Keep training new trees on updated residuals.
Stop when:
The max number of trees is reached.
The improvement in loss is below a threshold.

---

In [None]:
# --- XGBoost from Scratch Implementation ---
class XGBoostRegressorScratch:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=10):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []  # List to store weak decision tree models
        self.base_pred = 0  # Initial prediction value (for boosting)

    def mean_squared_error_grad(self, y_true, y_pred):
        """Gradient of MSE Loss: ∂L/∂y_pred = 2 * (y_pred - y_true)"""
        return 2 * (y_pred - y_true)

    def fit(self, X, y):
        """Train the XGBoost regressor from scratch"""
        # Initialize prediction with the mean of target variable
        self.base_pred = np.mean(y)
        y_pred = np.full(y.shape, self.base_pred)

        for _ in range(self.n_estimators):
            # Compute negative gradients (residuals)
            residuals = -self.mean_squared_error_grad(y, y_pred)

            # Fit weak learner (decision tree) to residuals
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X, residuals)

            # Get tree predictions and update overall prediction
            update = tree.predict(X)
            y_pred += self.learning_rate * update

            # Store trained tree
            self.trees.append(tree)

    def predict(self, X):
        """Make predictions using the trained model"""
        y_pred = np.full(X.shape[0], self.base_pred)  # Start with base prediction
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)  # Add weak learner's contribution
        return y_pred


In [None]:
# --- Cross-Validation for boosting model  ---
kf = KFold(n_splits=10, shuffle=True, random_state=42)
rmse_scores = []
r2_scores = []
mae_scores = []
medae_scores = []
explained_variance_scores = []


for train_index, test_index in kf.split(X_train):  # Cross-validation on training set
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model = XGBoostRegressorScratch(n_estimators=100, learning_rate=0.1, max_depth=3)
    model.fit(X_train_fold, y_train_fold)

    y_pred_fold = model.predict(X_val_fold)

    # Calculate metrics for this fold
    rmse_fold = np.sqrt(np.mean((y_pred_fold - y_val_fold) ** 2))
    rmse_scores.append(rmse_fold)

    r2_fold = r2_score(y_val_fold, y_pred_fold)
    r2_scores.append(r2_fold)

    mae_fold = mean_absolute_error(y_val_fold, y_pred_fold)
    mae_scores.append(mae_fold)

    medae_fold = median_absolute_error(y_val_fold, y_pred_fold)
    medae_scores.append(medae_fold)

    explained_variance_fold = explained_variance_score(y_val_fold, y_pred_fold)
    explained_variance_scores.append(explained_variance_fold)


# Print results for training set
print(f"Mean RMSE (Cross-Validation): {np.mean(rmse_scores):.2f}")
print(f"Standard Deviation of RMSE (Cross-Validation): {np.std(rmse_scores):.2f}")
print(f"Mean R-squared (Cross-Validation): {np.mean(r2_scores):.2f}")
print(f"Standard Deviation of R-squared (Cross-Validation): {np.std(r2_scores):.2f}")
print(f"Mean MAE (Cross-Validation) Training: {np.mean(mae_scores):.2f}")
print(f"Standard Deviation of MAE (Cross-Validation)Training: {np.std(mae_scores):.2f}")
print(f"Mean MedAE (Cross-Validation) Training: {np.mean(medae_scores):.2f}")
print(f"Standard Deviation of MedAE (Cross-Validation) Training: {np.std(medae_scores):.2f}")
print(f"Mean Explained Variance (Cross-Validation) Training: {np.mean(explained_variance_scores):.2f}")
print(f"Standard Deviation of Explained Variance (Cross-Validation)Training: {np.std(explained_variance_scores):.2f}")

Mean RMSE (Cross-Validation): 0.06
Standard Deviation of RMSE (Cross-Validation): 0.02
Mean R-squared (Cross-Validation): 1.00
Standard Deviation of R-squared (Cross-Validation): 0.00
Mean MAE (Cross-Validation) Training: 0.01
Standard Deviation of MAE (Cross-Validation)Training: 0.00
Mean MedAE (Cross-Validation) Training: 0.01
Standard Deviation of MedAE (Cross-Validation) Training: 0.00
Mean Explained Variance (Cross-Validation) Training: 1.00
Standard Deviation of Explained Variance (Cross-Validation)Training: 0.00


In [None]:
# --- Final Evaluation on Test Set for XG boosting  ---
model = XGBoostRegressorScratch(n_estimators=100, learning_rate=0.1, max_depth=3)
model.fit(X_train, y_train)  # Train on the entire training set
y_pred_test = model.predict(X_test)

# Calculate RMSE
rmse_test = np.sqrt(np.mean((y_pred_test - y_test) ** 2))

# Calculate R-squared
r2_test = r2_score(y_test, y_pred_test)

# Calculate MAE
mae_test = mean_absolute_error(y_test, y_pred_test)

# Calculate mean explained variance
explained_variance_test = explained_variance_score(y_test, y_pred_test)

# Print results
print(f"RMSE on Test Set: {rmse_test:.2f}")
print(f"R-squared on Test Set: {r2_test:.2f}")
print(f"MAE on Test Set: {mae_test:.2f}")
print(f"Mean Explained Variance on Test Set: {explained_variance_test:.2f}")

RMSE on Test Set: 0.05
R-squared on Test Set: 1.00
MAE on Test Set: 0.01
Mean Explained Variance on Test Set: 1.00


#**Bagging Ensemble Model Explanation**
Bagging (Bootstrap Aggregating) is an ensemble learning technique that improves the accuracy and stability of machine learning models by combining multiple weak learners (typically decision trees) into a stronger model. The key idea behind bagging is:

**Reducing Variance**: Each model is trained on a random subset of data, making the final prediction less sensitive to individual data variations.

**Improving Generalization**: Averaging multiple predictions reduces overfitting and enhances robustness.

**Handling Noise**: By training different models on different bootstrapped samples, bagging helps mitigate the impact of outliers and noisy data.

Bagging is particularly useful for high-variance models such as decision trees, where individual trees may be sensitive to specific data points.

# **Code Explanation**

**Model**

The model implements a Bagging Regressor from scratch, using multiple DecisionTreeRegressor models. It follows a standard ensemble learning approach by training multiple models on random subsets of data and averaging their predictions.

**Initialization**
```
def __init__(self, base_estimator=DecisionTreeRegressor, n_estimators=10, max_samples=0.8):
    self.base_estimator = base_estimator
    self.n_estimators = n_estimators
    self.max_samples = max_samples
    self.models = []

```
base_estimator: Specifies the weak learner (Decision Tree Regressor in this case).

n_estimators: Number of base models (trees) to train.

max_samples: Proportion of training data used for each model (default: 80% of dataset).

models: Stores the trained models.  

```
def fit(self, X, y):
    np.random.seed(42)
    n_samples = int(self.max_samples * len(X))
    for _ in range(self.n_estimators):
        sample_indices = np.random.choice(len(X), n_samples, replace=True)
        X_sample, y_sample = X.iloc[sample_indices], y.iloc[sample_indices]
        model = self.base_estimator()
        model.fit(X_sample, y_sample)
        self.models.append(model)
```
Creates n_estimators decision trees.

Each tree is trained on a random subset of data (bootstrap sampling).

Trees are stored in self.models.

```
def predict(self, X):
    predictions = np.array([model.predict(X) for model in self.models])
    return np.mean(predictions, axis=0)
```

Each model makes a prediction.

The final prediction is the average of all model outputs, reducing variance.
    


In [None]:
# Bagging Ensemble Model from Scratch
class BaggingRegressorScratch:
    def __init__(self, base_estimator=DecisionTreeRegressor, n_estimators=10, max_samples=0.8):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.models = []

    def fit(self, X, y):
        np.random.seed(42)
        n_samples = int(self.max_samples * len(X))
        for _ in range(self.n_estimators):
            sample_indices = np.random.choice(len(X), n_samples, replace=True)
            X_sample, y_sample = X.iloc[sample_indices], y.iloc[sample_indices]
            model = self.base_estimator()
            model.fit(X_sample, y_sample)
            self.models.append(model)

    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        return np.mean(predictions, axis=0)

In [None]:
# Perform 10-Fold Cross-Validation on training set for Bagging model
from sklearn.metrics import mean_squared_error
kf = KFold(n_splits=10, shuffle=True, random_state=42)
rmse_scores, r2_scores, mae_scores = [], [], []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    model = BaggingRegressorScratch(n_estimators=10)
    model.fit(X_train_fold, y_train_fold)
    y_pred_fold = model.predict(X_val_fold)

    rmse_scores.append(np.sqrt(mean_squared_error(y_val_fold, y_pred_fold)))
    r2_scores.append(r2_score(y_val_fold, y_pred_fold))
    mae_scores.append(mean_absolute_error(y_val_fold, y_pred_fold))

# Print Cross-Validation Results
print(f"Mean RMSE (Cross-Validation) Training: {np.mean(rmse_scores):.2f}")
print(f"Mean R-squared (Cross-Validation) Training: {np.mean(r2_scores):.2f}")
print(f"Mean MAE (Cross-Validation): Training {np.mean(mae_scores):.2f}")

Mean RMSE (Cross-Validation) Training: 0.06
Mean R-squared (Cross-Validation) Training: 1.00
Mean MAE (Cross-Validation): Training 0.00


In [None]:
# --- Final Evaluation on Test Set for BaggingRegressorScratch ---
bagging_model = BaggingRegressorScratch(n_estimators=10)  # Initialize your Bagging model
bagging_model.fit(X_train, y_train)  # Train on the entire training set
y_pred_test_bagging = bagging_model.predict(X_test)

# Calculate RMSE
rmse_test_bagging = np.sqrt(mean_squared_error(y_test, y_pred_test_bagging))

# Calculate R-squared
r2_test_bagging = r2_score(y_test, y_pred_test_bagging)

# Calculate MAE
mae_test_bagging = mean_absolute_error(y_test, y_pred_test_bagging)

# Calculate mean explained variance
explained_variance_test_bagging = explained_variance_score(y_test, y_pred_test_bagging)


# Print results
print(f"Bagging - RMSE on Test Set: {rmse_test_bagging:.2f}")
print(f"Bagging - R-squared on Test Set: {r2_test_bagging:.2f}")
print(f"Bagging - MAE on Test Set: {mae_test_bagging:.2f}")
print(f"Bagging - Mean Explained Variance on Test Set: {explained_variance_test_bagging:.2f}")

Bagging - RMSE on Test Set: 0.05
Bagging - R-squared on Test Set: 1.00
Bagging - MAE on Test Set: 0.00
Bagging - Mean Explained Variance on Test Set: 1.00
