In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the semicolon-separated rents extract. low_memory=False makes pandas
# read the file in one pass instead of in chunks when inferring column dtypes.
Rents = pd.read_csv(
    'Rents & Transactions/rents.csv',
    sep=';',
    low_memory=False,
)

In [None]:
# Read the semicolon-separated transactions extract. low_memory=False makes
# pandas read the file in one pass instead of in chunks when inferring dtypes.
Transactions = pd.read_csv(
    'Rents & Transactions/transactions.csv',
    sep=';',
    low_memory=False,
)

In [None]:
# Print column names, dtypes and non-null counts of the raw rents frame.
Rents.info()

In [None]:
# Preview the first rows of the rents frame.
Rents.head()

In [None]:
# Coerce the size and both amount columns to numeric dtypes. Values that cannot
# be parsed become NaN (errors='coerce') instead of raising.
for numeric_col in ('Property Size (sq.m)', 'Annual Amount', 'Contract Amount'):
    Rents[numeric_col] = pd.to_numeric(Rents[numeric_col], errors='coerce')

In [None]:
# Parse the three contract date columns; unparseable entries become NaT.
# NOTE(review): no explicit format/dayfirst is given, so pandas infers it — confirm
# the inferred day/month order matches the source file.
for date_col in ('Registration Date', 'Start Date', 'End Date'):
    Rents[date_col] = pd.to_datetime(Rents[date_col], errors='coerce')

In [None]:
# Feature engineering: contract length in whole days (NaN where either date is NaT).
Rents['Contract Duration (days)'] = (Rents['End Date'] - Rents['Start Date']).dt.days

# A 'Price per sq.m' feature was drafted here and is currently disabled:
# Rents['Price per sq.m'] = Rents['Annual Amount'] / Rents['Property Size (sq.m)']
# Rents['Price per sq.m'] = pd.to_numeric(Rents['Price per sq.m'], errors='coerce')

# Check results.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
Rents.info()
print(Rents.head())

In [None]:
# Cap extreme annual amounts using the 1.5*IQR fences: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest fence (no rows are dropped).
annual_amount = Rents['Annual Amount']
Q1, Q3 = annual_amount.quantile(0.25), annual_amount.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

Rents['Annual Amount'] = annual_amount.clip(lower=lower_bound, upper=upper_bound)

# Boxplot of the capped values
ax = sns.boxplot(x=Rents['Annual Amount'])
ax.set_title("Boxplot of Annual Amount After Outlier Handling")
plt.show()


In [None]:
# Fill missing end dates by adding the mean observed contract length (in days)
# to the start date; rows that already have an end date are left untouched.
avg_duration = Rents['Contract Duration (days)'].mean()
estimated_end = Rents['Start Date'] + pd.to_timedelta(avg_duration, unit='d')
Rents['End Date'] = Rents['End Date'].where(Rents['End Date'].notna(), estimated_end)

# Refresh the duration so imputed rows get a value as well
contract_span = Rents['End Date'] - Rents['Start Date']
Rents['Contract Duration (days)'] = contract_span.dt.days

# Remaining missing values in the two affected columns
print(Rents[['End Date', 'Contract Duration (days)']].isna().sum())


In [None]:

# Target variable analysis: histogram with KDE, then a boxplot of the annual amount.
plt.figure(figsize=(10, 5))
sns.histplot(Rents['Annual Amount'], kde=True, bins=50)
plt.title("Distribution of Annual Amount")
plt.show()

sns.boxplot(x=Rents['Annual Amount'])
plt.title("Boxplot of Annual Amount")
plt.show()

# Correlation heatmap for numerical features
numerical_cols = ['Annual Amount', 'Contract Amount', 'Property Size (sq.m)']
correlation = Rents[numerical_cols].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

# Categorical features vs Annual Amount
plt.figure(figsize=(12, 5))
sns.boxplot(x='Property Type', y='Annual Amount', data=Rents)
plt.title("Property Type vs Annual Amount")
plt.xticks(rotation=90)
plt.show()

# Missing data visualization.
# isnull().mean() is the FRACTION of missing values per column (0-1); it is scaled
# by 100 for the plot so the bars match the "Percentage" title.
missing_data = Rents.isnull().mean().sort_values(ascending=False)
missing_pct = missing_data[missing_data > 0] * 100
if missing_pct.empty:
    # Plotting an empty Series raises, so report instead of drawing an empty chart.
    print("No missing values to plot.")
else:
    plt.figure(figsize=(10, 5))
    missing_pct.plot(kind='bar')
    plt.ylabel("Missing (%)")
    plt.title("Missing Data Percentage")
    plt.show()


In [None]:
# Filter data for one property type. The label is kept in a variable so the
# filter and the plot title cannot drift apart (the title previously said 'Unit'
# while the rows were filtered on 'Building').
focus_property_type = 'Building'
unit_data = Rents[Rents['Property Type'] == focus_property_type]

# Summary statistics of the annual amount for the selected property type
print(unit_data['Annual Amount'].describe())

# Plot distribution of 'Annual Amount' for the selected property type
sns.histplot(unit_data['Annual Amount'], kde=True, bins=50)
plt.title(f"Distribution of Annual Amount for '{focus_property_type}'")
plt.show()

# Compare with other property types (uses the full Rents frame)
plt.figure(figsize=(12, 5))
sns.boxplot(x='Property Type', y='Annual Amount', data=Rents)
plt.title("Annual Amount by Property Type")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Disabled experiment, kept inside a bare string literal so none of it executes.
# The quoted code would coerce EVERY column of Rents to numeric and then count
# +inf / -inf values per column.
'''
# Convert columns to numeric (this will coerce invalid parsing to NaN)
Rents = Rents.apply(pd.to_numeric, errors='coerce')

# After conversion, check for infinity and large values again
print("Check for infinity values:")
print((Rents == float('inf')).sum())
print((Rents == float('-inf')).sum())
'''

In [None]:

# Average prices the previous month/week (for the same kind of property)


def add_prev_period_avg(frame, date_col, value_col, group_cols, freq):
    """Return, for each row, the mean of value_col over the PREVIOUS calendar period.

    For every combination of group_cols the mean of value_col is computed per
    calendar period (freq='M' for months, 'W' for Monday-Sunday weeks). Each row
    then receives the mean of the period immediately before its own.

    The previous implementation grouped by the row's own Year/Month (or Week) and
    took shift().mean(), which is the mean of the SAME period minus its last row.
    That is not a previous-period average, and it builds a feature from the target
    values of the very rows it is attached to.

    Parameters
    ----------
    frame : DataFrame holding date_col, value_col and group_cols.
    date_col : name of a datetime64 column; rows with NaT get NaN.
    value_col : numeric column to average.
    group_cols : list of columns describing the "kind of property". Missing
        values in these columns are treated as a group of their own.
    freq : pandas period alias, e.g. 'M' or 'W'.

    Returns
    -------
    numpy array aligned positionally with frame; NaN where the group has no
    observation in the previous period.
    """
    periods = frame[date_col].dt.to_period(freq)
    has_date = periods.notna()
    if not has_date.any():
        return np.full(len(frame), np.nan)

    # Mean per (group, period), keyed by the start of the FOLLOWING period, so
    # that a plain equality join hands each row its previous-period mean.
    stats = frame.loc[has_date, group_cols].assign(
        _period_start=(periods[has_date] + 1).dt.start_time,
        _value=frame.loc[has_date, value_col],
    )
    prev_means = (
        stats.groupby(group_cols + ['_period_start'], dropna=False, observed=True)['_value']
        .mean()
        .reset_index()
    )

    # Left join keeps one output row per input row, in the original order
    # (the right-hand keys are unique after the groupby).
    lookup = frame[group_cols].assign(_period_start=periods.dt.start_time)
    merged = lookup.merge(prev_means, on=group_cols + ['_period_start'], how='left')
    return merged['_value'].to_numpy()


# Step 1: Convert dates to datetime format
Rents['Registration Date'] = pd.to_datetime(Rents['Registration Date'], errors='coerce')

# Step 2: Extract year, month, and ISO week from Registration Date
Rents['Year'] = Rents['Registration Date'].dt.year
Rents['Month'] = Rents['Registration Date'].dt.month
Rents['Week'] = Rents['Registration Date'].dt.isocalendar().week

# Step 3: Define property characteristics for grouping
property_characteristics = ['Area', 'Property Type', 'Property Sub Type', 'Usage', 'Is Free Hold?']

# Step 4: Calculate average prices for the previous month
Rents['Prev_Month_Avg_Price'] = add_prev_period_avg(
    Rents, 'Registration Date', 'Annual Amount', property_characteristics, 'M'
)

# Step 5: Calculate average prices for the previous week.
# Calendar week periods are used rather than Year + ISO week, because the ISO week
# number does not always belong to the calendar year around New Year.
Rents['Prev_Week_Avg_Price'] = add_prev_period_avg(
    Rents, 'Registration Date', 'Annual Amount', property_characteristics, 'W'
)

# Verify the new columns
print(Rents[['Prev_Month_Avg_Price', 'Prev_Week_Avg_Price']].head())


In [None]:
# Disabled step, kept inside a bare string literal so none of it executes.
# The quoted code would label missing categorical descriptors of Rents as
# 'Unknown' and drop rows without an 'Annual Amount'.
'''
# Handle missing values
Rents['Property Sub Type'].fillna('Unknown', inplace=True)
Rents['Room(s)'].fillna('Unknown', inplace=True)
Rents['Parking'].fillna('Unknown', inplace=True)
Rents['Nearest Metro'].fillna('Unknown', inplace=True)
Rents['Nearest Mall'].fillna('Unknown', inplace=True)
Rents['Nearest Landmark'].fillna('Unknown', inplace=True)
Rents['Project'].fillna('Unknown', inplace=True)

# Drop rows where the target variable 'Amount' is missing (if any)
Rents.dropna(subset=['Annual Amount'], inplace=True)
'''

In [None]:
# Model

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb

# 1. Prepare the data
# Keep only the rows that have a target value.
Rents = Rents.dropna(subset=['Annual Amount'])

# Target first, then the feature matrix: everything except the target, the
# contract identifier and the raw date columns.
# NOTE(review): 'Contract Amount' stays in X; if it is derived from the annual
# rent it may leak the target — confirm.
y = Rents['Annual Amount']
non_feature_cols = [
    'Annual Amount',
    'Ejari Contract Number',
    'Registration Date',
    'Start Date',
    'End Date',
]
X = Rents.drop(columns=non_feature_cols)

# 2. Train-Test Split (80-20), seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Catboost Model
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Replace missing values in the object-dtype (string) columns with an explicit
# 'Unknown' label, in both the training and the test split.
cat_columns = X_train.select_dtypes(include=['object']).columns
for split in (X_train, X_test):
    for col in cat_columns:
        split[col] = split[col].fillna('Unknown')

# Positional indices of the object-dtype columns, as expected by CatBoost
cat_features = [
    position
    for position, col_dtype in enumerate(X_train.dtypes)
    if col_dtype == 'object'
]

# Step 1: Define the CatBoost model
catboost_params = dict(
    iterations=1000,            # number of boosting iterations
    learning_rate=0.1,
    depth=6,                    # tree depth
    cat_features=cat_features,  # categorical feature indices
    random_seed=42,             # reproducibility
    verbose=200,                # log every 200 iterations
)
model = CatBoostRegressor(**catboost_params)

# Step 2: Train the model
model.fit(X_train, y_train, cat_features=cat_features)

# Step 3: Predict on the held-out split
y_pred = model.predict(X_test)

# Step 4: Evaluate the model (RMSE is the square root of the MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Report the metrics
print(f"MSE: {mse:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2:.4f}")  # R-squared score

In [None]:
# Feature Importance

import matplotlib.pyplot as plt

# Importance scores reported by the fitted CatBoost model, paired with the
# training column names and ordered from most to least important.
feature_importance = model.get_feature_importance()
feature_names = X_train.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is drawn first.
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for CatBoost')
ax.invert_yaxis()
plt.show()


In [None]:
# LightGBM Model

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import math
import lightgbm as lgb



# Step 1: Convert categorical columns to category dtype
categorical_columns = ['Version', 'Area', 'Is Free Hold?', 'Property Type',
                       'Property Sub Type', 'Usage', 'Nearest Metro',
                       'Nearest Mall', 'Nearest Landmark', 'Master Project', 'Project']

# Keep only the listed names that exist in the frame (a missing one would raise
# KeyError below), then add any remaining object-dtype column, since LightGBM
# rejects object columns.
categorical_columns = [col for col in categorical_columns if col in X_train.columns]
categorical_columns += [
    col for col in X_train.select_dtypes(include=['object']).columns
    if col not in categorical_columns
]

for col in categorical_columns:
    X_train[col] = X_train[col].astype('category')
    # Reuse the TRAIN categories for the test split so both splits share one
    # encoding; labels unseen in training become NaN.
    X_test[col] = X_test[col].astype(X_train[col].dtype)

# Step 2: Define the LightGBM model
lgb_model = lgb.LGBMRegressor(objective='regression',
                              num_iterations=1000,  # Number of boosting iterations
                              learning_rate=0.1,  # Learning rate
                              max_depth=6,  # Tree depth
                              random_state=42)  # Random seed for reproducibility

# Step 3: Train the model with categorical features
lgb_model.fit(X_train, y_train, categorical_feature=categorical_columns)

# Step 4: Make predictions on the test set
y_pred_lgb = lgb_model.predict(X_test)

# Step 5: Evaluate the model
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
rmse_lgb = mse_lgb ** 0.5
r2_lgb = r2_score(y_test, y_pred_lgb)  # Calculate R² Score

# Print the evaluation metrics
print(f"LightGBM MSE: {mse_lgb:.4f}")   # Mean Squared Error
print(f"LightGBM RMSE: {rmse_lgb:.4f}")  # Root Mean Squared Error
print(f"LightGBM R² Score: {r2_lgb:.4f}")  # R-squared score

# Step 6: Plot feature importance for LightGBM
lgb_feature_importance = lgb_model.feature_importances_

# Create a DataFrame to map feature names with importance scores
lgb_feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': lgb_feature_importance
})

# Ascending sort: barh draws the first row at the bottom, so the most important
# feature ends up at the top of the chart.
lgb_feature_importance_df = lgb_feature_importance_df.sort_values(by='Importance', ascending=True)

# Plot LightGBM feature importance
plt.figure(figsize=(10, 6))
plt.barh(lgb_feature_importance_df['Feature'], lgb_feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for LightGBM')
plt.show()



In [None]:
#####################################

In [None]:
# Print column names, dtypes and non-null counts of the raw transactions frame.
Transactions.info()

In [None]:
# Summary statistics for numerical columns, printed as plain text.
print(Transactions.describe())

In [None]:
# Count the distinct values in each object-dtype (string) column.
print(Transactions.select_dtypes(include=['object']).nunique())

In [None]:

# Handle missing values: label missing categorical descriptors as 'Unknown'.
# The filled columns are assigned back to the frame. The previous
# `Transactions[col].fillna(..., inplace=True)` form is chained in-place
# assignment on a temporary Series: it warns in pandas 2.x and has no effect on
# the frame under Copy-on-Write.
unknown_fill_cols = [
    'Property Sub Type',
    'Room(s)',
    'Parking',
    'Nearest Metro',
    'Nearest Mall',
    'Nearest Landmark',
    'Project',
]
Transactions[unknown_fill_cols] = Transactions[unknown_fill_cols].fillna('Unknown')

# Drop rows where the target variable 'Amount' is missing (if any)
Transactions = Transactions.dropna(subset=['Amount'])


In [None]:
# Convert 'Property Size (sq.m)', 'Amount' and 'Transaction Size (sq.m)' to numeric;
# values that cannot be parsed become NaN (errors='coerce').
for numeric_col in ('Property Size (sq.m)', 'Amount', 'Transaction Size (sq.m)'):
    Transactions[numeric_col] = pd.to_numeric(Transactions[numeric_col], errors='coerce')

# Coercion can create NEW missing targets (unparseable amounts) after the earlier
# missing-target drop, and the IQR filter keeps NaN rows, so drop them here before
# they reach the models.
Transactions = Transactions.dropna(subset=['Amount'])

In [None]:
# Boxplot of the sale amounts BEFORE any outlier handling
ax = sns.boxplot(x=Transactions['Amount'])
ax.set_title("Boxplot of Amount Before Outlier Handling")
plt.show()


In [None]:
import numpy as np

# Flag amounts outside the 1.5*IQR fences around the quartiles.
amounts = Transactions['Amount']
Q1 = amounts.quantile(0.25)
Q3 = amounts.quantile(0.75)
IQR = Q3 - Q1
low_fence = Q1 - 1.5 * IQR
high_fence = Q3 + 1.5 * IQR
outliers = amounts.lt(low_fence) | amounts.gt(high_fence)

# Keep every row that was not flagged (NaN amounts compare False and are kept)
Transactions = Transactions.loc[~outliers]


In [None]:
# Boxplot of the sale amounts once the IQR outliers have been removed
ax = sns.boxplot(x=Transactions['Amount'])
ax.set_title("Boxplot of Amount After Outlier Handling")
plt.show()

In [None]:
# List the distinct transaction types present in the filtered frame.
Transactions['Transaction Type'].unique()

In [None]:
# Filter data for 'Sales' in Transaction Type
# (the subset keeps the name unit_data, which the following cells rely on)
unit_data = Transactions[Transactions['Transaction Type'] == 'Sales']

# Summary of the 'Transaction Type' column of the filtered subset
# NOTE(review): this describes the filter column itself, not 'Amount' — confirm that is intended.
print(unit_data['Transaction Type'].describe())

In [None]:
# Show which transaction types remain in the filtered subset.
unit_data['Transaction Type'].unique()

In [None]:
# Detect outliers in 'Amount' using IQR (quartiles of the 'Sales' subset only)
Q1 = unit_data['Amount'].quantile(0.25)
Q3 = unit_data['Amount'].quantile(0.75)
IQR = Q3 - Q1
outliers = (unit_data['Amount'] < (Q1 - 1.5 * IQR)) | (unit_data['Amount'] > (Q3 + 1.5 * IQR))

# Remove outliers from the dataset.
# .copy() makes unit_data an independent frame: later cells add and overwrite
# columns on it, which on a plain slice triggers SettingWithCopyWarning.
unit_data = unit_data[~outliers].copy()


In [None]:

# Histogram (with KDE) of the amounts in the filtered 'Sales' subset
ax = sns.histplot(unit_data['Amount'], kde=True, bins=50)
ax.set_title("Distribution of Amount for 'Sales'")
plt.show()

# Boxplot of Amount per transaction type, drawn from the full Transactions frame
plt.figure(figsize=(12, 5))
ax = sns.boxplot(x='Transaction Type', y='Amount', data=Transactions)
ax.set_title("Amount by Transaction Type 'Sales'")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Coerce the size and amount columns of the subset to numeric dtypes;
# unparseable values become NaN (errors='coerce').
for numeric_col in ('Property Size (sq.m)', 'Amount', 'Transaction Size (sq.m)'):
    unit_data[numeric_col] = pd.to_numeric(unit_data[numeric_col], errors='coerce')

In [None]:
# Earlier draft of the date features, kept inside a bare string literal so none
# of it executes. It used 'Transaction ...'-prefixed column names.
'''
# Feature engineering: Extract year, month, and weekday from 'Transaction Date'
unit_data['Transaction Date'] = pd.to_datetime(unit_data['Transaction Date'], errors='coerce')
unit_data['Transaction Year'] = unit_data['Transaction Date'].dt.year
unit_data['Transaction Month'] = unit_data['Transaction Date'].dt.month
unit_data['Transaction Day'] = unit_data['Transaction Date'].dt.day
unit_data['Transaction Weekday'] = unit_data['Transaction Date'].dt.weekday

# Create a feature 'Size Ratio' (Property Size / Transaction Size)
unit_data['Size Ratio'] = unit_data['Property Size (sq.m)'] / unit_data['Transaction Size (sq.m)']
'''

In [None]:
# Feature engineering: calendar parts of 'Transaction Date' (NaT -> NaN).
unit_data['Transaction Date'] = pd.to_datetime(unit_data['Transaction Date'], errors='coerce')
txn_date = unit_data['Transaction Date']
unit_data['Year'] = txn_date.dt.year
unit_data['Month'] = txn_date.dt.month
unit_data['Day'] = txn_date.dt.day
# 'Week' is the ISO week number, matching how 'Week' is built for Rents. It was
# previously filled with dt.weekday (0-6), so every "weekly" grouping was really
# a day-of-week grouping. Cast to float so missing dates are plain NaN.
unit_data['Week'] = txn_date.dt.isocalendar().week.astype('float64')
# The day of week (Monday=0) is kept under its own name so that signal is not lost.
unit_data['Weekday'] = txn_date.dt.weekday

# Create a feature 'Size Ratio' (Property Size / Transaction Size).
# A zero transaction size would give +/-inf; those are stored as NaN instead.
unit_data['Size Ratio'] = (
    unit_data['Property Size (sq.m)'] / unit_data['Transaction Size (sq.m)']
).replace([np.inf, -np.inf], np.nan)


In [None]:
# Average prices the previous month/week (for the same kind of property)


def add_prev_period_avg(frame, date_col, value_col, group_cols, freq):
    """Return, for each row, the mean of value_col over the PREVIOUS calendar period.

    For every combination of group_cols the mean of value_col is computed per
    calendar period (freq='M' for months, 'W' for Monday-Sunday weeks). Each row
    then receives the mean of the period immediately before its own.

    The previous implementation grouped by the row's own Year/Month (or Week) and
    took shift().mean(), which is the mean of the SAME period minus its last row.
    That is not a previous-period average, and it builds a feature from the target
    values of the very rows it is attached to.

    Parameters
    ----------
    frame : DataFrame holding date_col, value_col and group_cols.
    date_col : name of a datetime64 column; rows with NaT get NaN.
    value_col : numeric column to average.
    group_cols : list of columns describing the "kind of property". Missing
        values in these columns are treated as a group of their own.
    freq : pandas period alias, e.g. 'M' or 'W'.

    Returns
    -------
    numpy array aligned positionally with frame; NaN where the group has no
    observation in the previous period.
    """
    periods = frame[date_col].dt.to_period(freq)
    has_date = periods.notna()
    if not has_date.any():
        return np.full(len(frame), np.nan)

    # Mean per (group, period), keyed by the start of the FOLLOWING period, so
    # that a plain equality join hands each row its previous-period mean.
    stats = frame.loc[has_date, group_cols].assign(
        _period_start=(periods[has_date] + 1).dt.start_time,
        _value=frame.loc[has_date, value_col],
    )
    prev_means = (
        stats.groupby(group_cols + ['_period_start'], dropna=False, observed=True)['_value']
        .mean()
        .reset_index()
    )

    # Left join keeps one output row per input row, in the original order
    # (the right-hand keys are unique after the groupby).
    lookup = frame[group_cols].assign(_period_start=periods.dt.start_time)
    merged = lookup.merge(prev_means, on=group_cols + ['_period_start'], how='left')
    return merged['_value'].to_numpy()


# Step 3: Define property characteristics for grouping
property_characteristics = ['Area', 'Property Type', 'Property Sub Type', 'Usage', 'Is Free Hold?']

# Step 4: Calculate average prices for the previous month.
# Periods are derived from 'Transaction Date' directly, so this does not depend
# on how the 'Year'/'Month'/'Week' helper columns were built.
unit_data['Prev_Month_Avg_Price'] = add_prev_period_avg(
    unit_data, 'Transaction Date', 'Amount', property_characteristics, 'M'
)

# Step 5: Calculate average prices for the previous week
unit_data['Prev_Week_Avg_Price'] = add_prev_period_avg(
    unit_data, 'Transaction Date', 'Amount', property_characteristics, 'W'
)

# Verify the new columns
print(unit_data[['Prev_Month_Avg_Price', 'Prev_Week_Avg_Price']].head())

In [None]:
# Model

In [None]:
from sklearn.model_selection import train_test_split

# Target is the sale amount; the features are every other column except the
# transaction identifier and the raw date.
y = unit_data['Amount']
X = unit_data.drop(columns=['Amount', 'Transaction Number', 'Transaction Date'])

# 80/20 split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four parts, on one line
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import math

# Identify categorical columns
categorical_cols = [
    'Transaction Type', 'Transaction sub type', 'Registration type',
    'Is Free Hold?', 'Usage', 'Area', 'Property Type',
    'Property Sub Type', 'Room(s)', 'Parking', 'Nearest Metro',
    'Nearest Mall', 'Nearest Landmark', 'Master Project', 'Project'
]

# Only the names that exist in the frame are used, both for the dtype conversion
# AND for fit(): passing an unknown name in categorical_feature raises. Any other
# object-dtype column is added too, since LightGBM rejects object columns.
present_categorical_cols = [col for col in categorical_cols if col in X_train.columns]
present_categorical_cols += [
    col for col in X_train.select_dtypes(include=['object']).columns
    if col not in present_categorical_cols
]

for col in present_categorical_cols:
    X_train[col] = X_train[col].astype('category')
    # Reuse the TRAIN categories for the test split so both splits share one
    # encoding; labels unseen in training become NaN.
    X_test[col] = X_test[col].astype(X_train[col].dtype)

# Train LightGBM model
lgb_model = LGBMRegressor(objective='regression',
                              num_iterations=1000,  # Number of boosting iterations
                              learning_rate=0.1,  # Learning rate
                              max_depth=6,  # Tree depth
                              random_state=42)

lgb_model.fit(X_train, y_train, categorical_feature=present_categorical_cols)

# Predict on test data
lgb_preds = lgb_model.predict(X_test)

# Step 5: Evaluate the model
mse_lgb = mean_squared_error(y_test, lgb_preds)
rmse_lgb = mse_lgb ** 0.5
r2_lgb = r2_score(y_test, lgb_preds)  # Calculate R² Score

# Print the evaluation metrics
print(f"LightGBM MSE: {mse_lgb:.4f}")   # Mean Squared Error
print(f"LightGBM RMSE: {rmse_lgb:.4f}")  # Root Mean Squared Error
print(f"LightGBM R² Score: {r2_lgb:.4f}")  # R-squared score


In [None]:
# Plot LightGBM feature importance
import matplotlib.pyplot as plt

# Importances of the fitted LightGBM model, ordered ascending so that barh
# (which draws the first entry at the bottom) puts the largest bar on top.
lgb_feature_importance = lgb_model.feature_importances_
sorted_idx = lgb_feature_importance.argsort()
ranked_names = X_train.columns[sorted_idx]
ranked_scores = lgb_feature_importance[sorted_idx]

fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
ax.barh(ranked_names, ranked_scores)
ax.set_xlabel('Feature Importance')
ax.set_title('LightGBM Feature Importance')
plt.show()


In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import math

# Names of the columns to be handled as categorical features by CatBoost
# (same list as used for the LightGBM model).
categorical_cols = [
    'Transaction Type',
    'Transaction sub type',
    'Registration type',
    'Is Free Hold?',
    'Usage',
    'Area',
    'Property Type',
    'Property Sub Type',
    'Room(s)',
    'Parking',
    'Nearest Metro',
    'Nearest Mall',
    'Nearest Landmark',
    'Master Project',
    'Project',
]

In [None]:
def _with_missing_category(values):
    """Return values as a categorical Series with NaN replaced by 'missing'.

    The 'missing' category is added only when it is not already present, so the
    cell can be re-run: calling add_categories('missing') a second time raises.
    """
    as_category = values.astype('category')
    if 'missing' not in as_category.cat.categories:
        as_category = as_category.cat.add_categories('missing')
    return as_category.fillna('missing')


# Fill NaN values in categorical columns with a placeholder string 'missing'.
# Only columns that exist in the frame are touched; the result is already
# category dtype, so no second conversion pass is needed.
# NOTE(review): CatBoost expects categorical values to be strings or integers —
# confirm none of these columns holds float labels.
present_categorical_cols = [col for col in categorical_cols if col in X_train.columns]
for col in present_categorical_cols:
    for split in (X_train, X_test):
        split[col] = _with_missing_category(split[col])

# Categorical features: Get the indices of the categorical columns in X_train
cat_features = [i for i, col in enumerate(X_train.columns) if X_train[col].dtype.name == 'category']

# Train CatBoost model
cat_model = CatBoostRegressor(iterations=1000,  # Number of boosting iterations
                              learning_rate=0.1,  # Learning rate
                              depth=6,  # Tree depth
                              cat_features=cat_features,  # List of categorical feature indices
                              random_seed=42,  # Random seed for reproducibility
                              verbose=200)  # Print progress every 200 iterations

cat_model.fit(X_train, y_train)

# Predict on test data
cat_preds = cat_model.predict(X_test)

# Step 4: Evaluate the model
mse = mean_squared_error(y_test, cat_preds)
rmse = mse ** 0.5
r2 = r2_score(y_test, cat_preds)  # Calculate R² Score

# Print the evaluation metrics
print(f"MSE: {mse:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2:.4f}")  # R-squared score


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Importances of the fitted CatBoost model, ranked from highest to lowest.
cat_feature_importance = cat_model.get_feature_importance()
sorted_idx = np.argsort(cat_feature_importance)[::-1]

# Column names and scores in ranked order
sorted_features = X_train.columns[sorted_idx]
sorted_importance = cat_feature_importance[sorted_idx]

# Horizontal bars; the y-axis is inverted so the top-ranked feature is drawn first.
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
ax.barh(sorted_features, sorted_importance)
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for CatBoost')
ax.invert_yaxis()
plt.show()
