In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the two raw extracts. Both files are semicolon-delimited and are read
# with low_memory=False, so the same reader options are shared between them.
_csv_options = {'delimiter': ';', 'low_memory': False}
Rents = pd.read_csv('Rents & Transactions/rents.csv', **_csv_options)
Transactions = pd.read_csv('Rents & Transactions/transactions.csv', **_csv_options)

In [None]:
Rents.info()

In [None]:
Transactions.info()

In [None]:
Rents['Property Size (sq.m)'] = pd.to_numeric(Rents['Property Size (sq.m)'], errors='coerce')

In [None]:
Rents['Parking'] = pd.to_numeric(Rents['Parking'], errors='coerce')

In [None]:
Transactions['Property Size (sq.m)'] = pd.to_numeric(Transactions['Property Size (sq.m)'], errors='coerce')

In [None]:
Transactions['Parking'] = pd.to_numeric(Transactions['Parking'], errors='coerce')

In [None]:
# Columns that exist in both Rents and Transactions and are used together as the join key.
# NOTE(review): the key includes 'Property Size (sq.m)' and 'Parking', which earlier cells
# coerce to numeric (NaN on failure) — confirm that joining on those columns is intended.
merge_keys = [
    'Property ID', 'Is Free Hold?', 'Nearest Metro', 
    'Nearest Mall', 'Nearest Landmark', 
    'Usage', 'Area', 'Property Type', 'Property Sub Type', 'Property Size (sq.m)', 'Parking', 'Master Project', 'Project'
]

# Join the two frames on all key columns at once
merged_data = pd.merge(
    Rents, Transactions,
    on=merge_keys,  # Rows match only when every key column agrees
    how='outer',     # Outer join: unmatched rows from BOTH Rents and Transactions are kept
    suffixes=('', '_Transactions')  # Overlapping non-key columns: Rents keeps its name, Transactions gets the suffix
)

# Inspect the schema of the merged frame (info() prints itself and returns None,
# so the surrounding print() also emits a trailing "None")
print("Merged Data Information:")
print(merged_data.info())

# Show the first rows of the merged frame
print("\nPreview of Merged Data:")
print(merged_data.head())


In [None]:
# The amount columns may have been read as text; coerce each one to a numeric dtype.
# Anything that cannot be parsed as a number becomes NaN (errors='coerce').
for amount_col in ('Annual Amount', 'Contract Amount', 'Amount'):
    merged_data[amount_col] = pd.to_numeric(merged_data[amount_col], errors='coerce')


In [None]:
# Average price in the previous calendar month / week for the same kind of property.
#
# The earlier version grouped by the row's OWN (Year, Month) / (Year, Week) and applied
# `x.shift().mean()`, which yields the mean of the *same* period minus the group's last
# row rather than the mean of the preceding period. It also paired the calendar year
# with the ISO week number, which mis-groups days around New Year. The lookup below
# works on true calendar periods instead.

# Step 1: Convert dates to datetime format (unparseable values become NaT)
merged_data['Registration Date'] = pd.to_datetime(merged_data['Registration Date'], errors='coerce')

# Step 2: Extract year, month, and ISO week number from Registration Date.
# These columns are kept because later cells merge on 'Year' / 'Month'.
merged_data['Year'] = merged_data['Registration Date'].dt.year
merged_data['Month'] = merged_data['Registration Date'].dt.month
merged_data['Week'] = merged_data['Registration Date'].dt.isocalendar().week

# Step 3: Define property characteristics for grouping
property_characteristics = ['Area', 'Property Type', 'Property Sub Type', 'Usage', 'Is Free Hold?']


def _previous_period_average(frame, group_cols, date_col, value_col, freq):
    """Return, for every row, the mean of `value_col` over the preceding period.

    Parameters
    ----------
    frame : pd.DataFrame
        Source data; it is not modified.
    group_cols : list of str
        Columns that define "the same kind of property".
    date_col : str
        Datetime column used to assign each row to a period.
    value_col : str
        Numeric column to average.
    freq : str
        Period frequency passed to `Series.dt.to_period` ('M' for calendar
        months, 'W' for Monday-to-Sunday weeks).

    Returns
    -------
    pd.Series
        Indexed like `frame`. NaN where the row has a missing date, a missing
        grouping value, or no observations for its group in the period
        immediately before its own.
    """
    period = frame[date_col].dt.to_period(freq)
    keyed = frame[group_cols].assign(
        _period_start=period.dt.start_time,
        _value=frame[value_col],
    )

    # Mean per (property kind, period). groupby drops rows with missing keys.
    period_means = (
        keyed.groupby(group_cols + ['_period_start'])['_value']
        .mean()
        .rename('_prev_mean')
        .reset_index()
    )

    # Re-label every period's mean with the start of the FOLLOWING period, so that
    # joining on a row's own period retrieves the mean of the period before it.
    period_means['_period_start'] = (
        period_means['_period_start'].dt.to_period(freq) + 1
    ).dt.start_time

    # Right-hand keys are unique, so the left join keeps row count and row order.
    looked_up = keyed.merge(period_means, on=group_cols + ['_period_start'], how='left')
    return pd.Series(looked_up['_prev_mean'].to_numpy(), index=frame.index)


# Step 4: Calculate average prices for the previous month
merged_data['Prev_Month_Avg_Price'] = _previous_period_average(
    merged_data, property_characteristics, 'Registration Date', 'Amount', 'M'
)

# Step 5: Calculate average prices for the previous week
merged_data['Prev_Week_Avg_Price'] = _previous_period_average(
    merged_data, property_characteristics, 'Registration Date', 'Amount', 'W'
)

# Verify the new columns
print(merged_data[['Prev_Month_Avg_Price', 'Prev_Week_Avg_Price']].head())

In [None]:
merged_data['Prev_Month_Avg_Price'].unique()

In [None]:
merged_data['Prev_Week_Avg_Price'].unique()

In [None]:
# Parse the remaining date columns; values that cannot be parsed become NaT.
for date_col in ('Start Date', 'End Date', 'Transaction Date'):
    merged_data[date_col] = pd.to_datetime(merged_data[date_col], errors='coerce')


In [None]:
merged_data['Registration Date'].unique()

In [None]:
merged_data.info()

In [None]:
merged_data.head()

In [None]:
merged_data['Room(s)'].unique()

In [None]:
'''
# Sample 10% of the data
sampled_data = merged_data.sample(frac=0.1, random_state=42)

# Display info and preview of sampled data
print(sampled_data.info())
print(sampled_data.head())
'''

In [None]:
# Model

In [None]:
# Libraries used for splitting and scoring
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 1. Prepare the data
# Rows without a target value cannot be used, so they are removed from merged_data itself.
merged_data = merged_data.dropna(subset=['Annual Amount'])

# The target, an identifier column and the raw date columns are excluded from the features.
non_feature_columns = [
    'Annual Amount',
    'Ejari Contract Number',
    'Registration Date',
    'Start Date',
    'End Date',
    'Transaction Date',
]
y = merged_data['Annual Amount']
X = merged_data.drop(columns=non_feature_columns)

# 2. Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
'''
# Identify categorical columns
categorical_cols = [
    'Transaction Type', 'Transaction sub type', 'Registration type', 
    'Is Free Hold?', 'Usage', 'Area', 'Property Type', 
    'Property Sub Type', 'Room(s)', 'Parking', 'Nearest Metro', 
    'Nearest Mall', 'Nearest Landmark', 'Master Project', 'Project'
]

# Ensure the categorical columns are treated as category dtype
for col in categorical_cols:
    if col in X_train.columns:  # Check if column exists in the data
        X_train[col] = X_train[col].astype('category')
        X_test[col] = X_test[col].astype('category')

# Categorical features: Get the indices of the categorical columns in X_train
cat_features = [i for i, col in enumerate(X_train.columns) if X_train[col].dtype.name == 'category']
'''

In [None]:
# CatBoost model
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Object-dtype columns are treated as categorical; their missing values are
# replaced by the literal label 'Unknown' in both the train and the test frame.
cat_columns = X_train.select_dtypes(include=['object']).columns
for col in cat_columns:
    for frame in (X_train, X_test):
        frame[col] = frame[col].fillna('Unknown')

# Positional indices of the object-dtype columns, as passed to CatBoost
cat_features = [
    position
    for position, column_dtype in enumerate(X_train.dtypes)
    if column_dtype == 'object'
]

# Step 1: Configure the regressor
catboost_params = {
    'iterations': 200,             # number of boosting iterations
    'learning_rate': 0.1,
    'depth': 6,                    # tree depth
    'cat_features': cat_features,  # categorical feature indices
    'random_seed': 42,             # reproducibility
    'verbose': 200,                # progress log every 200 iterations
}
model = CatBoostRegressor(**catboost_params)

# Step 2: Fit on the training split
model.fit(X_train, y_train, cat_features=cat_features)

# Step 3: Predict the held-out rows
y_pred = model.predict(X_test)

# Step 4: Score the predictions
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Report the metrics
print(f"MSE: {mse:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2:.4f}")  # R-squared score

In [None]:
# Feature importance of the fitted CatBoost model

import matplotlib.pyplot as plt

# Importance scores reported by the model, paired with the training column names
feature_importance = model.get_feature_importance()
feature_names = X_train.columns

# One row per feature, most important first
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is drawn first
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance of Annual Amount for CatBoost (Rents and Transactions Data)')
plt.gca().invert_yaxis()
plt.show()


In [None]:
# LightGBM model
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Step 1: Columns handed to LightGBM as categorical features.
# Each one is cast to pandas 'category' dtype in both the train and the test frame.
categorical_columns = [
    'Version', 'Area', 'Is Free Hold?', 'Property Type',
    'Property Sub Type', 'Usage', 'Nearest Metro',
    'Nearest Mall', 'Nearest Landmark', 'Master Project', 'Project',
    'Transaction Number', 'Transaction Type', 'Transaction sub type',
    'Registration type', 'Room(s)',
]

for col in categorical_columns:
    for frame in (X_train, X_test):
        frame[col] = frame[col].astype('category')

# Step 2: Configure the regressor
lgb_params = {
    'objective': 'regression',
    'num_iterations': 200,   # number of boosting iterations
    'learning_rate': 0.1,
    'max_depth': 6,          # tree depth
    'verbose': -1,
    'random_state': 42,      # reproducibility
}
lgb_model = LGBMRegressor(**lgb_params)

# Step 3: Fit, naming the categorical columns explicitly
lgb_model.fit(X_train, y_train, categorical_feature=categorical_columns)

# Step 4: Predict the held-out rows
y_pred_lgb = lgb_model.predict(X_test)

# Step 5: Score the predictions
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
rmse_lgb = mse_lgb ** 0.5
r2_lgb = r2_score(y_test, y_pred_lgb)

# Report the metrics
print(f"MSE: {mse_lgb:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse_lgb:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2_lgb:.4f}")  # R-squared score


# Step 6: Feature importance of the fitted LightGBM model
lgb_feature_importance = lgb_model.feature_importances_

# One row per feature, sorted ascending so the largest bar is drawn last by barh
lgb_feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': lgb_feature_importance}
).sort_values(by='Importance', ascending=True)

# Horizontal bar chart of the importances
plt.figure(figsize=(10, 6))
plt.barh(lgb_feature_importance_df['Feature'], lgb_feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance of Annual Amount for LightGBM (Rents and Transactions)')
plt.show()


In [None]:
'''
# Step 1: Check all categorical columns in X_train
cat_columns = X_train.select_dtypes(include=['object', 'category']).columns

# Ensure that 'Version' and any other categorical column are included in cat_features
cat_features = [i for i, col in enumerate(X_train.columns) if col in cat_columns]

# Handle NaN in categorical features
for col in cat_columns:
    # Convert column to 'category' type if it's not already
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

    # Check if 'Unknown' is already a category, and add it if not
    if 'Unknown' not in X_train[col].cat.categories:
        X_train[col] = X_train[col].cat.add_categories('Unknown')
    if 'Unknown' not in X_test[col].cat.categories:
        X_test[col] = X_test[col].cat.add_categories('Unknown')

    # Fill NaN values with 'Unknown'
    X_train[col] = X_train[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')
'''

In [None]:
###################################################

In [None]:
# Adding supplementary datasets

In [None]:
# Load data
Consumer_Price_Index_Annually = pd.read_csv('Consumer Price Index/Consumer_Price_Index_Annually.csv')

In [None]:
# Keep the three CPI columns of interest, give them working names, and average
# any repeated (Year, MEASURE) pairs so the pivot below has unique keys.
cpi_data = (
    Consumer_Price_Index_Annually[['MEASURE', 'TIME_PERIOD', 'OBS_VALUE']]
    .rename(columns={'TIME_PERIOD': 'Year', 'OBS_VALUE': 'CPI_Value'})
    .groupby(['Year', 'MEASURE'], as_index=False)
    .mean()
)

# Reshape to one row per Year with one column per MEASURE
cpi_data_pivot = cpi_data.pivot(index='Year', columns='MEASURE', values='CPI_Value').reset_index()

In [None]:
cpi_data_pivot.head()

In [None]:
cpi_data_pivot.info()

In [None]:
# Load data
Consumer_Price_Index_Monthly = pd.read_csv('Consumer Price Index/Consumer_Price_Index_Monthly.csv')

In [None]:
# Step 1: Build the monthly CPI working table.
# TIME_PERIOD is parsed once and reused for both the Year and the Month column;
# OBS_VALUE is renamed to CPI_Value and coerced to numeric (NaN on failure).
cpi_period = pd.to_datetime(Consumer_Price_Index_Monthly['TIME_PERIOD'])
monthly_cpi_data = (
    Consumer_Price_Index_Monthly[['MEASURE', 'TIME_PERIOD', 'OBS_VALUE']]
    .rename(columns={'OBS_VALUE': 'CPI_Value'})
    .assign(
        Year=cpi_period.dt.year,
        Month=cpi_period.dt.month,
        CPI_Value=lambda frame: pd.to_numeric(frame['CPI_Value'], errors='coerce'),
    )
    # Step 2: Average repeated (Year, Month, MEASURE) entries; only CPI_Value is aggregated
    .groupby(['Year', 'Month', 'MEASURE'], as_index=False)['CPI_Value']
    .mean()
)

# Step 3: Reshape to one row per (Year, Month) with one column per MEASURE
monthly_cpi_pivot = monthly_cpi_data.pivot(
    index=['Year', 'Month'],
    columns='MEASURE',
    values='CPI_Value',
).reset_index()

# Inspect the result
print(monthly_cpi_pivot.info())
print(monthly_cpi_pivot.head())


In [None]:
# Merge
merged_data2 = merged_data.merge(monthly_cpi_pivot, on=["Year", "Month"], how="left")

In [None]:
merged_data2.info()

In [None]:
# Load data
Consumer_Price_Index_Quarterly = pd.read_csv('Consumer Price Index/Consumer_Price_Index_Quarterly.csv')

In [None]:
Consumer_Price_Index_Quarterly.head(10)

In [None]:
# Step 1: Build the quarterly CPI working table.
# Year is the first four characters of TIME_PERIOD (as int) and Quarter is its
# last two characters; OBS_VALUE is renamed to CPI_Value and coerced to numeric.
quarter_labels = Consumer_Price_Index_Quarterly['TIME_PERIOD']
quarterly_cpi_data = (
    Consumer_Price_Index_Quarterly[['MEASURE', 'TIME_PERIOD', 'OBS_VALUE']]
    .rename(columns={'OBS_VALUE': 'CPI_Value'})
    .assign(
        Year=quarter_labels.str.slice(stop=4).astype(int),
        Quarter=quarter_labels.str.slice(start=-2),
        CPI_Value=lambda frame: pd.to_numeric(frame['CPI_Value'], errors='coerce'),
    )
    # Step 2: Average repeated (Year, Quarter, MEASURE) entries
    .groupby(['Year', 'Quarter', 'MEASURE'], as_index=False)['CPI_Value']
    .mean()
)

# Step 3: Reshape to one row per (Year, Quarter) with one column per MEASURE
quarterly_cpi_pivot = quarterly_cpi_data.pivot(
    index=['Year', 'Quarter'],
    columns='MEASURE',
    values='CPI_Value',
).reset_index()

# Inspect the result
print(quarterly_cpi_pivot.info())
print(quarterly_cpi_pivot.head())


In [None]:
# Load data
Currency_Strength = pd.read_csv('Currency Strength/AED-USD.csv')

In [None]:
# Steps 1-3: Take the date, close and return columns, parse the date
# (unparseable values become NaT) and derive Year and Month from it.
currency_data = Currency_Strength[['Date', 'Close', 'Return']].assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'),
    Year=lambda frame: frame['Date'].dt.year,
    Month=lambda frame: frame['Date'].dt.month,
)

# Steps 4-5: Collapse to one row per (Year, Month), averaging the close value
# and the return, and name the results Average_Close / Average_Return.
currency_data_aggregated = currency_data.groupby(['Year', 'Month'], as_index=False).agg(
    Average_Close=('Close', 'mean'),
    Average_Return=('Return', 'mean'),
)

# Step 6: Inspect the result
print(currency_data_aggregated.info())
print(currency_data_aggregated.head())


In [None]:
# Prefix the aggregated currency columns with the currency pair they describe
aedusd_column_names = {
    "Average_Close": "AEDUSD_Average_Close",
    "Average_Return": "AEDUSD_Average_Return",
}
currency_data_aggregated = currency_data_aggregated.rename(columns=aedusd_column_names)


In [None]:
# Merge
merged_data3 = merged_data2.merge(currency_data_aggregated, on=["Year", "Month"], how="left")


In [None]:
merged_data3.info()

In [None]:
# Load data
GDP_Quarterly_Constant_Prices = pd.read_csv('Gross Domestic Product/GDP_Quarterly_Constant_Prices.csv')

In [None]:
GDP_Quarterly_Constant_Prices.head(30)

In [None]:
GDP_Quarterly_Constant_Prices['MEASURE'].unique()

In [None]:
GDP_Quarterly_Constant_Prices['TIME_PERIOD'].unique()

In [None]:
# Steps 1-3: Take the period, quarter, measure and value columns and give them
# working names.
# NOTE(review): TIME_PERIOD is used as the year as-is — confirm it holds a bare year.
gdp_quarterly_data = GDP_Quarterly_Constant_Prices[
    ['TIME_PERIOD', 'QUARTER', 'MEASURE', 'OBS_VALUE']
].rename(columns={'TIME_PERIOD': 'Year', 'QUARTER': 'Quarter', 'OBS_VALUE': 'GDP_Value'})

# Step 4: Average repeated (Year, Quarter, MEASURE) entries so the pivot keys are unique
gdp_quarterly_data = gdp_quarterly_data.groupby(
    ['Year', 'Quarter', 'MEASURE'], as_index=False
)['GDP_Value'].mean()

# Step 5: Reshape to one row per (Year, Quarter) with one column per MEASURE
gdp_quarterly_pivot = gdp_quarterly_data.pivot(
    index=['Year', 'Quarter'],
    columns='MEASURE',
    values='GDP_Value'
).reset_index()

# Step 6: Missing values are deliberately left as NaN.
# This table is averaged to yearly figures in a later cell; filling gaps with 0
# would count an unpublished quarter as zero GDP and pull the yearly mean down,
# whereas NaN is simply skipped by .mean().

# Step 7: Inspect the result (info() prints itself, so it is not wrapped in print)
gdp_quarterly_pivot.info()
print(gdp_quarterly_pivot.head(20))


In [None]:
'''
ACC – Accommodation & Hospitality (Hotels, Restaurants, Tourism)
ACT – Activities (General services, business activities, or arts and entertainment)
AGR – Agriculture, Forestry & Fishing
ART – Arts, Culture & Recreation
CON – Construction
EDU – Education
ELE – Electricity, Gas & Water Supply (Energy sector)
FIN – Financial Services (Banking, Insurance, Investments)
HUM – Human Health & Social Work Activities
INF – Information & Communication (Telecom, IT services, Media)
MAN – Manufacturing
MIN – Mining & Quarrying
NFC – Non-Financial Corporations (Could be general businesses excluding financial institutions)
PRO – Professional, Scientific & Technical Activities
PUB – Public Administration & Defense (Government services)
REA – Real Estate Activities
TOT_GDP – Total GDP (Overall economic output)
TOT_NO – Total Number (Could be employment figures or total enterprises)
TRA – Transportation & Storage
WHO – Wholesale & Retail Trade
'''

In [None]:
# Collapse the quarterly table to one row per Year: drop the Quarter label,
# then take the mean of every measure within each year.
gdp_without_quarter = gdp_quarterly_pivot.drop(columns=["Quarter"])
yearly_gdp_data = gdp_without_quarter.groupby("Year", as_index=False).mean()

yearly_gdp_data.info()

In [None]:
# Merge
merged_data4 = merged_data3.merge(yearly_gdp_data, on="Year", how="left")

In [None]:
# Load data
GDP_Quarterly_Current_Prices = pd.read_csv('Gross Domestic Product/GDP_Quarterly_Current_Prices.csv')

In [None]:
# Steps 1-3: Take the period, quarter, measure and value columns and give them
# working names.
# NOTE(review): TIME_PERIOD is used as the year as-is — confirm it holds a bare year.
gdp_quarterly_current_data = GDP_Quarterly_Current_Prices[
    ['TIME_PERIOD', 'QUARTER', 'MEASURE', 'OBS_VALUE']
].rename(columns={'TIME_PERIOD': 'Year', 'QUARTER': 'Quarter', 'OBS_VALUE': 'GDP_Value'})

# Step 4: Average repeated (Year, Quarter, MEASURE) entries so the pivot keys are unique
gdp_quarterly_current_data = gdp_quarterly_current_data.groupby(
    ['Year', 'Quarter', 'MEASURE'], as_index=False
)['GDP_Value'].mean()

# Step 5: Reshape to one row per (Year, Quarter) with one column per MEASURE
gdp_quarterly_current_pivot = gdp_quarterly_current_data.pivot(
    index=['Year', 'Quarter'],
    columns='MEASURE',
    values='GDP_Value'
).reset_index()

# Step 6: Missing values are deliberately left as NaN.
# This table is averaged to yearly figures in a later cell; filling gaps with 0
# would count an unpublished quarter as zero GDP and pull the yearly mean down,
# whereas NaN is simply skipped by .mean().

# Step 7: Inspect the result (info() prints itself, so it is not wrapped in print)
gdp_quarterly_current_pivot.info()
print(gdp_quarterly_current_pivot.head())


In [None]:
# Collapse the quarterly current-price table to one row per Year: drop the
# Quarter label, then take the mean of every measure within each year.
current_without_quarter = gdp_quarterly_current_pivot.drop(columns=["Quarter"])
gdp_yearly_current = current_without_quarter.groupby("Year", as_index=False).mean()

gdp_yearly_current.info()

In [None]:
# Append "_current" to every column except the Year key
gdp_yearly_current = gdp_yearly_current.rename(
    columns=lambda name: name if name == "Year" else name + "_current"
)
gdp_yearly_current.info()

In [None]:
# Merge
merged_data5 = merged_data4.merge(gdp_yearly_current, on="Year", how="left")


In [None]:
merged_data5.info()

In [None]:
# Load data
Population_Gender = pd.read_csv('Population/Population_Estimates_and_Growth_by_Gender.csv')

In [None]:
Population_Gender.head(10)

In [None]:
Population_Gender['GENDER'].unique()

In [None]:
# Steps 1-2: Take the period, gender and value columns and give them working names
population_gender_data = Population_Gender[['TIME_PERIOD', 'GENDER', 'OBS_VALUE']].rename(
    columns={'TIME_PERIOD': 'Year', 'OBS_VALUE': 'Population'}
)

# Steps 3-4: Reshape to one row per Year with one column per GENDER value,
# then replace any missing cell with 0.
population_gender_pivot = (
    population_gender_data
    .pivot(index='Year', columns='GENDER', values='Population')
    .reset_index()
    .fillna(0)
)

# Step 5: Inspect the result
print(population_gender_pivot.info())
print(population_gender_pivot.head())


In [None]:
# Give the gender code columns descriptive names
population_column_names = {
    "F": "Population_F",
    "M": "Population_M",
    "_T": "Population_Total",
}
population_gender_pivot = population_gender_pivot.rename(columns=population_column_names)


In [None]:
# Merge
merged_data6 = merged_data5.merge(population_gender_pivot, on="Year", how="left")


In [None]:
merged_data6.info()

In [None]:
# Load data
Population_Indicators = pd.read_csv('Population/Population_Indicators.csv')

In [None]:
# Steps 1-3: Take the period, measure and value columns, give them working
# names, and average repeated (Year, MEASURE) pairs so the pivot keys are unique.
population_indicators_data = (
    Population_Indicators[['TIME_PERIOD', 'MEASURE', 'OBS_VALUE']]
    .rename(columns={'TIME_PERIOD': 'Year', 'OBS_VALUE': 'Value'})
    .groupby(['Year', 'MEASURE'], as_index=False)
    .mean()
)

# Steps 4-5: Reshape to one row per Year with one column per MEASURE,
# then replace any missing cell with 0.
population_indicators_pivot = (
    population_indicators_data
    .pivot(index='Year', columns='MEASURE', values='Value')
    .reset_index()
    .fillna(0)
)

# Step 6: Inspect the result
print(population_indicators_pivot.info())
print(population_indicators_pivot.head())


In [None]:
# Load data
Guests_by_Hotel_Type_by_Region = pd.read_csv('Tourism/Guests_by_Hotel_Type_by_Region.csv')

In [None]:
Guests_by_Hotel_Type_by_Region.head()

In [None]:
Guests_by_Hotel_Type_by_Region['TIME_PERIOD'].unique()

In [None]:
Guests_by_Hotel_Type_by_Region['H_TYPE'].unique()

In [None]:
Guests_by_Hotel_Type_by_Region['GUEST_REGION'].unique()

In [None]:
# Step 1: Keep only the period, guest-region and value columns
guests_data = Guests_by_Hotel_Type_by_Region[['TIME_PERIOD', 'GUEST_REGION', 'OBS_VALUE']]

# Steps 2-4: Sum OBS_VALUE per (TIME_PERIOD, GUEST_REGION), spread the regions
# into columns, turn TIME_PERIOD back into a regular column and call it Year.
guests_pivot = (
    guests_data
    .pivot_table(index='TIME_PERIOD', columns='GUEST_REGION', values='OBS_VALUE', aggfunc='sum')
    .reset_index()
    .rename(columns={'TIME_PERIOD': 'Year'})
)

# Result: one row per Year, one column per guest region


In [None]:
guests_pivot.head(20)

In [None]:
# Step 5: Prefix every region column with 'GuestNumber_'.
# The first column is taken to be Year and is set to that name unchanged.
# Re-running this cell would add the prefix a second time.
guests_pivot.columns = [
    'Year',
    *('GuestNumber_' + region for region in guests_pivot.columns[1:]),
]


In [None]:
guests_pivot.head()

In [None]:
guests_pivot.info()

In [None]:
# Merge
merged_data7 = merged_data6.merge(guests_pivot, on="Year", how="left")

In [None]:
merged_data7.info()

In [None]:
# Load data
Hotel_Establishments_and_Rooms_by_Rating_Type = pd.read_csv('Tourism/Hotel_Establishments_and_Rooms_by_Rating_Type.csv')

In [None]:
Hotel_Establishments_and_Rooms_by_Rating_Type.head()

In [None]:
Hotel_Establishments_and_Rooms_by_Rating_Type['MEASURE'].unique()

In [None]:
Hotel_Establishments_and_Rooms_by_Rating_Type['REF_AREA'].unique()

In [None]:
Hotel_Establishments_and_Rooms_by_Rating_Type['H_TYPE'].unique()

In [None]:
Hotel_Establishments_and_Rooms_by_Rating_Type['H_INDICATOR'].unique()

In [None]:
# Steps 1-3: Take the period, hotel type, indicator and value columns, give
# them working names, and sum repeated (Year, H_TYPE, H_INDICATOR) entries.
hotel_data = (
    Hotel_Establishments_and_Rooms_by_Rating_Type[['TIME_PERIOD', 'H_TYPE', 'H_INDICATOR', 'OBS_VALUE']]
    .rename(columns={'TIME_PERIOD': 'Year', 'OBS_VALUE': 'Count'})
    .groupby(['Year', 'H_TYPE', 'H_INDICATOR'], as_index=False)
    .sum()
)

# Steps 4-5: Reshape to one row per Year with a two-level column header
# (H_TYPE, H_INDICATOR), then replace any missing cell with 0.
hotel_data_pivot = (
    hotel_data
    .pivot_table(index='Year', columns=['H_TYPE', 'H_INDICATOR'], values='Count', aggfunc='sum')
    .reset_index()
    .fillna(0)
)

# Step 6: Inspect the result
print(hotel_data_pivot.info())
print(hotel_data_pivot.head())


In [None]:
# Collapse the two-level column header into single names: the ('Year', '')
# entry becomes 'Year'; every other pair becomes '<H_TYPE>_<H_INDICATOR>'.
flat_hotel_columns = []
for h_type, h_indicator in hotel_data_pivot.columns:
    if (h_type, h_indicator) == ('Year', ''):
        flat_hotel_columns.append('Year')
    else:
        flat_hotel_columns.append(h_type + '_' + h_indicator)
hotel_data_pivot.columns = flat_hotel_columns

# Show the flattened names
print(hotel_data_pivot.columns)

In [None]:
# Merge
merged_data8 = merged_data7.merge(hotel_data_pivot, on="Year", how="left")

In [None]:
merged_data8.info()

In [None]:
# Load data
Hotel_Establishments_Main_Indicators = pd.read_csv('Tourism/Hotel_Establishments_Main_Indicators.csv')

In [None]:
# Steps 1-3: Take the period, hotel type, indicator and value columns, give
# them working names, and sum repeated (Year, H_TYPE, H_INDICATOR) entries.
hotel_main_data = (
    Hotel_Establishments_Main_Indicators[['TIME_PERIOD', 'H_TYPE', 'H_INDICATOR', 'OBS_VALUE']]
    .rename(columns={'TIME_PERIOD': 'Year', 'OBS_VALUE': 'Revenue'})
    .groupby(['Year', 'H_TYPE', 'H_INDICATOR'], as_index=False)
    .sum()
)

# Steps 4-5: Reshape to one row per Year with a two-level column header
# (H_TYPE, H_INDICATOR), then replace any missing cell with 0.
hotel_main_data_pivot = (
    hotel_main_data
    .pivot_table(index='Year', columns=['H_TYPE', 'H_INDICATOR'], values='Revenue', aggfunc='sum')
    .reset_index()
    .fillna(0)
)

# Step 6: Inspect the result
print(hotel_main_data_pivot.info())
print(hotel_main_data_pivot.head())


In [None]:
'''
RR - Total Revenue (likely the most important indicator)
TOR - Total Occupancy Rate (percentage of rooms occupied)
GUN - Gross Utilization Number (likely measures the occupancy or usage rate of hotel resources)
LS - Length of Stay (average duration of stay for guests)
FB - Food & Beverage Revenue (likely indicates revenue from dining services)
AR - Average Room Rate (the average price of a room per night)
OR - Occupancy Rate (similar to TOR, but could be more specific in context)
TR - Total Rooms (total number of rooms in the establishment)
TAR - Total Available Rooms (could indicate the number of rooms available for booking)
ARR - Average Room Revenue (average revenue per room)
'''

In [None]:
# Collapse the two-level column header into single names: the ('Year', '')
# entry becomes 'Year'; every other pair becomes '<H_TYPE>_<H_INDICATOR>'.
flat_hotel_main_columns = []
for h_type, h_indicator in hotel_main_data_pivot.columns:
    if (h_type, h_indicator) == ('Year', ''):
        flat_hotel_main_columns.append('Year')
    else:
        flat_hotel_main_columns.append(h_type + '_' + h_indicator)
hotel_main_data_pivot.columns = flat_hotel_main_columns

# Show the flattened names
print(hotel_main_data_pivot.columns)

In [None]:
# Merge
merged_data9 = merged_data8.merge(hotel_main_data_pivot, on="Year", how="left")

In [None]:
merged_data9.info()

In [None]:
# Read the World Development Indicators extract.
# The first four lines of the file are skipped, fields are comma-separated and
# may be double-quoted, and the Python parsing engine is used.
wdi_read_options = {
    'skiprows': 4,
    'delimiter': ",",
    'quotechar': '"',
    'engine': "python",
}
World_Development_Indicator = pd.read_csv(
    'World Development Indicators/World_Development_Indicator.csv',
    **wdi_read_options,
)

In [None]:
# Step 1: Melt the data
# Everything that is neither the indicator label nor a per-year value column must
# be removed before melting; otherwise melt() treats it as one more "Year" column.
# 'Indicator Code' was previously left in (if the file has it), which put text into
# 'Value' and added a bogus 'Indicator Code' row to the pivot. Trailing 'Unnamed: N'
# columns are matched by prefix so a different column count does not raise KeyError.
metadata_columns = ['Country Name', 'Country Code', 'Indicator Code']
unnamed_columns = [
    col for col in World_Development_Indicator.columns
    if str(col).startswith('Unnamed:')
]
world_dev_data = World_Development_Indicator.drop(
    columns=metadata_columns + unnamed_columns,
    errors='ignore',  # tolerate files that lack one of the metadata columns
)
world_dev_data_melted = world_dev_data.melt(id_vars=['Indicator Name'],
                                            var_name='Year',
                                            value_name='Value')

# Step 2: Pivot the data so that each row is one Year and each column one indicator
world_dev_data_pivot = world_dev_data_melted.pivot_table(
    index=['Year'],
    columns='Indicator Name',
    values='Value',
    aggfunc='first'  # Take the first value in case of duplicates
).reset_index()

# Step 3: Replace missing cells with 0.
# NOTE(review): kept because the next cell treats 0 as "missing" when screening
# indicators; any 0 that survives that screening is a placeholder, not a measurement.
world_dev_data_pivot.fillna(0, inplace=True)

# Step 4: Inspect the result (info() prints itself, so it is not wrapped in print)
world_dev_data_pivot.info()
print(world_dev_data_pivot.head())


In [None]:
# Step 1: Treat every 0 in the pivot as a missing value
world_dev_data_pivot_no_zeros = world_dev_data_pivot.replace(0, pd.NA)

# Step 2: Percentage of missing cells per indicator column (Year excluded)
indicator_cells = world_dev_data_pivot_no_zeros.drop(columns=['Year'])
missing_percentage = indicator_cells.isna().mean().mul(100)

# Step 3: Show the percentages for the first few indicators
print(missing_percentage.head())

# Step 4: Keep the indicators whose missing percentage is strictly below the threshold
threshold = 4
selected_columns = missing_percentage.index[missing_percentage < threshold]

# Step 5: Show which indicators passed
print(f"Selected columns after thresholding: {selected_columns.tolist()}")

# Step 6: Take 'Year' plus the passing indicators from the zero-filled pivot
kept_columns = ['Year'] + selected_columns.tolist()
world_dev_data_selected = world_dev_data_pivot[kept_columns]

# Step 7: Inspect the filtered table
print(world_dev_data_selected.info())
print(world_dev_data_selected.head())


In [None]:
world_dev_data_selected.info()

In [None]:
# Indicators (plus the Year key) that are carried forward into the final merge
columns_to_keep = [
    'Year',
    'Birth rate, crude (per 1,000 people)',
    'Death rate, crude (per 1,000 people)',
    'Fertility rate, total (births per woman)',
    'Life expectancy at birth, total (years)',
    'Net migration',
    'Population, total',
    'Rural population',
    'Urban population'
]

# Take an explicit copy of the selection: later cells assign to and rename
# columns of this frame, and doing that on a bare slice of
# world_dev_data_selected triggers pandas' SettingWithCopyWarning.
world_dev_data_filtered = world_dev_data_selected[columns_to_keep].copy()

# Validate the result
world_dev_data_filtered.info()

In [None]:
world_dev_data_filtered.head()

In [None]:
# Step 1: Columns that must end up with a numeric dtype
columns_to_convert = [
    'Year',
    'Birth rate, crude (per 1,000 people)',
    'Death rate, crude (per 1,000 people)',
    'Fertility rate, total (births per woman)',
    'Life expectancy at birth, total (years)',
    'Net migration',
    'Population, total',
    'Rural population',
    'Urban population'
]

# Convert each column, coercing unparseable values to NaN, and build a NEW frame
# with the converted columns swapped in.
# `frame.loc[:, col] = values` writes into the existing column, so on current
# pandas an object column (such as the melted 'Year' labels) keeps its object
# dtype. assign() replaces the column and therefore its dtype, and it never
# writes through a slice of another frame.
numeric_columns = {
    col: pd.to_numeric(world_dev_data_filtered[col], errors='coerce')
    for col in columns_to_convert
}
world_dev_data_filtered = world_dev_data_filtered.assign(**numeric_columns)

# Check the result (info() prints itself, so it is not wrapped in print)
world_dev_data_filtered.info()
print(world_dev_data_filtered.head())

In [None]:
# Make Year an integer column; a missing year is replaced by the placeholder 0.
# The frame is rebuilt with assign() rather than assigned into, so nothing is
# written through a slice of another frame (avoids SettingWithCopyWarning).
world_dev_data_filtered = world_dev_data_filtered.assign(
    Year=world_dev_data_filtered['Year'].fillna(0).astype(int)
)

In [None]:
world_dev_data_filtered.head()

In [None]:

# Step 2: Replace the long indicator labels with short identifier-style names
wdi_short_names = {
    'Birth rate, crude (per 1,000 people)': 'BirthRate_crude',
    'Death rate, crude (per 1,000 people)': 'DeathRate_crude',
    'Fertility rate, total (births per woman)': 'FertilityRate_total',
    'Life expectancy at birth, total (years)': 'LifeExpectancy_birth',
    'Net migration': 'NetMigration',
    'Population, total': 'Population_total',
    'Rural population': 'RuralPopulation',
    'Urban population': 'UrbanPopulation',
}
world_dev_data_filtered = world_dev_data_filtered.rename(columns=wdi_short_names)


In [None]:
# Merge
final_merged_data = merged_data9.merge(world_dev_data_filtered, on="Year", how="left")

In [None]:
final_merged_data.info()

In [None]:
final_merged_data.columns

In [None]:
# final_merged_data.to_csv('final_merged_data.csv', index=False)


In [None]:
######################################################

In [None]:
# Model - Annual Amount on Final Merged Data

In [None]:
# Libraries used for splitting and scoring
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 1. Prepare the data
# Rows without a target value cannot be used, so they are removed from final_merged_data itself.
final_merged_data = final_merged_data.dropna(subset=['Annual Amount'])

# The target, an identifier column and the raw date columns are excluded from the features.
non_feature_columns = [
    'Annual Amount',
    'Ejari Contract Number',
    'Registration Date',
    'Start Date',
    'End Date',
    'Transaction Date',
]
y = final_merged_data['Annual Amount']
X = final_merged_data.drop(columns=non_feature_columns)

# 2. Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Catboost Model
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Replace missing values in the object-dtype (text) columns with the label
# 'Unknown', in both the training and the test frame. The set of columns is
# taken from X_train.
cat_columns = X_train.select_dtypes(include=['object']).columns
for split_frame in (X_train, X_test):
    for column_name in cat_columns:
        split_frame[column_name] = split_frame[column_name].fillna('Unknown')

# Categorical features: positional indices of the object-dtype columns of X_train
cat_features = [
    position
    for position, column_dtype in enumerate(X_train.dtypes)
    if column_dtype == 'object'
]

# Step 1: Define the CatBoost model
catboost_settings = {
    'iterations': 200,             # Number of boosting iterations
    'learning_rate': 0.1,          # Learning rate
    'depth': 6,                    # Tree depth
    'cat_features': cat_features,  # List of categorical feature indices
    'random_seed': 42,             # Random seed for reproducibility
    'verbose': 200,                # Print progress every 200 iterations
}
model = CatBoostRegressor(**catboost_settings)

# Step 2: Train the model
model.fit(X_train, y_train, cat_features=cat_features)

# Step 3: Predict the held-out test rows
y_pred = model.predict(X_test)

# Step 4: Score the predictions (MSE, its square root, and R²)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"MSE: {mse:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2:.4f}")  # R-squared score

In [None]:
# Feature Importance

import matplotlib.pyplot as plt

# Pair every training column with the importance score reported by the
# trained model and order the table from most to least important
feature_importance = model.get_feature_importance()
feature_names = X_train.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows of the sorted table
top_n = 30  # Change to 40 if needed
top_features_df = feature_importance_df.head(top_n)

# Horizontal bar chart of the selected features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features_df['Feature'], top_features_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance of Annual Amount for CatBoost (Final Merged Data)')
ax.invert_yaxis()  # Most important feature at the top
plt.show()


In [None]:
# LightGBM Model
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Step 1: Columns that LightGBM should treat as categorical
# NOTE(review): the list contains 'Transaction Number' and the demographic
# indicator columns (BirthRate_crude ... UrbanPopulation); confirm that these
# are really meant to be categorical rather than dropped / kept numeric.
categorical_columns = ['Version', 'Area', 'Is Free Hold?', 'Property Type', 
                       'Property Sub Type', 'Usage', 'Nearest Metro', 
                       'Nearest Mall', 'Nearest Landmark', 'Master Project', 'Project',
                      'Transaction Number', 'Transaction Type', 'Transaction sub type',
                       'Registration type', 'Room(s)',
                       'BirthRate_crude', 'DeathRate_crude',
       'FertilityRate_total', 'LifeExpectancy_birth', 'NetMigration',
       'Population_total', 'RuralPopulation', 'UrbanPopulation'
                      ]

# Cast the categorical columns to the 'category' dtype on separate frames.
# DataFrame.astype returns a new frame, so the shared X_train / X_test keep
# their original dtypes. Casting them in place made the notebook depend on
# execution order: re-running the CatBoost cell afterwards no longer found any
# object-dtype columns to declare as categorical.
category_dtypes = dict.fromkeys(categorical_columns, 'category')
X_train_lgb = X_train.astype(category_dtypes)
X_test_lgb = X_test.astype(category_dtypes)

# Step 2: Define the LightGBM model
lgb_model = lgb.LGBMRegressor(objective='regression', 
                              num_iterations=200,  # Number of boosting iterations
                              learning_rate=0.1,  # Learning rate
                              max_depth=6,  # Tree depth
                              verbose=-1,
                              random_state=42)  # Random seed for reproducibility

# Step 3: Train the model with categorical features
lgb_model.fit(X_train_lgb, y_train, categorical_feature=categorical_columns)

# Step 4: Make predictions on the test set
y_pred_lgb = lgb_model.predict(X_test_lgb)

# Step 5: Evaluate the model
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
rmse_lgb = mse_lgb ** 0.5
r2_lgb = r2_score(y_test, y_pred_lgb)  # Calculate R² Score

# Print the evaluation metrics
print(f"MSE: {mse_lgb:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse_lgb:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2_lgb:.4f}")  # R-squared score


In [None]:
# Step 6: Plot feature importance for LightGBM
lgb_feature_importance = lgb_model.feature_importances_

# Pair every training column with its importance score and order the table
# from most to least important (descending)
lgb_feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': lgb_feature_importance}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows of the sorted table
top_n = 30  # Change to 40 if needed
lgb_top_features_df = lgb_feature_importance_df.head(top_n)

# Horizontal bar chart of the selected features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(lgb_top_features_df['Feature'], lgb_top_features_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance of Annual Amount for LightGBM (Final Merged Data)')
ax.invert_yaxis()  # Most important features on top
plt.show()

In [None]:
##################################################

In [None]:
# Model - Amount on Sales Data (rows where 'Transaction Type' == 'Sales')

In [None]:
# Filter the data to include only 'Sales' transaction type
merged_sales_data = final_merged_data[final_merged_data['Transaction Type'] == 'Sales']

# Check the result. DataFrame.info() writes its report to stdout and returns
# None, so it is called directly (wrapping it in print() also printed "None").
merged_sales_data.info()
print(merged_sales_data.head())

In [None]:
# merged_sales_data.to_csv('merged_sales_data.csv', index=False)


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Prepare the data
# Keep only the sales rows that have a value for the target, 'Amount'
merged_sales_data = merged_sales_data.dropna(subset=['Amount'])

# Define features and target: the target, the contract number and the raw
# date columns are excluded from the feature matrix.
# NOTE(review): 'Annual Amount' is not excluded here, so it remains a feature
# of the sales model - confirm that this is intended.
non_feature_columns = [
    'Amount',
    'Ejari Contract Number',
    'Registration Date',
    'Start Date',
    'End Date',
    'Transaction Date',
]
y = merged_sales_data['Amount']
X = merged_sales_data.drop(columns=non_feature_columns)

# 2. Train-Test Split (80-20) with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Catboost Model
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Replace missing values in the object-dtype (text) columns with the label
# 'Unknown', in both the training and the test frame. The set of columns is
# taken from X_train.
cat_columns = X_train.select_dtypes(include=['object']).columns
for split_frame in (X_train, X_test):
    for column_name in cat_columns:
        split_frame[column_name] = split_frame[column_name].fillna('Unknown')

# Categorical features: positional indices of the object-dtype columns of X_train
cat_features = [
    position
    for position, column_dtype in enumerate(X_train.dtypes)
    if column_dtype == 'object'
]

# Step 1: Define the CatBoost model
catboost_settings = {
    'iterations': 200,             # Number of boosting iterations
    'learning_rate': 0.1,          # Learning rate
    'depth': 6,                    # Tree depth
    'cat_features': cat_features,  # List of categorical feature indices
    'random_seed': 42,             # Random seed for reproducibility
    'verbose': 200,                # Print progress every 200 iterations
}
model = CatBoostRegressor(**catboost_settings)

# Step 2: Train the model
model.fit(X_train, y_train, cat_features=cat_features)

# Step 3: Predict the held-out test rows
y_pred = model.predict(X_test)

# Step 4: Score the predictions (MSE, its square root, and R²)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"MSE: {mse:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2:.4f}")  # R-squared score


In [None]:
# Feature Importance

import matplotlib.pyplot as plt

# Pair every training column with the importance score reported by the
# trained model and order the table from most to least important
feature_importance = model.get_feature_importance()
feature_names = X_train.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows of the sorted table
top_n = 30  # Change to 40 if needed
top_features_df = feature_importance_df.head(top_n)

# Horizontal bar chart of the selected features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features_df['Feature'], top_features_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance of Amount for CatBoost (Final Merged Data (Sales))')
ax.invert_yaxis()  # Most important feature at the top
plt.show()


In [None]:
# LightGBM Model
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Step 1: Columns that LightGBM should treat as categorical
# NOTE(review): the list contains 'Transaction Number' and the demographic
# indicator columns (BirthRate_crude ... UrbanPopulation); confirm that these
# are really meant to be categorical rather than dropped / kept numeric.
categorical_columns = ['Version', 'Area', 'Is Free Hold?', 'Property Type', 
                       'Property Sub Type', 'Usage', 'Nearest Metro', 
                       'Nearest Mall', 'Nearest Landmark', 'Master Project', 'Project',
                      'Transaction Number', 'Transaction Type', 'Transaction sub type',
                       'Registration type', 'Room(s)',
                       'BirthRate_crude', 'DeathRate_crude',
       'FertilityRate_total', 'LifeExpectancy_birth', 'NetMigration',
       'Population_total', 'RuralPopulation', 'UrbanPopulation'
                      ]

# Cast the categorical columns to the 'category' dtype on separate frames.
# DataFrame.astype returns a new frame, so the shared X_train / X_test keep
# their original dtypes. Casting them in place made the notebook depend on
# execution order: re-running the CatBoost cell afterwards no longer found any
# object-dtype columns to declare as categorical.
category_dtypes = dict.fromkeys(categorical_columns, 'category')
X_train_lgb = X_train.astype(category_dtypes)
X_test_lgb = X_test.astype(category_dtypes)

# Step 2: Define the LightGBM model
lgb_model = lgb.LGBMRegressor(objective='regression', 
                              num_iterations=200,  # Number of boosting iterations
                              learning_rate=0.1,  # Learning rate
                              max_depth=6,  # Tree depth
                              verbose=-1,
                              random_state=42)  # Random seed for reproducibility

# Step 3: Train the model with categorical features
lgb_model.fit(X_train_lgb, y_train, categorical_feature=categorical_columns)

# Step 4: Make predictions on the test set
y_pred_lgb = lgb_model.predict(X_test_lgb)

# Step 5: Evaluate the model
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
rmse_lgb = mse_lgb ** 0.5
r2_lgb = r2_score(y_test, y_pred_lgb)  # Calculate R² Score

# Print the evaluation metrics
print(f"MSE: {mse_lgb:.4f}")   # Mean Squared Error
print(f"RMSE: {rmse_lgb:.4f}")  # Root Mean Squared Error
print(f"R² Score: {r2_lgb:.4f}")  # R-squared score


In [None]:
# Step 6: Plot feature importance for LightGBM
lgb_feature_importance = lgb_model.feature_importances_

# Pair every training column with its importance score and order the table
# from most to least important (descending)
lgb_feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': lgb_feature_importance}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows of the sorted table
top_n = 30  # Change to 40 if needed
lgb_top_features_df = lgb_feature_importance_df.head(top_n)

# Horizontal bar chart of the selected features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(lgb_top_features_df['Feature'], lgb_top_features_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance of Amount for LightGBM (Final Merged Data (Sales))')
ax.invert_yaxis()  # Most important features on top
plt.show()

In [None]:
# Explore the correlations between these macroeconomic factors and property sale or rental prices. 
# Highlight the most significant factors driving market behavior.

In [None]:
# Print every column label of the sales frame as a plain Python list
sales_column_names = merged_sales_data.columns.tolist()
print(sales_column_names)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# List of macroeconomic factors (adjust based on dataset)
macro_factors = [
    '_Z_TOR', '_Z_TR', 'BirthRate_crude', 'DeathRate_crude', 'FertilityRate_total',
    'LifeExpectancy_birth', 'NetMigration', 'Population_total', 
    'RuralPopulation', 'UrbanPopulation'
]

# Select only relevant columns
macro_data = merged_sales_data[macro_factors + ['Annual Amount', 'Amount']]

# Compute correlation matrix
correlation_matrix = macro_data.corr()

# Extract correlation with 'Annual Amount' and 'Amount'
# (the two target rows themselves are removed from each series)
correlation_annual = correlation_matrix['Annual Amount'].drop(['Annual Amount', 'Amount'])
correlation_amount = correlation_matrix['Amount'].drop(['Annual Amount', 'Amount'])

# Display correlation values
print("Correlation with 'Annual Amount':\n", correlation_annual)
print("\nCorrelation with 'Amount':\n", correlation_amount)

# Visualizing correlations with heatmaps.
# Both panels share a fixed [-1, 1] colour scale: with the default automatic
# scaling each panel stretched the diverging 'coolwarm' map over its own value
# range, so the neutral colour did not correspond to zero correlation and the
# two panels could not be compared. The figure height grows with the number of
# factors so that the annotated rows stay readable.
heatmap_options = dict(annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
fig, (ax_annual, ax_amount) = plt.subplots(
    1, 2, figsize=(10, max(5, 0.35 * len(macro_factors)))
)

# Heatmap for 'Annual Amount'
sns.heatmap(correlation_annual.to_frame(), ax=ax_annual, **heatmap_options)
ax_annual.set_title("Correlation with Annual Amount")

# Heatmap for 'Amount'
sns.heatmap(correlation_amount.to_frame(), ax=ax_amount, **heatmap_options)
ax_amount.set_title("Correlation with Amount")

fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# List of factors to correlate with the two price columns
# (adjust based on dataset)
macro_factors = [
    'BirthRate_crude', 'DeathRate_crude', 'FertilityRate_total',
    'LifeExpectancy_birth', 'NetMigration',  
    'RuralPopulation', 'UrbanPopulation',
    'Contract Amount', 'Property Size (sq.m)', 'Transaction Size (sq.m)',
    'Prev_Month_Avg_Price', 'Prev_Week_Avg_Price',
    'CPI_ANNCHG', 'CPI_ANNCHG21', 'CPI_INDEX14', 'CPI_INDEX21', 'CPI_MTHCHG',
    'AEDUSD_Average_Close', 'AEDUSD_Average_Return', 'TOT_GDP', 'TOT_NO', 'TOT_GDP_current', 'TOT_NO_current',
    'Population_F', 'Population_M', 'Population_total'
]

# Select only relevant columns
macro_data = merged_sales_data[macro_factors + ['Annual Amount', 'Amount']]

# Compute correlation matrix
correlation_matrix = macro_data.corr()

# Extract correlation with 'Annual Amount' and 'Amount'
# (the two target rows themselves are removed from each series)
correlation_annual = correlation_matrix['Annual Amount'].drop(['Annual Amount', 'Amount'])
correlation_amount = correlation_matrix['Amount'].drop(['Annual Amount', 'Amount'])

# Display correlation values
print("Correlation with 'Annual Amount':\n", correlation_annual)
print("\nCorrelation with 'Amount':\n", correlation_amount)

# Visualizing correlations with heatmaps.
# Both panels share a fixed [-1, 1] colour scale: with the default automatic
# scaling each panel stretched the diverging 'coolwarm' map over its own value
# range, so the neutral colour did not correspond to zero correlation and the
# two panels could not be compared. The figure height grows with the number of
# factors; a fixed 5-inch height is too short for this many annotated rows.
heatmap_options = dict(annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
fig, (ax_annual, ax_amount) = plt.subplots(
    1, 2, figsize=(10, max(5, 0.35 * len(macro_factors)))
)

# Heatmap for 'Annual Amount'
sns.heatmap(correlation_annual.to_frame(), ax=ax_annual, **heatmap_options)
ax_annual.set_title("Correlation with Annual Amount")

# Heatmap for 'Amount'
sns.heatmap(correlation_amount.to_frame(), ax=ax_amount, **heatmap_options)
ax_amount.set_title("Correlation with Amount")

fig.tight_layout()
plt.show()
