In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the semicolon-separated rents extract into `Rents`.
# low_memory=False asks pandas not to infer dtypes chunk by chunk.
Rents = pd.read_csv(
    'RentsTransactions/rents.csv',
    sep=';',
    low_memory=False,
)

In [None]:
# Read the semicolon-separated transactions extract into `Transactions`.
# low_memory=False asks pandas not to infer dtypes chunk by chunk.
Transactions = pd.read_csv(
    'RentsTransactions/transactions.csv',
    sep=';',
    low_memory=False,
)

In [None]:
# Preview the first rows of the rents table.
Rents.head()

In [None]:
# Print the column names, dtypes and non-null counts of the rents table.
Rents.info()

In [None]:
# Quick Look at the Data

In [None]:
# Per-column count of nulls and the same figure as a share of all rows.
missing_values = Rents.isnull().sum()
missing_percentage = (missing_values / len(Rents)) * 100

# One row per column, worst-affected columns first, with a fresh 0..n-1 index.
missing_summary = (
    pd.DataFrame({
        'Column': Rents.columns,
        'Missing Values': missing_values,
        'Percentage Missing (%)': missing_percentage,
    })
    .sort_values(by='Percentage Missing (%)', ascending=False)
    .reset_index(drop=True)
)
missing_summary

In [None]:
# Intuitively Important Columns - Outlier Detection and Handling

In [None]:
# Peek at the raw 'Annual Amount' values (this runs before the numeric coercion cell).
Rents['Annual Amount'].head()

In [None]:
# Peek at the raw 'Contract Amount' values (this runs before the numeric coercion cell).
Rents['Contract Amount'].head()

In [None]:
# Peek at the raw 'Property Size (sq.m)' values (this runs before the numeric coercion cell).
Rents['Property Size (sq.m)'].head()

In [None]:
# Summary statistics for 'Annual Amount'.
# NOTE(review): this runs before the pd.to_numeric cell; if the column was read
# as text the summary will be count/unique/top/freq rather than mean/std — confirm.
Rents['Annual Amount'].describe()

In [None]:
# Summary statistics for 'Contract Amount'.
# NOTE(review): this runs before the pd.to_numeric cell; if the column was read
# as text the summary will be count/unique/top/freq rather than mean/std — confirm.
Rents['Contract Amount'].describe()

In [None]:
# Summary statistics for 'Property Size (sq.m)'.
# NOTE(review): this runs before the pd.to_numeric cell; if the column was read
# as text the summary will be count/unique/top/freq rather than mean/std — confirm.
Rents['Property Size (sq.m)'].describe()

In [None]:
# Coerce the three numeric fields to numbers and drop every row where any of
# them is missing afterwards. errors='coerce' turns unparseable text into NaN,
# so the number of discarded rows is reported instead of being lost silently.
# NOTE(review): a later cell strips ',' from 'Contract Amount'; if the raw file
# really contains comma-formatted amounts they become NaN here and are dropped
# before that cell runs — confirm against the row count printed below.
numeric_rent_columns = ['Property Size (sq.m)', 'Annual Amount', 'Contract Amount']
rows_before_conversion = len(Rents)

for numeric_column in numeric_rent_columns:
    Rents[numeric_column] = pd.to_numeric(Rents[numeric_column], errors='coerce')

# Drop rows with NaN values (pre-existing or produced by the conversion)
Rents = Rents.dropna(subset=numeric_rent_columns)

rows_dropped = rows_before_conversion - len(Rents)
print(
    f"Dropped {rows_dropped:,} of {rows_before_conversion:,} rows with missing "
    f"or non-numeric values in {numeric_rent_columns}"
)

In [None]:
# Annual Amount: boxplot of the full (uncapped, unfiltered) column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Annual Amount'], ax=ax)
ax.set_title("Boxplot of Annual Amount")
plt.show()

In [None]:
# Quartiles and interquartile range of 'Annual Amount'.
Q1, Q3 = Rents['Annual Amount'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Tukey fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose 'Annual Amount' lies inside the fences (inclusive).
within_fences = Rents['Annual Amount'].between(lower_bound, upper_bound)
filtered_rents_amount = Rents[within_fences]

# Boxplot of the remaining values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=filtered_rents_amount['Annual Amount'], ax=ax)
ax.set_title("Boxplot of Annual Amount Without Outliers")
plt.show()


In [None]:
# Cap 'Annual Amount' at its 95th percentile: larger values are replaced by the
# percentile itself and stored in a separate 'Capped Annual Amount' column.
cap_value = Rents['Annual Amount'].quantile(0.95)
Rents['Capped Annual Amount'] = Rents['Annual Amount'].clip(upper=cap_value)

# Boxplot of the capped column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Capped Annual Amount'], ax=ax)
ax.set_title("Boxplot of Annual Amount with Capped Outliers")
plt.show()

In [None]:
# Cap 'Annual Amount' at its 99th percentile. This overwrites the
# 'Capped Annual Amount' column produced by the 95th-percentile cell.
cap_value = Rents['Annual Amount'].quantile(0.99)
Rents['Capped Annual Amount'] = Rents['Annual Amount'].clip(upper=cap_value)

# Boxplot of the capped column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Capped Annual Amount'], ax=ax)
ax.set_title("Boxplot of Annual Amount with Capped Outliers")
plt.show()

In [None]:
# Contract Amount: boxplot of the full (uncapped, unfiltered) column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Contract Amount'], ax=ax)
ax.set_title("Boxplot of Contract Amount")
plt.show()

In [None]:
# Quartiles and interquartile range of 'Contract Amount'.
Q1, Q3 = Rents['Contract Amount'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Tukey fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose 'Contract Amount' lies inside the fences (inclusive).
within_fences = Rents['Contract Amount'].between(lower_bound, upper_bound)
filtered_rents_amount = Rents[within_fences]

# Boxplot of the remaining values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=filtered_rents_amount['Contract Amount'], ax=ax)
ax.set_title("Boxplot of Contract Amount Without Outliers")
plt.show()


In [None]:
# Cap 'Contract Amount' at its 95th percentile: larger values are replaced by
# the percentile itself and stored in a separate 'Capped Contract Amount' column.
cap_value = Rents['Contract Amount'].quantile(0.95)
Rents['Capped Contract Amount'] = Rents['Contract Amount'].clip(upper=cap_value)

# Boxplot of the capped column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Capped Contract Amount'], ax=ax)
ax.set_title("Boxplot of Contract Amount with Capped Outliers")
plt.show()

In [None]:
# Cap 'Contract Amount' at its 99th percentile. This overwrites the
# 'Capped Contract Amount' column produced by the 95th-percentile cell.
cap_value = Rents['Contract Amount'].quantile(0.99)
Rents['Capped Contract Amount'] = Rents['Contract Amount'].clip(upper=cap_value)

# Boxplot of the capped column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Capped Contract Amount'], ax=ax)
ax.set_title("Boxplot of Contract Amount with Capped Outliers")
plt.show()

In [None]:
# Property Size (sq.m): boxplot of the full (uncapped, unfiltered) column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Property Size (sq.m)'], ax=ax)
ax.set_title("Boxplot of Property Size (sq.m)")
plt.show()

In [None]:
# Quartiles and interquartile range of 'Property Size (sq.m)'.
Q1, Q3 = Rents['Property Size (sq.m)'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Tukey fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose size lies inside the fences (inclusive).
within_fences = Rents['Property Size (sq.m)'].between(lower_bound, upper_bound)
filtered_rents = Rents[within_fences]

# Boxplot of the remaining values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=filtered_rents['Property Size (sq.m)'], ax=ax)
ax.set_title("Boxplot of Property Size (sq.m) Without Outliers")
plt.show()


In [None]:
# Cap 'Property Size (sq.m)' at its 95th percentile: larger values are replaced
# by the percentile itself and stored in 'Capped Property Size (sq.m)'.
cap_value = Rents['Property Size (sq.m)'].quantile(0.95)
Rents['Capped Property Size (sq.m)'] = Rents['Property Size (sq.m)'].clip(upper=cap_value)

# Boxplot of the capped column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Capped Property Size (sq.m)'], ax=ax)
ax.set_title("Boxplot of Property Size (sq.m) with Capped Outliers")
plt.show()


In [None]:
# Cap 'Property Size (sq.m)' at its 99th percentile. This overwrites the
# 'Capped Property Size (sq.m)' column produced by the 95th-percentile cell.
cap_value = Rents['Property Size (sq.m)'].quantile(0.99)
Rents['Capped Property Size (sq.m)'] = Rents['Property Size (sq.m)'].clip(upper=cap_value)

# Boxplot of the capped column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Capped Property Size (sq.m)'], ax=ax)
ax.set_title("Boxplot of Property Size (sq.m) with Capped Outliers")
plt.show()


In [None]:
# log(1 + x) of the property size, stored as a new column; log1p keeps a
# size of 0 finite (it maps to 0).
Rents['Log Property Size (sq.m)'] = np.log1p(Rents['Property Size (sq.m)'])

# Boxplot of the transformed column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=Rents['Log Property Size (sq.m)'], ax=ax)
ax.set_title("Boxplot of Log-Transformed Property Size (sq.m)")
plt.show()


In [None]:
# One 30-bin histogram with a KDE overlay per numeric column, each on its own figure.
numeric_columns = ['Contract Amount', 'Annual Amount', 'Property Size (sq.m)']
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(Rents[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    plt.show()

In [None]:
# Sample the data to reduce the number of points plotted.
# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling
# is without replacement), so the requested size is clamped to the rows that
# survived cleaning. random_state fixes the draw for reproducibility.
sample_size = min(10000, len(Rents))  # Adjust the 10000 cap as needed
sampled_rents = Rents.sample(n=sample_size, random_state=42)

In [None]:
# Coerce the three numeric fields of the sample to numbers; unparseable
# entries become NaN.
sample_numeric_columns = ['Property Size (sq.m)', 'Annual Amount', 'Contract Amount']
for sample_column in sample_numeric_columns:
    sampled_rents[sample_column] = pd.to_numeric(sampled_rents[sample_column], errors='coerce')

# Only size and annual amount are required to be present; 'Contract Amount'
# is converted above but is not part of the drop criterion.
sampled_rents = sampled_rents.dropna(subset=['Property Size (sq.m)', 'Annual Amount'])

In [None]:
# Temporal Analysis

# Insights into market dynamics, lease preferences, and consumer sentiment over the years

# Parse each date column (unparseable entries become NaT) and derive the
# matching '<prefix> Year' column from it.
# NOTE(review): no explicit format/dayfirst is given, so pandas infers the
# layout — confirm it matches the source file.
for date_prefix in ('Registration', 'Start', 'End'):
    date_column = f'{date_prefix} Date'
    Rents[date_column] = pd.to_datetime(Rents[date_column], errors='coerce')
    Rents[f'{date_prefix} Year'] = Rents[date_column].dt.year

# One countplot per year column; bars are ordered chronologically by taking
# the sorted distinct years as the category order.
yearly_countplots = [
    ('Registration Year', 'viridis', "Number of Registrations per Year"),
    ('Start Year', 'coolwarm', "Number of Contracts Starting per Year"),
    ('End Year', 'cubehelix', "Number of Contracts Ending per Year"),
]

for year_column, year_palette, year_title in yearly_countplots:
    chronological_order = Rents[year_column].value_counts().sort_index().index
    sns.countplot(
        x=year_column,
        data=Rents,
        palette=year_palette,
        order=chronological_order,
    )
    plt.title(year_title)
    plt.xlabel(year_column)
    plt.ylabel("Count")
    plt.xticks(rotation=90)
    plt.show()

# Line plots for trends: contracts starting vs. ending per calendar year.
#
# Both yearly counts are aligned on a shared year index. Merging on
# 'Start Year'/'End Year' and then renaming 'Start Year' to 'Year' would leave
# 'Year' as NaN for any year that only occurs as an end year, so those points
# would lose their x-position in the chart.
contracts_starting = Rents.groupby('Start Year').size().rename('Contracts Starting')
contracts_ending = Rents.groupby('End Year').size().rename('Contracts Ending')

trend_data = (
    pd.concat([contracts_starting, contracts_ending], axis=1)  # outer-joins on year
    .fillna(0)                                                 # no contracts that year
    .rename_axis('Year')
    .sort_index()
    .reset_index()
)

trend_data.plot(
    x='Year', 
    y=['Contracts Starting', 'Contracts Ending'], 
    kind='line', 
    marker='o', 
    figsize=(12, 6), 
    color=['blue', 'orange']
)
plt.title("Trends in Contract Start and End Over Years")
plt.ylabel("Number of Contracts")
plt.xlabel("Year")
plt.legend(title="Trend")
plt.grid()
plt.show()


In [None]:
# Contract Duration (Days) - Contract Duration (Months)

# Length of each contract in days, and in months using the average month
# length (365.25 / 12 = 30.4375 days).
Rents['Contract Duration (Days)'] = (Rents['End Date'] - Rents['Start Date']).dt.days
Rents['Contract Duration (Months)'] = Rents['Contract Duration (Days)'] / 30.4375

# Ensure 'Contract Amount' is a string, then remove commas and convert to float
Rents['Contract Amount'] = Rents['Contract Amount'].astype(str).str.replace(',', '').astype(float)

# Monthly amount. Contracts whose end date is on or before the start date have
# a zero or negative duration; dividing by those would yield inf or a negative
# rent, so they are masked to NaN and excluded from the per-month figure.
valid_duration_months = Rents['Contract Duration (Months)'].where(
    Rents['Contract Duration (Months)'] > 0
)
Rents['Contract Amount per Month'] = Rents['Contract Amount'] / valid_duration_months

# Ensure numerical columns don't contain non-numeric values
Rents['Contract Duration (Months)'] = pd.to_numeric(Rents['Contract Duration (Months)'], errors='coerce')
Rents['Contract Amount per Month'] = pd.to_numeric(Rents['Contract Amount per Month'], errors='coerce')

# Check the distribution of Contract Duration and Contract Amount per Month
sns.histplot(Rents['Contract Duration (Months)'], kde=True, bins=30)
plt.title("Distribution of Contract Duration (Months)")
plt.show()

sns.histplot(Rents['Contract Amount per Month'], kde=True, bins=30)
plt.title("Distribution of Contract Amount per Month")
plt.show()



In [None]:
# List the distinct values of the 'Version' column.
Rents['Version'].unique()

In [None]:
# List the distinct values of the 'Area' column.
Rents['Area'].unique()

In [None]:
# Geographic analysis: mean 'Annual Amount' in the 10 areas that have the
# most rental records.
top_areas = Rents['Area'].value_counts().head(10).index
filtered_Rents = Rents.loc[Rents['Area'].isin(top_areas)]

# Per-area mean, highest first.
area_avg = (
    filtered_Rents
    .groupby('Area')['Annual Amount']
    .mean()
    .sort_values(ascending=False)
)

ax = area_avg.plot(kind='bar', figsize=(12, 6))
ax.set_title("Average Annual Amount by Top 10 Areas")
ax.set_ylabel("Average Annual Amount")
plt.show()


In [None]:
# Geographic analysis: mean 'Contract Amount' in the 10 areas that have the
# most rental records.
top_areas = Rents['Area'].value_counts().head(10).index
filtered_Rents = Rents.loc[Rents['Area'].isin(top_areas)]

# Per-area mean, highest first.
area_avg = (
    filtered_Rents
    .groupby('Area')['Contract Amount']
    .mean()
    .sort_values(ascending=False)
)

ax = area_avg.plot(kind='bar', figsize=(12, 6))
ax.set_title("Average Contract Amount by Top 10 Areas")
ax.set_ylabel("Average Contract Amount")
plt.show()


In [None]:
# Geographic analysis: mean 'Annual Amount' in the 20 areas that have the
# most rental records.
top_areas2 = Rents['Area'].value_counts().head(20).index
filtered_Rents2 = Rents.loc[Rents['Area'].isin(top_areas2)]

# Per-area mean, highest first.
area_avg2 = (
    filtered_Rents2
    .groupby('Area')['Annual Amount']
    .mean()
    .sort_values(ascending=False)
)

ax = area_avg2.plot(kind='bar', figsize=(12, 6))
ax.set_title("Average Annual Amount by Top 20 Areas")
ax.set_ylabel("Average Annual Amount")
plt.show()


In [None]:
# Geographic analysis: mean 'Contract Amount' in the 20 areas that have the
# most rental records.
top_areas2 = Rents['Area'].value_counts().head(20).index
filtered_Rents2 = Rents.loc[Rents['Area'].isin(top_areas2)]

# Per-area mean, highest first.
area_avg2 = (
    filtered_Rents2
    .groupby('Area')['Contract Amount']
    .mean()
    .sort_values(ascending=False)
)

ax = area_avg2.plot(kind='bar', figsize=(12, 6))
ax.set_title("Average Contract Amount by Top 20 Areas")
ax.set_ylabel("Average Contract Amount")
plt.show()


In [None]:
# Get the top 10 most frequent Areas
top_areas = Rents['Area'].value_counts().nlargest(10).index

# Filter the data to include only the top 10 areas
filtered_rents = Rents[Rents['Area'].isin(top_areas)]

# Grouping by Area and Version to see the impact of Area and Version on Annual Amount
area_version_avg = filtered_rents.groupby(['Area', 'Version'])['Annual Amount'].mean().unstack().fillna(0)

# Plot the Average Annual Amount by Area and Version (Top 10 Areas)
area_version_avg.plot(kind='bar', figsize=(12, 6), stacked=False, colormap='Set1')
plt.title("Average Annual Amount by Area and Version (Top 10 Areas)")
plt.ylabel("Average Annual Amount")
plt.xticks(rotation=45)
plt.xlabel("Area")
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Box Plot for Annual Amount Distribution by Area and Version (Top 10 Areas)
plt.figure(figsize=(14, 7))
sns.boxplot(x='Area', y='Annual Amount', hue='Version', data=filtered_rents, palette='muted')
plt.title("Annual Amount Distribution by Area and Version (Top 10 Areas)")
plt.xticks(rotation=45)
plt.show()

# Calculate Percentage Change in Average Annual Amount between Versions for Top 10 Areas
area_version_change = area_version_avg.pct_change(axis='columns') * 100

# Plot Percentage Change in Annual Amount for Top 10 Areas and Versions
area_version_change.plot(kind='bar', figsize=(12, 6), colormap='coolwarm')
plt.title("Percentage Change in Average Annual Amount by Area and Version (Top 10 Areas)")
plt.ylabel("Percentage Change (%)")
plt.xlabel("Area")
plt.xticks(rotation=45)
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Get the top 10 most frequent Areas
top_areas = Rents['Area'].value_counts().nlargest(10).index

# Filter the data to include only the top 10 areas
filtered_rents = Rents[Rents['Area'].isin(top_areas)]

# Mean Annual Amount per (Area, Version). Combinations with no records stay
# NaN in this table so they are not mistaken for a real average of zero.
area_version_mean = filtered_rents.groupby(['Area', 'Version'])['Annual Amount'].mean().unstack()

# Zero-filled copy used only for the bar heights (an absent combination adds
# nothing to the stack).
area_version_avg = area_version_mean.fillna(0)

# Plot the Average Annual Amount by Area and Version (Top 10 Areas)
# NOTE(review): stacking sums per-version averages, which is only meaningful
# if a parts-of-a-whole reading is intended — confirm.
area_version_avg.plot(kind='bar', figsize=(12, 6), stacked=True, colormap='Set1')
plt.title("Average Annual Amount by Area and Version (Top 10 Areas)")
plt.ylabel("Average Annual Amount")
plt.xticks(rotation=45)
plt.xlabel("Area")
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()  # Ensure labels don't get cut off
plt.show()

# Box Plot for Annual Amount Distribution by Area and Version (Top 10 Areas)
plt.figure(figsize=(14, 7))
sns.boxplot(x='Area', y='Annual Amount', hue='Version', data=filtered_rents, palette='muted')
plt.title("Annual Amount Distribution by Area and Version (Top 10 Areas)")
plt.xticks(rotation=45)
plt.tight_layout()  # Ensure labels don't get cut off
plt.show()

# Percentage change between adjacent Version columns. It is computed from the
# unfilled means: on the zero-filled table a missing combination would show up
# as a -100% drop, or as an infinite rise for the column after it.
# fill_method=None stops pandas from forward-filling gaps before differencing.
area_version_change = area_version_mean.pct_change(axis='columns', fill_method=None) * 100

# Plot Percentage Change in Annual Amount for Top 10 Areas and Versions
area_version_change.plot(kind='bar', figsize=(12, 6), colormap='coolwarm')
plt.title("Percentage Change in Average Annual Amount by Area and Version (Top 10 Areas)")
plt.ylabel("Percentage Change (%)")
plt.xlabel("Area")
plt.xticks(rotation=45)
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()  # Ensure labels don't get cut off
plt.show()


In [None]:
# Get the top 20 most frequent Areas
top_areas2 = Rents['Area'].value_counts().nlargest(20).index

# Filter the data to include only the top 20 areas
filtered_rents2 = Rents[Rents['Area'].isin(top_areas2)]

# Mean Annual Amount per (Area, Version). Combinations with no records stay
# NaN in this table so they are not mistaken for a real average of zero.
area_version_mean2 = filtered_rents2.groupby(['Area', 'Version'])['Annual Amount'].mean().unstack()

# Zero-filled copy used only for the bar heights (an absent combination is
# drawn as an empty bar).
area_version_avg2 = area_version_mean2.fillna(0)

# Plot the Average Annual Amount by Area and Version (Top 20 Areas)
area_version_avg2.plot(kind='bar', figsize=(12, 6), stacked=False, colormap='Set1')
plt.title("Average Annual Amount by Area and Version (Top 20 Areas)")
plt.ylabel("Average Annual Amount")
plt.xticks(rotation=90)
plt.xlabel("Area")
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Box Plot for Annual Amount Distribution by Area and Version (Top 20 Areas)
plt.figure(figsize=(14, 7))
sns.boxplot(x='Area', y='Annual Amount', hue='Version', data=filtered_rents2, palette='muted')
plt.title("Annual Amount Distribution by Area and Version (Top 20 Areas)")
plt.xticks(rotation=90)
plt.show()

# Percentage change between adjacent Version columns. It is computed from the
# unfilled means: on the zero-filled table a missing combination would show up
# as a -100% drop, or as an infinite rise for the column after it.
# fill_method=None stops pandas from forward-filling gaps before differencing.
area_version_change2 = area_version_mean2.pct_change(axis='columns', fill_method=None) * 100

# Plot Percentage Change in Annual Amount for Top 20 Areas and Versions
area_version_change2.plot(kind='bar', figsize=(12, 6), colormap='coolwarm')
plt.title("Percentage Change in Average Annual Amount by Area and Version (Top 20 Areas)")
plt.ylabel("Percentage Change (%)")
plt.xlabel("Area")
plt.xticks(rotation=90)
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Property Characteristics
# List the distinct values of the 'Property Type' column.
Rents['Property Type'].unique()

In [None]:
# Property characteristics: how 'Annual Amount' is distributed within each of
# the (up to) 10 most frequent property types.
top_property_types = Rents['Property Type'].value_counts().head(10).index
filtered_Rents_type = Rents.loc[Rents['Property Type'].isin(top_property_types)]

ax = sns.boxplot(x='Property Type', y='Annual Amount', data=filtered_Rents_type)
ax.set_title("Annual Amount by Property Types")
plt.xticks(rotation=45)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Switch seaborn to the white-grid theme (affects every later plot as well).
sns.set_theme(style="whitegrid")

# Restrict to the (up to) 10 most frequent property types.
top_property_types = Rents['Property Type'].value_counts().head(10).index
filtered_Rents_type = Rents.loc[Rents['Property Type'].isin(top_property_types)]

plt.figure(figsize=(12, 8))

# Horizontal violin with a narrow boxplot drawn on top of it; both layers
# share the same axes mapping and data.
shared_mapping = dict(x='Annual Amount', y='Property Type', data=filtered_Rents_type)
sns.violinplot(**shared_mapping, scale='width', inner=None, palette="muted")
sns.boxplot(**shared_mapping, whis=1.5, fliersize=0.5, width=0.3, palette="Set2")

# Title and axis labels
plt.title("Distribution of Annual Amount by Property Types", fontsize=16, fontweight='bold')
plt.xlabel("Annual Amount (AED)", fontsize=12)
plt.ylabel("Property Type", fontsize=12)

# Dashed vertical gridlines only
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Tick label font sizes (no rotation is applied)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# List the distinct values of the 'Property Sub Type' column.
Rents['Property Sub Type'].unique()

In [None]:
# 'Annual Amount' distribution within each of the 10 most frequent property sub types.
top_property_subtypes = Rents['Property Sub Type'].value_counts().head(10).index
filtered_Rents_subtype = Rents.loc[Rents['Property Sub Type'].isin(top_property_subtypes)]

ax = sns.boxplot(x='Property Sub Type', y='Annual Amount', data=filtered_Rents_subtype)
ax.set_title("Annual Amount by Top 10 Property Sub Types")
plt.xticks(rotation=90)
plt.show()

In [None]:
# 'Annual Amount' distribution within each of the 20 most frequent property sub types.
top_property_subtypes2 = Rents['Property Sub Type'].value_counts().head(20).index
filtered_Rents_subtype2 = Rents.loc[Rents['Property Sub Type'].isin(top_property_subtypes2)]

ax = sns.boxplot(x='Property Sub Type', y='Annual Amount', data=filtered_Rents_subtype2)
ax.set_title("Annual Amount by Top 20 Property Sub Types")
plt.xticks(rotation=90)
plt.show()

In [None]:
# List the distinct values of the 'Usage' column.
Rents['Usage'].unique()

In [None]:
# 'Annual Amount' distribution within each of the (up to) 10 most frequent usages.
top_usages = Rents['Usage'].value_counts().head(10).index
filtered_Rents_usage = Rents.loc[Rents['Usage'].isin(top_usages)]

ax = sns.boxplot(x='Usage', y='Annual Amount', data=filtered_Rents_usage)
ax.set_title("Annual Amount by Usages")
plt.xticks(rotation=90)
plt.show()

In [None]:
# How many of the most frequent categories to keep per column.
top_n = 10

# For each categorical column: its top_n most frequent values and the rows
# belonging to them. These frames are reused by the next cell.
top_property_types = Rents['Property Type'].value_counts().nlargest(top_n).index
filtered_rents_property_type = Rents[Rents['Property Type'].isin(top_property_types)]

top_property_sub_types = Rents['Property Sub Type'].value_counts().nlargest(top_n).index
filtered_rents_property_sub_type = Rents[Rents['Property Sub Type'].isin(top_property_sub_types)]

top_usage_types = Rents['Usage'].value_counts().nlargest(top_n).index
filtered_rents_usage = Rents[Rents['Usage'].isin(top_usage_types)]

# (column, filtered frame, palette for the plain boxplot)
category_views = [
    ('Property Type', filtered_rents_property_type, 'Set2'),
    ('Property Sub Type', filtered_rents_property_sub_type, 'Set1'),
    ('Usage', filtered_rents_usage, 'muted'),
]

# First pass: Annual Amount by category.
for category, category_rows, category_palette in category_views:
    plt.figure(figsize=(14, 7))
    sns.boxplot(x=category, y='Annual Amount', data=category_rows, palette=category_palette)
    plt.title(f"Annual Amount by {category} (Top {top_n} Categories)")
    plt.xticks(rotation=45)
    plt.show()

# Second pass: the same boxplots split further by 'Version'.
for category, category_rows, _ in category_views:
    plt.figure(figsize=(14, 7))
    sns.boxplot(x=category, y='Annual Amount', hue='Version', data=category_rows, palette='coolwarm')
    plt.title(f"Annual Amount by {category} and Version (Top {top_n} Categories)")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Same top-N category boxplots as the previous cell, now split by 'Area'.
# Relies on top_n and the filtered_rents_* frames built in the previous cell.
# NOTE(review): 'Area' is not restricted to the most frequent areas here, so
# every area present becomes its own hue level — confirm that is intended.
area_split_views = [
    ('Property Type', filtered_rents_property_type),
    ('Property Sub Type', filtered_rents_property_sub_type),
    ('Usage', filtered_rents_usage),
]

for category, category_rows in area_split_views:
    plt.figure(figsize=(14, 7))
    sns.boxplot(x=category, y='Annual Amount', hue='Area', data=category_rows, palette='coolwarm')
    plt.title(f"Annual Amount by {category} and Area (Top {top_n} Categories)")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Freehold analysis: compare the 'Annual Amount' distribution across the
# values of 'Is Free Hold?'.
ax = sns.boxplot(data=Rents, x='Is Free Hold?', y='Annual Amount')
ax.set_title("Annual Amount by Free Hold Status")
plt.show()


In [None]:

# Open the figure, then switch seaborn to the white-grid theme.
plt.figure(figsize=(8, 6))
sns.set_theme(style="whitegrid")

# Free-hold boxplot with outlier markers hidden and narrower boxes.
free_hold_box_style = dict(palette="pastel", width=0.6, showfliers=False)
ax = sns.boxplot(x='Is Free Hold?', y='Annual Amount', data=Rents, **free_hold_box_style)

# Log-scale the amount axis so the bulk of the distribution is readable.
ax.set_yscale("log")

# Bold axis labels and title.
ax.set_xlabel("Free Hold Status", fontsize=12, fontweight="bold")
ax.set_ylabel("Annual Amount (Log Scale)", fontsize=12, fontweight="bold")
ax.set_title("Impact of Free Hold Status on Annual Amount", fontsize=14, fontweight="bold")

plt.show()


In [None]:
# Frequency of each 'Is Free Hold?' value; only the two most common values
# are kept so stray categories do not clutter the plots. The filtered frame is
# reused by the next cell.
free_hold_status_counts = Rents['Is Free Hold?'].value_counts()
top_free_hold_status = free_hold_status_counts.nlargest(2).index
filtered_rents_free_hold = Rents[Rents['Is Free Hold?'].isin(top_free_hold_status)]

# (hue column or None, palette, figure size, title)
free_hold_views = [
    (None, 'pastel', (10, 6), "Annual Amount by Free Hold Status"),
    ('Version', 'coolwarm', (12, 7), "Annual Amount by Free Hold Status and Version"),
    ('Property Type', 'viridis', (12, 7), "Annual Amount by Free Hold Status and Property Type"),
]

# Plain boxplot first, then the same plot split by Version and by Property Type.
for split_column, split_palette, figure_size, plot_title in free_hold_views:
    plt.figure(figsize=figure_size)
    sns.boxplot(
        x='Is Free Hold?',
        y='Annual Amount',
        hue=split_column,
        data=filtered_rents_free_hold,
        palette=split_palette,
    )
    plt.title(plot_title)
    plt.show()


In [None]:
# Free hold boxplots split by three further columns, one figure per column.
# Relies on filtered_rents_free_hold from the previous cell.
# NOTE(review): 'Area' is not restricted to the most frequent areas, so every
# area present becomes its own hue level — confirm that is intended.
free_hold_splits = [
    ('Area', 'Set2'),
    ('Property Sub Type', 'Set3'),
    ('Usage', 'tab10'),
]

for split_column, split_palette in free_hold_splits:
    plt.figure(figsize=(12, 7))
    sns.boxplot(
        x='Is Free Hold?',
        y='Annual Amount',
        hue=split_column,
        data=filtered_rents_free_hold,
        palette=split_palette,
    )
    plt.title(f"Annual Amount by Free Hold Status and {split_column}")
    plt.show()

In [None]:
# Size and Rooms
# Analyze the influence of Property Size (sq.m) and Number of Rooms



In [None]:
# Size and rooms: scatter of 'Annual Amount' against 'Property Size (sq.m)',
# with points coloured by 'Usage'.
ax = sns.scatterplot(
    data=Rents,
    x='Property Size (sq.m)',
    y='Annual Amount',
    hue='Usage',
    palette='viridis',
    alpha=0.6,
)
ax.set_title("Annual Amount vs Property Size (sq.m)")
# Legend placed outside the axes so it does not cover the points.
ax.legend(title="Usage", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()




In [None]:

# Open the figure, then switch seaborn to the white-grid theme.
plt.figure(figsize=(8, 6))
sns.set_theme(style="whitegrid")

# Size vs. annual amount, coloured by 'Usage', markers drawn without outlines.
ax = sns.scatterplot(
    data=Rents,
    x='Property Size (sq.m)',
    y='Annual Amount',
    hue='Usage',
    palette='viridis',
    alpha=0.6,
    edgecolor=None,
)

# Log-scale the amount axis.
ax.set_yscale("log")

# Bold axis labels and title.
ax.set_xlabel("Property Size (sq.m)", fontsize=12, fontweight="bold")
ax.set_ylabel("Annual Amount (Log Scale)", fontsize=12, fontweight="bold")
ax.set_title("Annual Amount vs Property Size (sq.m)", fontsize=14, fontweight="bold")

# Legend placed outside the axes so it does not cover the points.
ax.legend(title="Usage", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()


In [None]:
# Box plot: Annual Amount by Number of Rooms (focus on top 10 most frequent)
top_room_counts = Rents['Number of Rooms'].value_counts().head(10).index

# Take an explicit copy of the filtered rows: writing into a boolean-mask slice
# of Rents triggers SettingWithCopyWarning and may or may not write through.
filtered_Rents_rooms = Rents[Rents['Number of Rooms'].isin(top_room_counts)].copy()

# Replace the column outright so it really becomes numeric. A `.loc[:, col] = ...`
# assignment writes into the existing column in place on recent pandas, which
# leaves an object-dtype column as object. Unparseable labels become NaN.
filtered_Rents_rooms['Number of Rooms'] = pd.to_numeric(
    filtered_Rents_rooms['Number of Rooms'], errors='coerce'
)

# Box plot
sns.boxplot(
    x='Number of Rooms', 
    y='Annual Amount', 
    data=filtered_Rents_rooms, 
    palette='coolwarm'
)
plt.title("Annual Amount by Top 10 Number of Rooms")
plt.xticks(rotation=45)
plt.show()


In [None]:

# Set a modern style
plt.figure(figsize=(8, 6))
sns.set_theme(style="whitegrid")

# Filter dataset to include only the top 10 most frequent room counts.
# An explicit copy is taken: writing into a boolean-mask slice of Rents
# triggers SettingWithCopyWarning and may or may not write through.
top_room_counts = Rents['Number of Rooms'].value_counts().head(10).index
filtered_Rents_rooms = Rents[Rents['Number of Rooms'].isin(top_room_counts)].copy()

# Replace the column outright so it really becomes numeric. A `.loc[:, col] = ...`
# assignment writes into the existing column in place on recent pandas, which
# leaves an object-dtype column as object. Unparseable labels become NaN.
filtered_Rents_rooms['Number of Rooms'] = pd.to_numeric(
    filtered_Rents_rooms['Number of Rooms'], errors='coerce'
)

# Create the box plot with enhancements
ax = sns.boxplot(
    x='Number of Rooms', 
    y='Annual Amount', 
    data=filtered_Rents_rooms, 
    palette='coolwarm', 
    width=0.6,
    showfliers=False  # Hide outliers for a cleaner look
)

# Use log scale for better visualization (optional)
ax.set_yscale("log")

# Improve labels and title
plt.xlabel("Number of Rooms", fontsize=12, fontweight="bold")
plt.ylabel("Annual Amount (Log Scale)", fontsize=12, fontweight="bold")
plt.title("Annual Amount by Top 10 Most Frequent Room Counts", fontsize=14, fontweight="bold")

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.show()


In [None]:
# Annual Amount vs Property Size (sq.m), drawn once per colouring column.
# Each entry: (hue column, palette, figure title). The legend is titled with
# the hue column and placed outside the axes.
scatter_views = [
    ('Usage', 'viridis', "Annual Amount vs Property Size (sq.m) by Usage"),
    ('Property Type', 'Set1', "Annual Amount vs Property Size (sq.m) by Property Type"),
    ('Version', 'cubehelix', "Annual Amount vs Property Size (sq.m) by Version"),
    ('Is Free Hold?', 'Paired', "Annual Amount vs Property Size (sq.m) by Free Hold Status"),
]

for hue_column, hue_palette, scatter_title in scatter_views:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x='Property Size (sq.m)',
        y='Annual Amount',
        data=Rents,
        hue=hue_column,
        alpha=0.6,
        palette=hue_palette,
    )
    plt.title(scatter_title)
    plt.legend(title=hue_column, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()


In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# (hue column, palette) for each panel of the grid
plot_params = [
    ("Usage", "viridis"),
    ("Property Type", "Set1"),
    ("Version", "cubehelix"),
    ("Is Free Hold?", "Paired")
]

# Font settings reused for both axis labels
axis_label_font = dict(fontsize=11, fontweight="bold")

# 2x2 grid, flattened so panels can be addressed by position
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
axes = axes.flatten()

for panel_index in range(min(len(axes), len(plot_params))):
    ax = axes[panel_index]
    hue_col, palette = plot_params[panel_index]

    sns.scatterplot(
        data=Rents,
        x='Property Size (sq.m)',
        y='Annual Amount',
        hue=hue_col,
        palette=palette,
        alpha=0.6,
        edgecolor=None,
        ax=ax
    )

    # Log-scaled y-axis on every panel
    ax.set_yscale("log")
    ax.set_title(f"Annual Amount vs Property Size (sq.m) by {hue_col}", fontsize=13, fontweight="bold")
    ax.set_xlabel("Property Size (sq.m)", **axis_label_font)
    ax.set_ylabel("Annual Amount (Log Scale)", **axis_label_font)
    ax.legend(title=hue_col, bbox_to_anchor=(1.05, 1), loc='upper left')

# Let matplotlib space the panels, then render
plt.tight_layout()
plt.show()


In [None]:
# Keep only rows whose 'Area' is among the ten most frequent values
top_areas = Rents['Area'].value_counts().index[:10]
in_top_area = Rents['Area'].isin(top_areas)
filtered_rents_area = Rents.loc[in_top_area]

plt.figure(figsize=(10, 6))
area_ax = sns.scatterplot(
    data=filtered_rents_area,
    x='Property Size (sq.m)',
    y='Annual Amount',
    hue='Area',
    palette='tab10',
    alpha=0.6,
)
area_ax.set_title("Annual Amount vs Property Size (sq.m) by Top 10 Areas")
# Legend sits outside the axes on the right
area_ax.legend(title="Area", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# Keep only rows whose 'Area' is among the ten most frequent values
top_areas = Rents['Area'].value_counts().index[:10]
filtered_rents_area = Rents.loc[Rents['Area'].isin(top_areas)]

# Scatter of rent against size, coloured by area
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=filtered_rents_area,
    x='Property Size (sq.m)',
    y='Annual Amount',
    hue='Area',
    palette='tab10',
    alpha=0.7,
    edgecolor=None
)

# Log-scaled y-axis
ax.set_yscale("log")

# Title and axis labels
bold_12 = dict(fontsize=12, fontweight="bold")
ax.set_title("Annual Amount vs Property Size (sq.m) by Top 10 Areas", fontsize=14, fontweight="bold")
ax.set_xlabel("Property Size (sq.m)", **bold_12)
ax.set_ylabel("Annual Amount (Log Scale)", **bold_12)

# Legend outside the axes on the right
ax.legend(title="Area", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()


In [None]:
# Keep only rows whose 'Property Sub Type' is among the ten most frequent values
top_property_sub_types = Rents['Property Sub Type'].value_counts().index[:10]
in_top_sub_type = Rents['Property Sub Type'].isin(top_property_sub_types)
filtered_rents_property_sub_type = Rents.loc[in_top_sub_type]

plt.figure(figsize=(10, 6))
sub_type_ax = sns.scatterplot(
    data=filtered_rents_property_sub_type,
    x='Property Size (sq.m)',
    y='Annual Amount',
    hue='Property Sub Type',
    palette='cool',
    alpha=0.6,
)
sub_type_ax.set_title("Annual Amount vs Property Size (sq.m) by Top 10 Property Sub Types")
# Legend outside the axes on the right
sub_type_ax.legend(title="Property Sub Type", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:


# Sample the data to reduce the number of points plotted.
# `DataFrame.sample` raises ValueError when n exceeds the number of rows
# (replace=False), so cap the request at the size of the frame.
sample_size = min(10000, len(Rents))  # Adjust the 10000 cap as needed
sampled_rents = Rents.sample(n=sample_size, random_state=42)

# Convert 'Property Size (sq.m)' and 'Annual Amount' to numeric; unparseable values become NaN
sampled_rents['Property Size (sq.m)'] = pd.to_numeric(sampled_rents['Property Size (sq.m)'], errors='coerce')
sampled_rents['Annual Amount'] = pd.to_numeric(sampled_rents['Annual Amount'], errors='coerce')

# Drop rows with NaN values resulting from the conversion (if any)
sampled_rents = sampled_rents.dropna(subset=['Property Size (sq.m)', 'Annual Amount'])

# log1p (log(1 + x)) transformation of both axes for plotting
sampled_rents['Log Property Size'] = np.log1p(sampled_rents['Property Size (sq.m)'])
sampled_rents['Log Annual Amount'] = np.log1p(sampled_rents['Annual Amount'])

# Scatter plot: Annual Amount vs Property Size (sq.m) with log-transformed values
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='Log Property Size',
    y='Log Annual Amount',
    data=sampled_rents,
    hue='Usage',  # Colour points by the 'Usage' column
    alpha=0.6,
    palette='viridis',
    s=50  # Marker size
)
plt.title("Annual Amount vs Property Size (sq.m) - Log Transformed")
plt.legend(title="Usage", bbox_to_anchor=(1.05, 1), loc='upper left')  # Legend outside the axes
plt.show()


In [None]:
'''
Sampling is especially useful when dealing with a large dataset. By sampling, you reduce the number of points plotted, 
making the visualization more manageable while retaining a representative subset. 
Additionally, the log transformation is an excellent choice for visualizing skewed data like property size and annual amounts.
'''

In [None]:
# Sample the data to reduce the number of points plotted.
# `DataFrame.sample` raises ValueError when n exceeds the number of rows
# (replace=False), so cap the request at the size of the frame.
sample_size = min(10000, len(Rents))  # Adjust the 10000 cap as needed
sampled_rents = Rents.sample(n=sample_size, random_state=42)

# Convert necessary columns to numeric; unparseable values become NaN
sampled_rents['Property Size (sq.m)'] = pd.to_numeric(sampled_rents['Property Size (sq.m)'], errors='coerce')
sampled_rents['Annual Amount'] = pd.to_numeric(sampled_rents['Annual Amount'], errors='coerce')

# Drop rows with NaN values resulting from the conversion
sampled_rents = sampled_rents.dropna(subset=['Property Size (sq.m)', 'Annual Amount'])

# log1p (log(1 + x)) transformation of both axes for plotting
sampled_rents['Log Property Size'] = np.log1p(sampled_rents['Property Size (sq.m)'])
sampled_rents['Log Annual Amount'] = np.log1p(sampled_rents['Annual Amount'])

# One log-log scatter plot per feature, with that feature used as the hue
features_to_plot = ['Usage', 'Property Type', 'Area', 'Property Sub Type', 'Number of Rooms', 'Is Free Hold?']
for feature in features_to_plot:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x='Log Property Size',
        y='Log Annual Amount',
        data=sampled_rents,
        hue=feature,
        alpha=0.6,
        palette='viridis',
        s=50
    )
    plt.title(f"Annual Amount vs Property Size (sq.m) - Log Transformed by {feature}")
    plt.legend(title=feature, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()


In [None]:
# Proximity to amenities: Nearest Metro, Nearest Mall and Nearest Landmark
# This cell covers Nearest Metro.

# Keep only rows whose nearest metro is among the ten most frequent stations
top_nearest_metro = Rents['Nearest Metro'].value_counts().index[:10]
in_top_metro = Rents['Nearest Metro'].isin(top_nearest_metro)
filtered_Rents_metro = Rents.loc[in_top_metro]

metro_ax = sns.boxplot(
    data=filtered_Rents_metro,
    x='Nearest Metro',
    y='Annual Amount',
    palette='coolwarm',
)
metro_ax.set_title("Annual Amount by Top 10 Nearest Metro Stations")
plt.xticks(rotation=90)
plt.show()

In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# Keep only rows whose nearest metro is among the ten most frequent stations
top_nearest_metro = Rents['Nearest Metro'].value_counts().index[:10]
filtered_Rents_metro = Rents.loc[Rents['Nearest Metro'].isin(top_nearest_metro)]

# Box plot of annual rent per station; fliers are not drawn
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=filtered_Rents_metro,
    x='Nearest Metro',
    y='Annual Amount',
    palette='coolwarm',
    width=0.6,
    showfliers=False
)

# Log-scaled y-axis
ax.set_yscale("log")

# Labels and title
bold_12 = dict(fontsize=12, fontweight="bold")
ax.set_xlabel("Nearest Metro Station", **bold_12)
ax.set_ylabel("Annual Amount (Log Scale)", **bold_12)
ax.set_title("Annual Amount by Top 10 Nearest Metro Stations", fontsize=14, fontweight="bold")

# Slant the station names so they do not overlap
plt.xticks(rotation=45, ha="right")

plt.show()


In [None]:
# Nearest Mall: keep only rows whose nearest mall is among the ten most frequent
top_nearest_mall = Rents['Nearest Mall'].value_counts().index[:10]
in_top_mall = Rents['Nearest Mall'].isin(top_nearest_mall)
filtered_Rents_mall = Rents.loc[in_top_mall]

mall_ax = sns.boxplot(
    data=filtered_Rents_mall,
    x='Nearest Mall',
    y='Annual Amount',
    palette='viridis',
)
mall_ax.set_title("Annual Amount by Nearest Malls")
plt.xticks(rotation=45)
plt.show()

In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# Keep only rows whose nearest mall is among the ten most frequent
top_nearest_mall = Rents['Nearest Mall'].value_counts().index[:10]
filtered_Rents_mall = Rents.loc[Rents['Nearest Mall'].isin(top_nearest_mall)]

# Box plot of annual rent per mall; fliers are not drawn
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=filtered_Rents_mall,
    x='Nearest Mall',
    y='Annual Amount',
    palette='viridis',
    width=0.6,
    showfliers=False
)

# Log-scaled y-axis
ax.set_yscale("log")

# Labels and title
bold_12 = dict(fontsize=12, fontweight="bold")
ax.set_xlabel("Nearest Mall", **bold_12)
ax.set_ylabel("Annual Amount (Log Scale)", **bold_12)
ax.set_title("Annual Amount by Top 10 Nearest Malls", fontsize=14, fontweight="bold")

# Slant the mall names so they do not overlap
plt.xticks(rotation=45, ha="right")

plt.show()


In [None]:
# Nearest Landmark: keep only rows whose nearest landmark is among the ten most frequent
top_nearest_landmark = Rents['Nearest Landmark'].value_counts().index[:10]
in_top_landmark = Rents['Nearest Landmark'].isin(top_nearest_landmark)
filtered_Rents_landmark = Rents.loc[in_top_landmark]

landmark_ax = sns.boxplot(
    data=filtered_Rents_landmark,
    x='Nearest Landmark',
    y='Annual Amount',
    palette='magma',
)
landmark_ax.set_title("Annual Amount by Top 10 Nearest Landmarks")
plt.xticks(rotation=90)
plt.show()

In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# Keep only rows whose nearest landmark is among the ten most frequent
top_nearest_landmark = Rents['Nearest Landmark'].value_counts().index[:10]
filtered_Rents_landmark = Rents.loc[Rents['Nearest Landmark'].isin(top_nearest_landmark)]

# Box plot of annual rent per landmark; fliers are not drawn
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=filtered_Rents_landmark,
    x='Nearest Landmark',
    y='Annual Amount',
    palette='magma',
    width=0.6,
    showfliers=False
)

# Log-scaled y-axis
ax.set_yscale("log")

# Labels and title
bold_12 = dict(fontsize=12, fontweight="bold")
ax.set_xlabel("Nearest Landmark", **bold_12)
ax.set_ylabel("Annual Amount (Log Scale)", **bold_12)
ax.set_title("Annual Amount by Top 10 Nearest Landmarks", fontsize=14, fontweight="bold")

# Slant the landmark names so they do not overlap
plt.xticks(rotation=45, ha="right")

plt.show()


In [None]:
# Sample the data to reduce the number of points plotted.
# `DataFrame.sample` raises ValueError when n exceeds the number of rows
# (replace=False), so cap the request at the size of the frame.
sample_size = min(10000, len(Rents))  # Adjust the 10000 cap as needed
sampled_rents = Rents.sample(n=sample_size, random_state=42)

# Convert necessary columns to numeric; unparseable values become NaN
sampled_rents['Property Size (sq.m)'] = pd.to_numeric(sampled_rents['Property Size (sq.m)'], errors='coerce')
sampled_rents['Annual Amount'] = pd.to_numeric(sampled_rents['Annual Amount'], errors='coerce')

# Drop rows with NaN values resulting from the conversion
sampled_rents = sampled_rents.dropna(subset=['Property Size (sq.m)', 'Annual Amount'])

# log1p (log(1 + x)) transformation of both axes for plotting
sampled_rents['Log Property Size'] = np.log1p(sampled_rents['Property Size (sq.m)'])
sampled_rents['Log Annual Amount'] = np.log1p(sampled_rents['Annual Amount'])

# One log-log scatter plot per feature, with that feature used as the hue
features_to_plot = ['Nearest Metro', 'Nearest Mall', 'Nearest Landmark', 'Parking', 'No of Units']
for feature in features_to_plot:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x='Log Property Size',
        y='Log Annual Amount',
        data=sampled_rents,
        hue=feature,
        alpha=0.6,
        palette='viridis',
        s=50
    )
    plt.title(f"Annual Amount vs Property Size (sq.m) - Log Transformed by {feature}")
    plt.legend(title=feature, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()


In [None]:
# Proximity to amenities: box plots of Annual Amount for the ten most frequent
# values of Nearest Metro, Nearest Mall and Nearest Landmark.

# Build the three filtered frames first; each keeps only the rows whose value
# is among the ten most frequent for that column.
top_nearest_metro = Rents['Nearest Metro'].value_counts().index[:10]
filtered_Rents_metro = Rents.loc[Rents['Nearest Metro'].isin(top_nearest_metro)]

top_nearest_mall = Rents['Nearest Mall'].value_counts().index[:10]
filtered_Rents_mall = Rents.loc[Rents['Nearest Mall'].isin(top_nearest_mall)]

top_nearest_landmark = Rents['Nearest Landmark'].value_counts().index[:10]
filtered_Rents_landmark = Rents.loc[Rents['Nearest Landmark'].isin(top_nearest_landmark)]

# (x column, filtered frame, figure title) for each plot, drawn in this order
amenity_plots = [
    ('Nearest Metro', filtered_Rents_metro, "Annual Amount by Top 10 Nearest Metro Stations"),
    ('Nearest Mall', filtered_Rents_mall, "Annual Amount by Top 10 Nearest Malls"),
    ('Nearest Landmark', filtered_Rents_landmark, "Annual Amount by Top 10 Nearest Landmarks"),
]

for amenity_column, amenity_frame, amenity_title in amenity_plots:
    sns.boxplot(
        data=amenity_frame,
        x=amenity_column,
        y='Annual Amount',
        palette='coolwarm',
    )
    plt.title(amenity_title)
    plt.xticks(rotation=90)
    plt.show()


In [None]:
# List the distinct values present in the 'Parking' column.
Rents['Parking'].unique()

In [None]:
# Parking and Units: how do 'Parking' and 'No of Units' relate to annual rent?
# This cell covers Parking.

# Keep only rows whose 'Parking' value is among the ten most frequent
top_parking_values = Rents['Parking'].value_counts().index[:10]
in_top_parking = Rents['Parking'].isin(top_parking_values)
filtered_Rents_parking = Rents.loc[in_top_parking]

parking_ax = sns.scatterplot(
    data=filtered_Rents_parking,
    x='Parking',
    y='Annual Amount',
    hue='Parking',
    palette='plasma',
    alpha=0.7,
)
parking_ax.set_title("Annual Amount vs Top 10 Parking Spaces")
# Legend outside the axes on the right
parking_ax.legend(title="Parking Spaces", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# Keep only rows whose 'Parking' value is among the ten most frequent
top_parking_values = Rents['Parking'].value_counts().index[:10]
filtered_Rents_parking = Rents.loc[Rents['Parking'].isin(top_parking_values)]

# Scatter of annual rent per parking value, coloured by the same column
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=filtered_Rents_parking,
    x='Parking',
    y='Annual Amount',
    hue='Parking',
    palette='plasma',
    alpha=0.7,
    edgecolor=None
)

# Labels and title
bold_12 = dict(fontsize=12, fontweight="bold")
ax.set_xlabel("Parking Spaces", **bold_12)
ax.set_ylabel("Annual Amount", **bold_12)
ax.set_title("Annual Amount vs Top 10 Parking Spaces", fontsize=14, fontweight="bold")

# Legend outside the axes on the right
ax.legend(title="Parking Spaces", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()


In [None]:
# List the distinct values present in the 'No of Units' column.
Rents['No of Units'].unique()

In [None]:
# Number of Units: keep only rows whose 'No of Units' is among the ten most frequent values
top_units_values = Rents['No of Units'].value_counts().index[:10]
in_top_units = Rents['No of Units'].isin(top_units_values)
filtered_Rents_units = Rents.loc[in_top_units]

units_ax = sns.boxplot(
    data=filtered_Rents_units,
    x='No of Units',
    y='Annual Amount',
    hue='No of Units',
    palette='cubehelix',
)
units_ax.set_title("Annual Amount by Top 10 Number of Units")
plt.xticks(rotation=45)
plt.show()

In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# Keep only rows whose 'No of Units' is among the ten most frequent values
top_units_values = Rents['No of Units'].value_counts().index[:10]
filtered_Rents_units = Rents.loc[Rents['No of Units'].isin(top_units_values)]

# Box plot of annual rent per unit count; fliers are not drawn
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=filtered_Rents_units,
    x='No of Units',
    y='Annual Amount',
    hue='No of Units',
    palette='cubehelix',
    width=0.6,
    showfliers=False
)

# Log-scaled y-axis
ax.set_yscale("log")

# Labels and title
bold_12 = dict(fontsize=12, fontweight="bold")
ax.set_xlabel("Number of Units", **bold_12)
ax.set_ylabel("Annual Amount (Log Scale)", **bold_12)
ax.set_title("Annual Amount by Top 10 Number of Units", fontsize=14, fontweight="bold")

# Slant the tick labels so they do not overlap
plt.xticks(rotation=45, ha="right")

plt.show()


In [None]:
# List the distinct values present in the 'Master Project' column.
Rents['Master Project'].unique()

In [None]:
# List the distinct values present in the 'Project' column.
Rents['Project'].unique()

In [None]:
# Projects: keep only rows whose 'Project' is among the ten most frequent values
top_project_values = Rents['Project'].value_counts().index[:10]
in_top_project = Rents['Project'].isin(top_project_values)
filtered_Rents_project = Rents.loc[in_top_project]

project_ax = sns.boxplot(
    data=filtered_Rents_project,
    x='Project',
    y='Annual Amount',
    palette='Set2',
)
project_ax.set_title("Annual Amount by Top 10 Projects")
plt.xticks(rotation=90)
plt.show()


In [None]:

# Shared seaborn theme
sns.set_theme(style="whitegrid")

# Keep only rows whose 'Project' is among the ten most frequent values
top_project_values = Rents['Project'].value_counts().index[:10]
filtered_Rents_project = Rents.loc[Rents['Project'].isin(top_project_values)]

# Box plot of annual rent per project
plt.figure(figsize=(12, 6))
project_ax = sns.boxplot(
    data=filtered_Rents_project,
    x='Project',
    y='Annual Amount',
    palette='Set2',
    width=0.6
)

# Labels and title
bold_12 = dict(fontsize=12, fontweight="bold")
project_ax.set_xlabel("Project", **bold_12)
project_ax.set_ylabel("Annual Amount", **bold_12)
project_ax.set_title("Annual Amount by Top 10 Projects", fontsize=14, fontweight="bold")

# Vertical project names, right-aligned
plt.xticks(rotation=90, ha="right")

plt.show()


In [None]:
# Master Projects: keep only rows whose 'Master Project' is among the ten most frequent values
top_master_project_values = Rents['Master Project'].value_counts().index[:10]
in_top_master_project = Rents['Master Project'].isin(top_master_project_values)
filtered_Rents_master_project = Rents.loc[in_top_master_project]

master_project_ax = sns.boxplot(
    data=filtered_Rents_master_project,
    x='Master Project',
    y='Annual Amount',
    palette='Set2',
)
master_project_ax.set_title("Annual Amount by Top 10 Master Projects")
plt.xticks(rotation=45)
plt.show()


In [None]:

# Coerce the amount and size columns to numbers; unparseable entries become NaN
numeric_columns = ['Contract Amount', 'Annual Amount', 'Property Size (sq.m)']
Rents[numeric_columns] = Rents[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Discard rows that lack any of the three values
Rents = Rents.dropna(subset=numeric_columns)

# Add a log1p-transformed companion column for each numeric column
log_columns = [f'Log {col}' for col in numeric_columns]
for col, log_col in zip(numeric_columns, log_columns):
    Rents[log_col] = np.log1p(Rents[col])

# One histogram (with KDE) per raw and per log-transformed column
all_columns = numeric_columns + log_columns
for col in all_columns:
    plt.figure(figsize=(8, 5))
    hist_ax = sns.histplot(Rents[col], kde=True, bins=30, color='skyblue', edgecolor='black')
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frequency")
    plt.show()

# Two overlay figures: the raw columns together, then the log columns together.
# Each entry is (columns, colours, title, x-axis label).
overlay_specs = [
    (numeric_columns, ['red', 'green', 'blue'],
     "Combined Distribution of Numeric Columns", "Value"),
    (log_columns, ['purple', 'orange', 'cyan'],
     "Combined Distribution of Log-Transformed Columns", "Log Value"),
]
for overlay_columns, overlay_colors, overlay_title, overlay_xlabel in overlay_specs:
    plt.figure(figsize=(10, 6))
    for col, color in zip(overlay_columns, overlay_colors):
        sns.histplot(Rents[col], kde=True, bins=30, color=color, label=col, alpha=0.5)
    plt.title(overlay_title)
    plt.xlabel(overlay_xlabel)
    plt.ylabel("Frequency")
    plt.legend(title="Columns")
    plt.show()


In [None]:
# Bar chart of the ten most frequent categories for each categorical column
categorical_columns = ['Property Sub Type', 'Usage', 'Is Free Hold?']
for col in categorical_columns:
    plt.figure(figsize=(8, 5))
    top_category_counts = Rents[col].value_counts().head(10)
    top_category_counts.plot(kind='bar')
    bar_ax = plt.gca()
    bar_ax.set_title(f"Top 10 Categories in {col}")
    bar_ax.set_xlabel(col)
    bar_ax.set_ylabel("Count")
    plt.show()


In [None]:
# Ensure 'Registration Date' is in datetime format (unparseable dates become NaT)
Rents['Registration Date'] = pd.to_datetime(Rents['Registration Date'], errors='coerce')

# Extract month and week periods from the registration date
Rents['Registration Month'] = Rents['Registration Date'].dt.to_period('M')
Rents['Registration Week'] = Rents['Registration Date'].dt.to_period('W')

# Previous-period average Contract Amount per Property Type.
# The earlier version shifted rows *inside* each (type, period) group, which
# yields the previous row's amount from the same period rather than the average
# of the preceding period. Here the mean is computed per (type, period), re-keyed
# to the period that follows it, and joined back onto the rows.
price_group_keys = ['Property Type']
for period_col, target_col in [
    ('Registration Month', 'Prev Month Avg Price'),
    ('Registration Week', 'Prev Week Avg Price'),
]:
    join_keys = price_group_keys + [period_col]

    # Mean contract amount per group and period (rows with missing keys are
    # left out by groupby's default dropna behaviour)
    period_means = (
        Rents.groupby(join_keys)['Contract Amount']
        .mean()
        .rename(target_col)
        .reset_index()
    )

    # Shift each mean forward one period so that joining on period P returns
    # the average of period P - 1
    period_means[period_col] = period_means[period_col] + 1

    # A left merge keeps one output row per input row, in the same order
    matched = Rents[join_keys].merge(period_means, on=join_keys, how='left')

    # Rows without a preceding period (or with missing keys) get 0, as before.
    # Assigning the column directly avoids chained `fillna(..., inplace=True)`.
    Rents[target_col] = matched[target_col].fillna(0).to_numpy()

# Check the results
print(Rents[['Registration Date', 'Property Type', 'Contract Amount', 
             'Prev Month Avg Price', 'Prev Week Avg Price']].head())


In [None]:
# Contract Amount against Prev Month Avg Price, coloured by Property Type
prev_month_ax = sns.scatterplot(
    data=Rents,
    x='Prev Month Avg Price',
    y='Contract Amount',
    hue='Property Type',
    palette='viridis',
    alpha=0.7,
)
prev_month_ax.set_title("Contract Amount vs. Previous Month Avg Price by Property Type")
prev_month_ax.set_xlabel("Previous Month Avg Price")
prev_month_ax.set_ylabel("Contract Amount")
# Legend outside the axes on the right
prev_month_ax.legend(title="Property Type", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Distribution of the 'Prev Week Avg Price' column (30 bins, with KDE overlay)
prev_week_prices = Rents['Prev Week Avg Price']
prev_week_ax = sns.histplot(prev_week_prices, bins=30, kde=True, color='blue')
prev_week_ax.set_title("Distribution of Previous Week Avg Price")
prev_week_ax.set_xlabel("Previous Week Avg Price")
prev_week_ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Correlation for Numeric Features

In [None]:
# Report, for each column, the distinct values that do not convert to a number
problematic_columns = ['Property Size (sq.m)', 'Number of Rooms', 'Parking', 'No of Units']

for col in problematic_columns:
    print(f"Non-numeric values in {col}:")
    # Convert the whole column in one vectorised call instead of calling
    # pd.to_numeric once per element through Series.apply. Values that fail to
    # convert (and values that were already missing) come back as NaN.
    not_convertible = pd.to_numeric(Rents[col], errors='coerce').isna()
    print(Rents.loc[not_convertible, col].unique())
    print("-" * 40)


In [None]:
# NOTE: everything below is inside a triple-quoted string literal, so none of
# these cleaning statements execute; 'Property Size (sq.m)', 'Parking' and
# 'No of Units' are left untouched by this cell.
'''
# Clean 'Property Size (sq.m)' and 'Parking' columns
Rents['Property Size (sq.m)'] = Rents['Property Size (sq.m)'].replace({r'[^\d.]': ''}, regex=True)
Rents['Parking'] = Rents['Parking'].replace({r'[^\d.]': ''}, regex=True)

# Clean 'No of Units' if necessary (for example, remove any text if present)
Rents['No of Units'] = Rents['No of Units'].replace({r'[^\d]': ''}, regex=True)

# Convert these columns to numeric (this will set any invalid values to NaN)
Rents['Property Size (sq.m)'] = pd.to_numeric(Rents['Property Size (sq.m)'], errors='coerce')
Rents['Parking'] = pd.to_numeric(Rents['Parking'], errors='coerce')
Rents['No of Units'] = pd.to_numeric(Rents['No of Units'], errors='coerce')

# Verify the data
print(Rents[['Property Size (sq.m)', 'Parking', 'No of Units']].head())
'''

In [None]:
# Correlation matrix for numeric columns
numeric_columns = ['Annual Amount', 'Property Size (sq.m)', 'Number of Rooms', 'Parking', 'No of Units']

# Some of these columns may still hold text (the notebook checks them for
# non-numeric values and the cleaning step is disabled), and DataFrame.corr
# cannot work with string columns. Coerce a local copy to numbers first;
# unparseable values become NaN and are ignored pairwise by corr().
numeric_features = Rents[numeric_columns].apply(pd.to_numeric, errors='coerce')
correlation_matrix = numeric_features.corr()

# Visualize correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix of Numeric Features")
plt.show()

# Scatter plots for key relationships, drawn from a sample of at most 5000 rows.
# The cap prevents the ValueError that sample() raises when asked for more rows
# than the frame contains.
sampled_Rents = Rents.sample(min(5000, len(Rents)), random_state=42)

for col in ['Property Size (sq.m)', 'Number of Rooms', 'Parking', 'No of Units']:
    # Coerce per column so text values do not turn the x-axis into categories
    sns.scatterplot(
        x=pd.to_numeric(sampled_Rents[col], errors='coerce'),
        y=sampled_Rents['Annual Amount']
    )
    plt.title(f"{col} vs Annual Amount")
    plt.show()


In [None]:
# Display the correlation matrix as a table.
correlation_matrix

In [None]:
# Correlation matrix for numeric columns
numeric_columns = ['Annual Amount', 'Property Size (sq.m)', 'Number of Rooms', 'Parking', 'No of Units']

# Some of these columns may still hold text (the notebook checks them for
# non-numeric values and the cleaning step is disabled), and DataFrame.corr
# cannot work with string columns. Coerce a local copy to numbers first;
# unparseable values become NaN and are ignored pairwise by corr().
numeric_features = Rents[numeric_columns].apply(pd.to_numeric, errors='coerce')
correlation_matrix = numeric_features.corr()

# Visualize correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix of Numeric Features")
plt.show()

# Scatter plots for key relationships, drawn from a sample of at most 5000 rows.
# The cap prevents the ValueError that sample() raises when asked for more rows
# than the frame contains.
sampled_Rents = Rents.sample(min(5000, len(Rents)), random_state=42)

for col in ['Property Size (sq.m)', 'Number of Rooms', 'Parking', 'No of Units']:
    # Coerce per column so text values do not turn the x-axis into categories
    sns.scatterplot(
        x=pd.to_numeric(sampled_Rents[col], errors='coerce'),
        y=sampled_Rents['Annual Amount']
    )
    plt.title(f"{col} vs Annual Amount")
    plt.show()


In [None]:
# Ensure 'Registration Date' is in datetime format (unparseable dates become NaT)
Rents['Registration Date'] = pd.to_datetime(Rents['Registration Date'], errors='coerce')

# Extract month and week periods from the registration date
Rents['Registration Month'] = Rents['Registration Date'].dt.to_period('M')
Rents['Registration Week'] = Rents['Registration Date'].dt.to_period('W')

# Define grouping columns
grouping_columns = ['Property Type', 'Usage', 'Area', 'Property Sub Type', 'Number of Rooms', 'Is Free Hold?']

# Previous-period average Contract Amount per combination of grouping columns.
# The earlier version shifted rows *inside* each (group, period) bucket, which
# yields the previous row's amount from the same period rather than the average
# of the preceding period. Here the mean is computed per (group, period),
# re-keyed to the period that follows it, and joined back onto the rows.
for period_col, target_col in [
    ('Registration Month', 'Prev Month Avg Price'),
    ('Registration Week', 'Prev Week Avg Price'),
]:
    join_keys = grouping_columns + [period_col]

    # Mean contract amount per group and period (rows with a missing value in
    # any key are left out by groupby's default dropna behaviour)
    period_means = (
        Rents.groupby(join_keys)['Contract Amount']
        .mean()
        .rename(target_col)
        .reset_index()
    )

    # Shift each mean forward one period so that joining on period P returns
    # the average of period P - 1
    period_means[period_col] = period_means[period_col] + 1

    # A left merge keeps one output row per input row, in the same order
    matched = Rents[join_keys].merge(period_means, on=join_keys, how='left')

    # Rows without a preceding period (or with missing keys) get 0, as before.
    # Assigning the column directly avoids chained `fillna(..., inplace=True)`.
    Rents[target_col] = matched[target_col].fillna(0).to_numpy()

# Check the results
print(Rents[['Registration Date', 'Property Type', 'Usage', 'Area', 'Property Sub Type', 
             'Number of Rooms', 'Is Free Hold?', 'Contract Amount', 
             'Prev Month Avg Price', 'Prev Week Avg Price']].head())


In [None]:
# Contract Amount against Prev Week Avg Price, encoding three attributes at once:
# colour = Property Type, marker style = Usage, marker size = Number of Rooms
plt.figure(figsize=(12, 6))
characteristics_ax = sns.scatterplot(
    data=Rents,
    x='Prev Week Avg Price',
    y='Contract Amount',
    hue='Property Type',
    style='Usage',
    size='Number of Rooms',
    palette='viridis',
    alpha=0.7,
)
characteristics_ax.set_title("Contract Amount vs. Previous Week Avg Price by Property Characteristics")
characteristics_ax.set_xlabel("Previous Week Avg Price")
characteristics_ax.set_ylabel("Contract Amount")
# Legend outside the axes on the right
characteristics_ax.legend(title="Property Characteristics", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Scatter plot: Contract Amount vs Previous Week Avg Price by Property Type
plt.figure(figsize=(12, 6))
sns.scatterplot(
    x='Prev Week Avg Price',
    y='Contract Amount',
    hue='Property Type',
    data=Rents,
    alpha=0.7,
    palette='Set1',
    s=100  # Marker size
)
plt.title("Contract Amount vs Previous Week Avg Price by Property Type")
plt.xlabel("Previous Week Avg Price")
plt.ylabel("Contract Amount")
plt.legend(title="Property Type", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()

# Scatter plot: Contract Amount vs Previous Week Avg Price by Usage
plt.figure(figsize=(12, 6))
sns.scatterplot(
    x='Prev Week Avg Price',
    y='Contract Amount',
    hue='Usage',
    data=Rents,
    alpha=0.7,
    palette='coolwarm',
    s=100
)
plt.title("Contract Amount vs Previous Week Avg Price by Usage")
plt.xlabel("Previous Week Avg Price")
plt.ylabel("Contract Amount")
plt.legend(title="Usage", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()

# Scatter plot: Contract Amount vs Previous Week Avg Price by Number of Rooms
plt.figure(figsize=(12, 6))
sns.scatterplot(
    x='Prev Week Avg Price',
    y='Contract Amount',
    # seaborn ignores `palette` unless a `hue` variable is assigned, so map the
    # room count to colour as well as to marker size to make 'Blues' take effect
    hue='Number of Rooms',
    size='Number of Rooms',
    data=Rents,
    sizes=(20, 200),  # Range of marker sizes
    alpha=0.7,
    palette='Blues'
)
plt.title("Contract Amount vs Previous Week Avg Price by Number of Rooms")
plt.xlabel("Previous Week Avg Price")
plt.ylabel("Contract Amount")
plt.legend(title="Number of Rooms", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()


In [None]:
# Histogram for Previous Month Avg Price by Free Hold Status
plt.figure(figsize=(10, 5))
freehold_ax = sns.histplot(
    data=Rents,
    x='Prev Month Avg Price',
    hue='Is Free Hold?',
    kde=True,
    bins=30,
    palette='muted'
)
plt.title("Distribution of Previous Month Avg Price by Free Hold Status")
plt.xlabel("Previous Month Avg Price")
plt.ylabel("Frequency")

# histplot builds its own hue legend. Calling plt.legend() afterwards would
# replace it with a new legend that has no entries, so retitle the existing
# legend instead of creating another one.
freehold_legend = freehold_ax.get_legend()
if freehold_legend is not None:
    freehold_legend.set_title("Is Free Hold?")
plt.show()


In [None]:
# Box plot of 'Prev Month Avg Price' for each 'Usage' value

plt.figure(figsize=(12, 7))
usage_ax = sns.boxplot(
    data=Rents,
    x='Usage',
    y='Prev Month Avg Price',
    palette='coolwarm',
)
usage_ax.set_title("Previous Month Avg Price by Usage")
plt.xticks(rotation=45)
usage_ax.set_xlabel("Usage")
usage_ax.set_ylabel("Previous Month Avg Price")
plt.show()


In [None]:
# Keep only rows whose 'Area' is among the ten most frequent values
top_areas = Rents['Area'].value_counts().index[:10]
in_top_area = Rents['Area'].isin(top_areas)
filtered_rents_area = Rents.loc[in_top_area]

plt.figure(figsize=(12, 7))
area_box_ax = sns.boxplot(
    data=filtered_rents_area,
    x='Area',
    y='Prev Month Avg Price',
    palette='viridis',
)
area_box_ax.set_title("Previous Month Avg Price by Top 10 Areas")
plt.xticks(rotation=45)
area_box_ax.set_xlabel("Area")
area_box_ax.set_ylabel("Previous Month Avg Price")
plt.show()


In [None]:
# Keep only rows whose 'Property Sub Type' is among the ten most frequent values
top_property_sub_types = Rents['Property Sub Type'].value_counts().index[:10]
in_top_sub_type = Rents['Property Sub Type'].isin(top_property_sub_types)
filtered_rents_property_sub_type = Rents.loc[in_top_sub_type]

plt.figure(figsize=(12, 7))
sub_type_box_ax = sns.boxplot(
    data=filtered_rents_property_sub_type,
    x='Property Sub Type',
    y='Prev Month Avg Price',
    palette='plasma',
)
sub_type_box_ax.set_title("Previous Month Avg Price by Top 10 Property Sub Types")
plt.xticks(rotation=45)
sub_type_box_ax.set_xlabel("Property Sub Type")
sub_type_box_ax.set_ylabel("Previous Month Avg Price")
plt.show()


In [None]:
# Box plot: Prev Month Avg Price grouped by Number of Rooms
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(
    data=Rents,
    x='Number of Rooms',
    y='Prev Month Avg Price',
    palette='cubehelix',
    ax=ax,
)
ax.set_title("Previous Month Avg Price by Number of Rooms")
ax.set_xlabel("Number of Rooms")
ax.set_ylabel("Previous Month Avg Price")
plt.show()


In [None]:
# Box plot: Prev Month Avg Price grouped by Free Hold status
plt.figure(figsize=(12, 7))
ax = sns.boxplot(data=Rents, x='Is Free Hold?', y='Prev Month Avg Price', palette='viridis')
ax.set_title("Previous Month Avg Price by Free Hold Status")
ax.set_xlabel("Is Free Hold?")
ax.set_ylabel("Previous Month Avg Price")
plt.show()


In [None]:
# List the distinct values found in the 'Property Type' column of Rents
Rents['Property Type'].unique()

In [None]:
# List the distinct values found in the 'Property Sub Type' column of Rents
Rents['Property Sub Type'].unique()

In [None]:
# List the distinct values found in the 'Usage' column of Rents
Rents['Usage'].unique()

In [None]:
# List the distinct values found in the 'Is Free Hold?' column of Rents
Rents['Is Free Hold?'].unique()

In [None]:
# List the distinct values found in the 'Area' column of Rents
Rents['Area'].unique()

In [None]:
# List the distinct values found in the 'Number of Rooms' column of Rents
Rents['Number of Rooms'].unique()

In [None]:
# List the distinct values found in the 'Version' column of Rents
Rents['Version'].unique()

In [None]:
# List the distinct values found in the 'Project' column of Rents
Rents['Project'].unique()

In [None]:
# List the distinct values found in the 'Master Project' column of Rents
Rents['Master Project'].unique()

In [None]:
# List the distinct values found in the 'No of Units' column of Rents
Rents['No of Units'].unique()

In [None]:
# List the distinct values found in the 'Parking' column of Rents
Rents['Parking'].unique()

In [None]:
# One box plot of Annual Amount per categorical feature
categorical_columns = ['Property Type', 'Property Sub Type', 'Usage', 'Is Free Hold?', 'Area']
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=Rents[col], y=Rents['Annual Amount'], ax=ax)
    ax.set_title(f"{col} vs Annual Amount")
    plt.xticks(rotation=90)  # vertical labels: some features have many categories
    plt.show()


In [None]:

# Columns whose (log-scaled) distributions are plotted below
numeric_columns = ['Contract Amount', 'Annual Amount', 'Property Size (sq.m)']

# Coerce each column to numeric; values that cannot be parsed become NaN
for col in numeric_columns:
    Rents[col] = pd.to_numeric(Rents[col], errors='coerce')

# Drop rows missing any of the three values. The explicit .copy() makes
# Rents_cleaned an independent frame, so the assignments below neither warn
# about chained assignment nor risk writing through to Rents.
Rents_cleaned = Rents.dropna(subset=numeric_columns).copy()

# log1p compresses the long right tail of skewed amounts and sizes
Rents_cleaned[numeric_columns] = np.log1p(Rents_cleaned[numeric_columns])

# Histogram (with KDE) of each transformed column; labels say log1p because
# the plotted values are no longer in the original units
for col in numeric_columns:
    plt.figure(figsize=(8, 5))
    sns.histplot(Rents_cleaned[col], kde=True, bins=30)
    plt.title(f"Distribution of log1p({col})")
    plt.xlabel(f"log1p({col})")
    plt.ylabel("Frequency")
    plt.show()


In [None]:

# Prepare the two columns this chart relies on
Rents['Area'] = Rents['Area'].fillna('Unknown')  # missing areas get an explicit 'Unknown' label
Rents['Annual Amount'] = pd.to_numeric(Rents['Annual Amount'], errors='coerce')

# Rows without an Annual Amount cannot contribute to the mean
Rents_cleaned = Rents.dropna(subset=['Annual Amount'])

# Mean Annual Amount per area, highest first
mean_annual_by_area = Rents_cleaned.groupby('Area')['Annual Amount'].mean()
area_annual_avg = mean_annual_by_area.sort_values(ascending=False)

# Only the ten highest averages, to keep the chart readable
top_area_annual_avg = area_annual_avg.head(10)

plt.figure(figsize=(12, 6))
ax = top_area_annual_avg.plot.bar(color='skyblue')
ax.set_title("Top 10 Average Annual Amount by Area")
ax.set_xlabel("Area")
ax.set_ylabel("Average Annual Amount")
plt.xticks(rotation=45, ha="right")
plt.show()


In [None]:
# Spatial analysis: mean Annual Amount for every area, highest first
mean_annual_by_area = Rents.groupby('Area')['Annual Amount'].mean()
area_annual_avg = mean_annual_by_area.sort_values(ascending=False)

ax = area_annual_avg.plot.bar(figsize=(12, 6))
ax.set_title("Average Annual Amount by Area")
ax.set_xlabel("Area")
ax.set_ylabel("Average Annual Amount")
plt.xticks(rotation=90)
plt.show()


In [None]:
# Print the summary (dtype, non-null count) of the 'Property ID' column
Rents["Property ID"].info()

In [None]:
# Count how many rows share each 'Property ID' value
Rents["Property ID"].value_counts()

In [None]:
# Ten areas with the most rent records
top_areas = Rents['Area'].value_counts().nlargest(10).index

# Rows belonging to those areas only
filtered_rents = Rents[Rents['Area'].isin(top_areas)]

# Mean Prev Month Avg Price for every (Area, Version) pair, one column per
# Version. Pairs with no rows stay NaN in this frame.
area_version_mean = (
    filtered_rents.groupby(['Area', 'Version'])['Prev Month Avg Price'].mean().unstack()
)
# Zero-filled copy, used for the grouped bar chart only
area_version_avg = area_version_mean.fillna(0)

# Grouped bars: average price per area, one bar per version
area_version_avg.plot(kind='bar', figsize=(12, 6), stacked=False, colormap='Set1')
plt.title("Average Prev Month Avg Price by Area and Version (Top 10 Areas)")
plt.ylabel("Average Prev Month Avg Price")
plt.xticks(rotation=45)
plt.xlabel("Area")
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Box plot of the underlying price distribution by area and version
plt.figure(figsize=(14, 7))
sns.boxplot(x='Area', y='Prev Month Avg Price', hue='Version', data=filtered_rents, palette='muted')
plt.title("Prev Month Avg Price Distribution by Area and Version (Top 10 Areas)")
plt.xticks(rotation=45)
plt.show()

# Percentage change from one version column to the next. This is computed on
# the unfilled means: a zero placeholder for a missing version would show up
# as a spurious -100% drop or an infinite rise. fill_method=None keeps missing
# pairs as NaN rather than carrying the previous version forward, and any
# remaining division by a genuine zero mean is turned into NaN as well.
area_version_change = (
    area_version_mean.pct_change(axis='columns', fill_method=None)
    .replace([np.inf, -np.inf], np.nan)
    * 100
)

# Grouped bars of the percentage change
area_version_change.plot(kind='bar', figsize=(12, 6), colormap='coolwarm')
plt.title("Percentage Change in Average Prev Month Avg Price by Area and Version (Top 10 Areas)")
plt.ylabel("Percentage Change (%)")
plt.xlabel("Area")
plt.xticks(rotation=45)
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Average Prev Week Avg Price

# Ten areas with the most rent records
top_areas = Rents['Area'].value_counts().nlargest(10).index

# Rows belonging to those areas only
filtered_rents = Rents[Rents['Area'].isin(top_areas)]

# Mean Prev Week Avg Price for every (Area, Version) pair, one column per
# Version. Pairs with no rows stay NaN in this frame.
area_version_mean = (
    filtered_rents.groupby(['Area', 'Version'])['Prev Week Avg Price'].mean().unstack()
)
# Zero-filled copy, used for the grouped bar chart only
area_version_avg = area_version_mean.fillna(0)

# Grouped bars: average price per area, one bar per version
area_version_avg.plot(kind='bar', figsize=(12, 6), stacked=False, colormap='Set1')
plt.title("Average Prev Week Avg Price by Area and Version (Top 10 Areas)")
plt.ylabel("Average Prev Week Avg Price")
plt.xticks(rotation=45)
plt.xlabel("Area")
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Box plot of the underlying price distribution by area and version
plt.figure(figsize=(14, 7))
sns.boxplot(x='Area', y='Prev Week Avg Price', hue='Version', data=filtered_rents, palette='muted')
plt.title("Prev Week Avg Price Distribution by Area and Version (Top 10 Areas)")
plt.xticks(rotation=45)
plt.show()

# Percentage change from one version column to the next. This is computed on
# the unfilled means: a zero placeholder for a missing version would show up
# as a spurious -100% drop or an infinite rise. fill_method=None keeps missing
# pairs as NaN rather than carrying the previous version forward, and any
# remaining division by a genuine zero mean is turned into NaN as well.
area_version_change = (
    area_version_mean.pct_change(axis='columns', fill_method=None)
    .replace([np.inf, -np.inf], np.nan)
    * 100
)

# Grouped bars of the percentage change
area_version_change.plot(kind='bar', figsize=(12, 6), colormap='coolwarm')
plt.title("Percentage Change in Average Prev Week Avg Price by Area and Version (Top 10 Areas)")
plt.ylabel("Percentage Change (%)")
plt.xlabel("Area")
plt.xticks(rotation=45)
plt.legend(title="Version", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Time series analysis for Rents.csv

# Parse 'Registration Date' as datetime; values that cannot be parsed become NaT
Rents['Registration Date'] = pd.to_datetime(Rents['Registration Date'], errors='coerce')


def _plot_monthly_mean(column):
    """Plot the monthly mean of ``Rents[column]`` and return it.

    Rows are bucketed by 'Registration Date' with the 'M' resampling alias,
    the mean of ``column`` is taken per bucket, and the resulting series is
    drawn as a line chart in its own figure.

    Parameters
    ----------
    column : str
        Name of the numeric Rents column to average.

    Returns
    -------
    pandas.Series
        Monthly means indexed by the resampled 'Registration Date'.
    """
    # NOTE(review): recent pandas releases prefer the 'ME' alias over 'M';
    # confirm against the pandas version this notebook targets.
    monthly_mean = Rents.resample('M', on='Registration Date')[column].mean()

    plt.figure(figsize=(12, 6))
    monthly_mean.plot()
    plt.title(f"Monthly Average {column}")
    plt.ylabel(f"Average {column}")
    plt.xlabel("Date")
    plt.xticks(rotation=45)
    plt.show()
    return monthly_mean


# One trend chart per price column; the series are kept under their own names
monthly_avg_annual_amount = _plot_monthly_mean('Annual Amount')
monthly_avg_contract_amount = _plot_monthly_mean('Contract Amount')
monthly_avg_prev_month_avg_price = _plot_monthly_mean('Prev Month Avg Price')
monthly_avg_prev_week_avg_price = _plot_monthly_mean('Prev Week Avg Price')


In [None]:
# Time series analysis for Rents.csv: the four monthly trends in one 2x2 grid

# Parse 'Registration Date' as datetime; values that cannot be parsed become NaT
Rents['Registration Date'] = pd.to_datetime(Rents['Registration Date'], errors='coerce')

# Remove, in place, every row missing the date or any of the four price columns
required_columns = ['Registration Date', 'Annual Amount', 'Contract Amount', 'Prev Month Avg Price', 'Prev Week Avg Price']
Rents.dropna(subset=required_columns, inplace=True)

# Monthly mean of each price column, bucketed by 'Registration Date'
monthly_avg_annual_amount = Rents.resample('M', on='Registration Date')['Annual Amount'].mean()
monthly_avg_contract_amount = Rents.resample('M', on='Registration Date')['Contract Amount'].mean()
monthly_avg_prev_month_avg_price = Rents.resample('M', on='Registration Date')['Prev Month Avg Price'].mean()
monthly_avg_prev_week_avg_price = Rents.resample('M', on='Registration Date')['Prev Week Avg Price'].mean()

# (series, label) pairs in panel order: left to right, top to bottom
trend_panels = [
    (monthly_avg_annual_amount, "Annual Amount"),
    (monthly_avg_contract_amount, "Contract Amount"),
    (monthly_avg_prev_month_avg_price, "Prev Month Avg Price"),
    (monthly_avg_prev_week_avg_price, "Prev Week Avg Price"),
]

plt.figure(figsize=(14, 10))
for panel_number, (monthly_series, label) in enumerate(trend_panels, start=1):
    plt.subplot(2, 2, panel_number)
    monthly_series.plot()
    plt.title(f"Monthly Average {label}")
    plt.ylabel(f"Average {label}")
    plt.xlabel("Date")
    plt.xticks(rotation=45)
    plt.grid(True)

plt.tight_layout()  # keep titles and tick labels from overlapping between panels
plt.show()


In [None]:
############################################################

In [None]:
# Preview the first rows of the Transactions frame
Transactions.head()

In [None]:
# Print the column summary (dtypes, non-null counts) of Transactions
Transactions.info()

In [None]:
# List the distinct values found in the 'Room(s)' column of Transactions
Transactions['Room(s)'].unique()

In [None]:
# List the distinct values found in the 'Parking' column of Transactions
Transactions['Parking'].unique()

In [None]:

# Parse 'Transaction Date' as datetime; values that cannot be parsed become NaT
Transactions['Transaction Date'] = pd.to_datetime(Transactions['Transaction Date'], errors='coerce')

# Coerce 'Property Size (sq.m)' to numeric; values that cannot be parsed become NaN.
# NOTE(review): this is the only numeric column converted here. Later cells also
# treat 'Amount' and 'Transaction Size (sq.m)' as numeric -- confirm their dtypes.
Transactions['Property Size (sq.m)'] = pd.to_numeric(Transactions['Property Size (sq.m)'], errors='coerce')


In [None]:

# Number of missing values in each Transactions column
missing_values = Transactions.isnull().sum()

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
Transactions.info()
print(missing_values)


In [None]:
# List the distinct 'Room(s)' values again (same check as in an earlier cell)
Transactions['Room(s)'].unique()

In [None]:
# List the distinct values found in the 'Transaction Type' column
Transactions['Transaction Type'].unique()

In [None]:
# Count plot: number of transactions per 'Transaction Type'
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=Transactions, x='Transaction Type', ax=ax)
ax.set_title("Transaction Type Distribution")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# List the distinct values found in the 'Transaction sub type' column
Transactions['Transaction sub type'].unique()

In [None]:
# Ten most frequent transaction sub types
sub_type_counts = Transactions['Transaction sub type'].value_counts()
top_transaction_sub_types = sub_type_counts.nlargest(10).index

# Transactions belonging to one of those sub types
in_top_sub_type = Transactions['Transaction sub type'].isin(top_transaction_sub_types)
filtered_transactions = Transactions[in_top_sub_type]

# Count plot of the retained sub types
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=filtered_transactions, x='Transaction sub type')
ax.set_title("Transaction Sub Type Distribution (Top 10)")
ax.set_xlabel("Transaction Sub Type")
ax.set_ylabel("Count")
plt.xticks(rotation=90)
plt.show()


In [None]:
# List the distinct values found in the 'Registration type' column
Transactions['Registration type'].unique()

In [None]:
# Count plot: number of transactions per 'Registration type'
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=Transactions, x='Registration type')
ax.set_title("Registration Type Distribution")
ax.set_xlabel("Registration Type")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# List the distinct values found in the 'Usage' column of Transactions
Transactions['Usage'].unique()

In [None]:
# Count plot: number of transactions per 'Usage' category
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=Transactions, x='Usage', ax=ax)
ax.set_title("Usage")
ax.set_xlabel("Usage")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# List the distinct values found in the 'Is Free Hold?' column of Transactions
Transactions['Is Free Hold?'].unique()

In [None]:
# Count plot: number of transactions per 'Is Free Hold?' value
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=Transactions, x='Is Free Hold?')
ax.set_title("Is Free Hold?")
ax.set_xlabel("Is Free Hold?")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# List the distinct values found in the 'Area' column of Transactions
Transactions['Area'].unique()

In [None]:
# Ten areas with the most transactions
area_counts = Transactions['Area'].value_counts()
top_area_types = area_counts.nlargest(10).index

# Transactions located in one of those areas
in_top_area = Transactions['Area'].isin(top_area_types)
filtered_areas = Transactions[in_top_area]

# Count plot of the retained areas
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=filtered_areas, x='Area')
ax.set_title("Area Type Distribution (Top 10)")
ax.set_xlabel("Area")
ax.set_ylabel("Count")
plt.xticks(rotation=90)
plt.show()

In [None]:
# List the distinct values found in the 'Property Type' column of Transactions
Transactions['Property Type'].unique()

In [None]:

# Count plot: number of transactions per 'Property Type'
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=Transactions, x='Property Type', ax=ax)
ax.set_title("Property Type Distribution")
ax.set_xlabel("Property Type")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()


In [None]:
# List the distinct values found in the 'Property Sub Type' column of Transactions
Transactions['Property Sub Type'].unique()

In [None]:
# Ten most frequent property sub types among the transactions
property_sub_type_counts = Transactions['Property Sub Type'].value_counts()
top_property_sub_types = property_sub_type_counts.nlargest(10).index

# Transactions belonging to one of those sub types
in_top_property_sub_type = Transactions['Property Sub Type'].isin(top_property_sub_types)
filtered_property_sub_types = Transactions[in_top_property_sub_type]

# Count plot of the retained sub types
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=filtered_property_sub_types, x='Property Sub Type')
ax.set_title("Property Sub Type Distribution (Top 10)")
ax.set_xlabel("Property Sub Type")
ax.set_ylabel("Count")
plt.xticks(rotation=90)
plt.show()


In [None]:
# Ten most frequent transaction sub types, and the rows that belong to them
sub_type_counts = Transactions['Transaction sub type'].value_counts()
top_transaction_subtypes = sub_type_counts.nlargest(10).index
filtered_transactions = Transactions[
    Transactions['Transaction sub type'].isin(top_transaction_subtypes)
]

# Total 'Amount' for each (Transaction Type, Transaction sub type) pair,
# flattened back into ordinary columns for plotting
amount_totals = filtered_transactions.groupby(['Transaction Type', 'Transaction sub type'])['Amount'].sum()
transaction_type_subtype_amount = amount_totals.reset_index()

# Grouped bar chart: one group per sub type, one bar per transaction type
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=transaction_type_subtype_amount,
    x='Transaction sub type',
    y='Amount',
    hue='Transaction Type',
    palette='muted',
    ax=ax,
)

plt.title("Total Transaction Amount by Type and Sub-Type (Top 10)", fontsize=16)
plt.xlabel("Transaction Sub Type", fontsize=12)
plt.ylabel("Total Amount", fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.legend(title="Transaction Type", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

# Horizontal dashed guides make bar heights easier to compare
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


In [None]:
# Mean 'Amount' per 'Transaction Type', as a two-column frame
mean_amount_by_type = Transactions.groupby('Transaction Type')['Amount'].mean()
transaction_type_avg_amount = mean_amount_by_type.reset_index()

# Bar chart of those means
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=transaction_type_avg_amount, x='Transaction Type', y='Amount', ax=ax)
ax.set_title("Average Transaction Amount by Transaction Type")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Average Amount")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Mean 'Amount' per 'Transaction Type', as a two-column frame
mean_amount_by_type = Transactions.groupby('Transaction Type')['Amount'].mean()
transaction_type_avg_amount = mean_amount_by_type.reset_index()

# Same bar chart as the plain version, with a palette, larger fonts and a grid
plt.figure(figsize=(10, 6))
sns.barplot(
    data=transaction_type_avg_amount,
    x='Transaction Type',
    y='Amount',
    palette='Blues_d',
)

plt.title("Average Transaction Amount by Transaction Type", fontsize=14, fontweight='bold')
plt.xlabel("Transaction Type", fontsize=12)
plt.ylabel("Average Amount", fontsize=12)

# Right-aligned, slanted tick labels stay attached to their bars
plt.xticks(rotation=45, ha='right', fontsize=10)

# Horizontal dashed guides only
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


In [None]:
# Ten most frequent transaction sub types, and the rows that belong to them
sub_type_counts = Transactions['Transaction sub type'].value_counts()
top_transaction_subtypes = sub_type_counts.nlargest(10).index
filtered_transactions = Transactions[
    Transactions['Transaction sub type'].isin(top_transaction_subtypes)
]

# Box plot of 'Amount' per transaction type, split by sub type
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(
    data=filtered_transactions,
    x='Transaction Type',
    y='Amount',
    hue='Transaction sub type',
    ax=ax,
)
ax.set_title("Amount Distribution by Transaction Type and Sub Type (Top 10)")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Amount")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Ten most frequent transaction sub types, and the rows that belong to them
sub_type_counts = Transactions['Transaction sub type'].value_counts()
top_transaction_subtypes = sub_type_counts.nlargest(10).index
filtered_transactions = Transactions[
    Transactions['Transaction sub type'].isin(top_transaction_subtypes)
]

# Box plot of 'Amount' per transaction type, split by sub type.
# Outlier markers are hidden and boxes widened so the boxes themselves are readable.
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(
    data=filtered_transactions,
    x='Transaction Type',
    y='Amount',
    hue='Transaction sub type',
    palette='Set2',
    showfliers=False,
    width=0.8,
    ax=ax,
)

plt.title("Amount Distribution by Transaction Type and Sub Type (Top 10)", fontsize=16, fontweight='bold', color='darkblue')
plt.xlabel("Transaction Type", fontsize=14)
plt.ylabel("Amount", fontsize=14)

# Right-aligned, slanted tick labels
plt.xticks(rotation=45, ha='right', fontsize=12)

# Horizontal dashed guides only
plt.grid(True, axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()


In [None]:
# Ten most frequent transaction sub types, and the rows that belong to them
sub_type_counts = Transactions['Transaction sub type'].value_counts()
top_transaction_subtypes = sub_type_counts.nlargest(10).index
is_top_subtype = Transactions['Transaction sub type'].isin(top_transaction_subtypes)
filtered_transactions = Transactions[is_top_subtype]

# Arguments shared by the box layer and the point layer
layer_kwargs = dict(
    x='Transaction Type',
    y='Amount',
    hue='Transaction sub type',
    data=filtered_transactions,
    palette="Set2",
)

plt.figure(figsize=(14, 7))

# Box layer, with outlier markers of size 5
sns.boxplot(fliersize=5, **layer_kwargs)

# Point layer: every filtered transaction drawn as a translucent dot coloured
# by sub type; its legend is suppressed so only the box legend is shown
sns.scatterplot(legend=False, marker='o', s=50, alpha=0.6, **layer_kwargs)

plt.title("Amount Distribution by Transaction Type and Sub Type (Top 10)")
plt.xlabel("Transaction Type")
plt.ylabel("Amount")
plt.xticks(rotation=45)
plt.show()


In [None]:
def _plot_top10_counts(column):
    """Draw a count plot of the ten most frequent values of ``Transactions[column]``.

    Returns a ``(top_values, subset)`` pair: the index of the ten most
    frequent values and the Transactions rows whose ``column`` is one of them.
    """
    top_values = Transactions[column].value_counts().nlargest(10).index
    subset = Transactions[Transactions[column].isin(top_values)]

    plt.figure(figsize=(8, 5))
    sns.countplot(x=column, data=subset)
    plt.title(f"{column} Distribution (Top 10)")
    plt.xlabel(column)
    plt.ylabel("Count")
    plt.xticks(rotation=90)
    plt.show()
    return top_values, subset


# One top-10 count plot per location/project column
top_nearest_metro, filtered_nearest_metro = _plot_top10_counts('Nearest Metro')
top_nearest_mall, filtered_nearest_mall = _plot_top10_counts('Nearest Mall')
top_nearest_landmark, filtered_nearest_landmark = _plot_top10_counts('Nearest Landmark')
top_master_project, filtered_master_project = _plot_top10_counts('Master Project')
top_project, filtered_project = _plot_top10_counts('Project')


In [None]:
# Pairwise correlations between the size, amount and party-count columns
# NOTE(review): only 'Property Size (sq.m)' is visibly coerced to numeric in this
# notebook section -- confirm the other columns are numeric before relying on this.
correlation_columns = ['Amount', 'Transaction Size (sq.m)', 'Property Size (sq.m)', 'No. of Buyer', 'No. of Seller']
numeric_subset = Transactions[correlation_columns]
correlation_matrix = numeric_subset.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix of Numeric Features")
plt.show()


In [None]:
# Display the correlation matrix as a table
correlation_matrix

In [None]:
# Distribution of Transaction Types and Amounts

# Count plot: number of transactions per 'Transaction Type'
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=Transactions, x='Transaction Type', ax=ax)
ax.set_title("Transaction Type Distribution")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

# Histogram (with KDE) of the transaction 'Amount'
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(Transactions['Amount'], kde=True, bins=30, ax=ax)
ax.set_title("Transaction Amount Distribution")
ax.set_xlabel("Transaction Amount")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Exploring Property Size and Room Analysis

# Histogram (with KDE) of 'Property Size (sq.m)'
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(Transactions['Property Size (sq.m)'], kde=True, bins=30, ax=ax)
ax.set_title("Property Size Distribution")
ax.set_xlabel("Property Size (sq.m)")
ax.set_ylabel("Frequency")
plt.show()

# Histogram (with KDE) of 'Room(s)', plotted as stored (missing values are not filled)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(Transactions['Room(s)'], kde=True, bins=30, ax=ax)
ax.set_title("Room(s) Distribution")
ax.set_xlabel("Number of Rooms")
ax.set_ylabel("Frequency")
plt.show()

# Scatter plot relating property size to room count
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=Transactions, x='Property Size (sq.m)', y='Room(s)', ax=ax)
ax.set_title("Property Size vs Number of Rooms")
ax.set_xlabel("Property Size (sq.m)")
ax.set_ylabel("Number of Rooms")
plt.show()


In [None]:
# Investigate Trends by Area and Property Type

# Ten areas with the highest mean transaction 'Amount'
mean_amount_by_area = Transactions.groupby('Area')['Amount'].mean()
top_areas = mean_amount_by_area.nlargest(10)

# Bar chart of those ten averages
plt.figure(figsize=(12, 6))
top_areas.plot.bar(color='royalblue')
plt.title("Top 10 Areas with Highest Average Transaction Amount", fontsize=16, fontweight='bold')
plt.xlabel("Area", fontsize=14)
plt.ylabel("Average Transaction Amount", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()


# Mean transaction 'Amount' per 'Property Type', highest first
mean_amount_by_property_type = Transactions.groupby('Property Type')['Amount'].mean()
property_type_transaction_avg = mean_amount_by_property_type.sort_values(ascending=False)

ax = property_type_transaction_avg.plot.bar(figsize=(12, 6))
ax.set_title("Average Transaction Amount by Property Type")
ax.set_xlabel("Property Type")
ax.set_ylabel("Average Transaction Amount")
plt.xticks(rotation=45)
plt.show()


In [None]:


# Work on a reproducible random sample so the plots render quickly.
# The sample size is capped at the number of available rows: asking
# DataFrame.sample for more rows than exist (without replacement) raises.
SAMPLE_SIZE = 10000
sampled_transactions = Transactions.sample(
    n=min(SAMPLE_SIZE, len(Transactions)), random_state=42)

# Coerce the columns used below; unparseable values become NaN / NaT
sampled_transactions['Transaction Size (sq.m)'] = pd.to_numeric(
    sampled_transactions['Transaction Size (sq.m)'], errors='coerce')
sampled_transactions['Amount'] = pd.to_numeric(
    sampled_transactions['Amount'], errors='coerce')
sampled_transactions['Transaction Date'] = pd.to_datetime(
    sampled_transactions['Transaction Date'], errors='coerce')

# Ten most frequent areas within the sample, and the sampled rows in them
top_areas_transactions = sampled_transactions['Area'].value_counts().nlargest(10).index
filtered_transactions = sampled_transactions[sampled_transactions['Area'].isin(top_areas_transactions)]

# Scatter plot: Amount against Transaction Size, coloured by Area
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='Transaction Size (sq.m)',
    y='Amount',
    data=filtered_transactions,
    hue='Area',
    alpha=0.6,
    palette='tab10'
)
plt.title("Amount vs Transaction Size (sq.m) by Area (Top 10 Areas)")
plt.legend(title="Area", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Box plot: Amount distribution per Area
plt.figure(figsize=(14, 7))
sns.boxplot(x='Area', y='Amount', data=filtered_transactions, palette='muted')
plt.title("Amount Distribution by Area (Top 10 Areas)")
plt.xticks(rotation=45)
plt.show()

# Bar plot: mean Amount per Area, highest first
avg_amount_by_area = filtered_transactions.groupby('Area')['Amount'].mean().sort_values(ascending=False)
avg_amount_by_area.plot(kind='bar', figsize=(12, 6), color='skyblue', edgecolor='black')
plt.title("Average Transaction Amount by Area (Top 10 Areas)")
plt.ylabel("Average Amount")
plt.xlabel("Area")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Top 10 Most Frequent Categories

# For each categorical column: the (up to) ten most frequent values ...
usage_counts = Transactions['Usage'].value_counts()
area_counts = Transactions['Area'].value_counts()
property_type_counts = Transactions['Property Type'].value_counts()
property_sub_type_counts = Transactions['Property Sub Type'].value_counts()

top_usage = usage_counts.nlargest(10).index
top_area = area_counts.nlargest(10).index
top_property_type = property_type_counts.nlargest(10).index
top_property_sub_type = property_sub_type_counts.nlargest(10).index

# ... and the transactions whose value is among them
filtered_transactions_usage = Transactions.loc[Transactions['Usage'].isin(top_usage)]
filtered_transactions_area = Transactions.loc[Transactions['Area'].isin(top_area)]
filtered_transactions_property_type = Transactions.loc[Transactions['Property Type'].isin(top_property_type)]
filtered_transactions_property_sub_type = Transactions.loc[Transactions['Property Sub Type'].isin(top_property_sub_type)]


In [None]:

# Histogram (with KDE) of each numeric column of interest
numeric_columns = ['Amount', 'Property Size (sq.m)']

for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(Transactions[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    plt.show()


In [None]:
# Box Plots (By Categorical Features)

# (category column, pre-filtered top-10 frame) pairs, in plotting order
boxplot_inputs = [
    ('Usage', filtered_transactions_usage),
    ('Area', filtered_transactions_area),
    ('Property Type', filtered_transactions_property_type),
    ('Property Sub Type', filtered_transactions_property_sub_type),
]

# One box plot of 'Amount' per categorical feature
for category, category_frame in boxplot_inputs:
    plt.figure(figsize=(14, 7))
    sns.boxplot(x=category, y='Amount', data=category_frame, palette='coolwarm')
    plt.title(f"Amount Distribution by {category} (Top 10 Categories)")
    plt.xticks(rotation=45)
    plt.show()


In [None]:

# Switch to a built-in Matplotlib style. This sets the global style, so it also
# applies to every figure drawn after this cell runs.
plt.style.use('ggplot')  # Other built-ins include 'bmh' and 'fivethirtyeight'; plt.style.available lists the installed names.

# Function to create fancy box plots
def fancy_boxplot(data, x_col, y_col, title, figsize=(14, 7)):
    """Draw a styled seaborn box plot of ``y_col`` grouped by ``x_col`` and show it.

    Args:
        data: Table handed to seaborn as ``data``; must contain ``x_col`` and ``y_col``.
        x_col: Column used for the x-axis groups; also used as the x-axis label.
        y_col: Column whose distribution is drawn; also used as the y-axis label.
        title: Figure title.
        figsize: Figure size passed to matplotlib, ``(width, height)``.

    Returns:
        None. The figure is rendered through ``plt.show()``.

    NOTE(review): seaborn >= 0.13 emits a FutureWarning when ``palette`` is passed
    without ``hue`` — confirm the installed seaborn version before changing the call.
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Visual options: vibrant palette, thicker box lines, outlier markers hidden.
    box_options = dict(palette='plasma', linewidth=1.5, showfliers=False)
    sns.boxplot(data=data, x=x_col, y=y_col, ax=ax, **box_options)

    ax.set_title(title, fontsize=16, fontweight='bold', pad=15)
    ax.set_xlabel(x_col, fontsize=14)
    ax.set_ylabel(y_col, fontsize=14)
    # Right-aligned, tilted tick labels keep long category names readable.
    plt.xticks(rotation=45, ha='right', fontsize=12)
    # Horizontal guide lines only.
    ax.grid(axis='y', linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.show()

# Generate fancy box plots: one per (pre-filtered frame, category column, title).
fancy_boxplot_specs = [
    (filtered_transactions_usage, 'Usage',
     "Amount Distribution by Usage (Top 10 Categories)"),
    (filtered_transactions_area, 'Area',
     "Amount Distribution by Area (Top 10 Categories)"),
    (filtered_transactions_property_type, 'Property Type',
     "Amount Distribution by Property Type (Top 10 Categories)"),
    (filtered_transactions_property_sub_type, 'Property Sub Type',
     "Amount Distribution by Property Sub Type (Top 10 Categories)"),
]
for subset, category_col, plot_title in fancy_boxplot_specs:
    fancy_boxplot(subset, category_col, 'Amount', plot_title)


In [None]:
# Scatter Plots (By Categorical Features)

# Amount vs Property Size (sq.m) on the full Transactions frame, coloured by one
# categorical column at a time. Each entry: (hue column, palette, figure title).
# The hue column name doubles as the legend title.
scatter_specs = [
    ('Usage', 'viridis', "Amount vs Property Size (sq.m) by Usage"),
    ('Property Type', 'Set1', "Amount vs Property Size (sq.m) by Property Type"),
    ('Is Free Hold?', 'Paired', "Amount vs Property Size (sq.m) by Free Hold Status"),
]

for hue_col, palette_name, plot_title in scatter_specs:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        data=Transactions,
        x='Property Size (sq.m)',
        y='Amount',
        hue=hue_col,
        alpha=0.6,
        palette=palette_name,
    )
    plt.title(plot_title)
    # Anchor the legend just outside the right edge of the axes.
    plt.legend(title=hue_col, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()


In [None]:

# Activate matplotlib's 'ggplot' style sheet for the figures created below.
# The same style is already selected in an earlier cell; repeating the call lets
# this cell be run on its own.
plt.style.use('ggplot')

# Function to create fancy scatter plots
def fancy_scatterplot(data, x_col, y_col, hue_col, title, palette, figsize=(12, 7)):
    """Draw a styled seaborn scatter plot of ``y_col`` against ``x_col`` and show it.

    Args:
        data: Table handed to seaborn as ``data``; must contain ``x_col``, ``y_col``
            and ``hue_col``.
        x_col: Column plotted on the x-axis; also used as the x-axis label.
        y_col: Column plotted on the y-axis; also used as the y-axis label.
        hue_col: Column that determines point colour; also used as the legend title.
        title: Figure title.
        palette: Palette forwarded to ``sns.scatterplot``.
        figsize: Figure size passed to matplotlib, ``(width, height)``.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Semi-transparent markers with a thin black outline stay readable where they overlap.
    marker_options = dict(alpha=0.7, edgecolor='black', linewidth=0.6)
    sns.scatterplot(
        data=data, x=x_col, y=y_col, hue=hue_col, palette=palette, ax=ax, **marker_options
    )

    ax.set_title(title, fontsize=16, fontweight='bold', pad=15)
    ax.set_xlabel(x_col, fontsize=14)
    ax.set_ylabel(y_col, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    # Legend is anchored outside the axes, to the right of the plot area.
    ax.legend(title=hue_col, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.show()

# Generate fancy scatter plots of Amount vs Property Size (sq.m) on the full
# Transactions frame. Each entry: (hue column, figure title, palette).
fancy_scatter_specs = [
    ('Usage', "Amount vs Property Size (sq.m) by Usage", 'viridis'),
    ('Property Type', "Amount vs Property Size (sq.m) by Property Type", 'Set1'),
    ('Is Free Hold?', "Amount vs Property Size (sq.m) by Free Hold Status", 'Paired'),
]

for hue_col, plot_title, palette_name in fancy_scatter_specs:
    fancy_scatterplot(
        Transactions,
        'Property Size (sq.m)',
        'Amount',
        hue_col,
        plot_title,
        palette=palette_name,
    )


In [None]:
# NOTE: this cell rebinds filtered_transactions_area and
# filtered_transactions_property_sub_type, which the earlier box-plot cells also use.
# After it runs, the sub-type frame holds the top 20 sub types.

# --- Area: ten most frequent values ---
top_areas = Transactions['Area'].value_counts().nlargest(10).index
in_top_areas = Transactions['Area'].isin(top_areas)
filtered_transactions_area = Transactions[in_top_areas]

# Scatter plot: Amount vs Property Size (sq.m), coloured by Area
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=filtered_transactions_area,
    x='Property Size (sq.m)',
    y='Amount',
    hue='Area',
    alpha=0.6,
    palette='tab10',
    ax=ax,
)
ax.set_title("Amount vs Property Size (sq.m) by Area (Top 10 Areas)")
ax.legend(title="Area", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# --- Property Sub Type: twenty most frequent values ---
top_property_sub_types = Transactions['Property Sub Type'].value_counts().nlargest(20).index
in_top_sub_types = Transactions['Property Sub Type'].isin(top_property_sub_types)
filtered_transactions_property_sub_type = Transactions[in_top_sub_types]

# Scatter plot: Amount vs Property Size (sq.m), coloured by Property Sub Type
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=filtered_transactions_property_sub_type,
    x='Property Size (sq.m)',
    y='Amount',
    hue='Property Sub Type',
    alpha=0.6,
    palette='cool',
    ax=ax,
)
ax.set_title("Amount vs Property Size (sq.m) by Property Sub Type (Top 20 Property Sub Types)")
ax.legend(title="Property Sub Type", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:

# Activate matplotlib's 'ggplot' style sheet for the figures created below.
# The same style is already selected in earlier cells; repeating the call lets
# this cell be run on its own.
plt.style.use('ggplot')

# Function to create fancy scatter plots with log scale
def fancy_scatterplot_log(data, x_col, y_col, hue_col, title, palette, figsize=(12, 7)):
    """Draw a styled scatter plot of ``y_col`` against ``x_col`` on log-log axes.

    Args:
        data: Table handed to seaborn as ``data``; must contain ``x_col``, ``y_col``
            and ``hue_col``.
        x_col: Column plotted on the (logarithmic) x-axis.
        y_col: Column plotted on the (logarithmic) y-axis.
        hue_col: Column that determines point colour; also used as the legend title.
        title: Figure title.
        palette: Palette forwarded to ``sns.scatterplot``.
        figsize: Figure size passed to matplotlib, ``(width, height)``.

    Returns:
        None. The figure is rendered through ``plt.show()``.

    NOTE(review): a log axis cannot place zero or negative values, so such rows will
    presumably not appear in the plot — confirm whether the inputs can contain them.
    """
    plt.figure(figsize=figsize)
    ax = sns.scatterplot(
        x=x_col,
        y=y_col,
        data=data,
        hue=hue_col,
        alpha=0.7,  # Slight transparency to reduce overlap
        palette=palette,
        edgecolor='black',
        linewidth=0.6  # Slight edge to points for clarity
    )

    # Both axes are switched to a logarithmic scale (not only the y-axis).
    ax.set_xscale('log')
    ax.set_yscale('log')

    plt.title(title, fontsize=16, fontweight='bold', pad=15)
    # Label both axes as log-scaled so the x-axis is not mistaken for a linear one.
    plt.xlabel(f'Log-Scaled {x_col}', fontsize=14)
    plt.ylabel(f'Log-Scaled {y_col}', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    # Legend is anchored outside the axes, to the right of the plot area.
    plt.legend(title=hue_col, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.6)  # Subtle grid lines
    plt.tight_layout()
    plt.show()

# --- Area: ten most frequent values, plotted on log-log axes ---
top_areas = Transactions['Area'].value_counts().nlargest(10).index
in_top_areas = Transactions['Area'].isin(top_areas)
filtered_transactions_area = Transactions[in_top_areas]

fancy_scatterplot_log(
    data=filtered_transactions_area,
    x_col='Property Size (sq.m)',
    y_col='Amount',
    hue_col='Area',
    title="Amount vs Property Size (sq.m) by Area (Top 10 Areas)",
    palette='tab10',
)

# --- Property Sub Type: twenty most frequent values, plotted on log-log axes ---
top_property_sub_types = Transactions['Property Sub Type'].value_counts().nlargest(20).index
in_top_sub_types = Transactions['Property Sub Type'].isin(top_property_sub_types)
filtered_transactions_property_sub_type = Transactions[in_top_sub_types]

fancy_scatterplot_log(
    data=filtered_transactions_property_sub_type,
    x_col='Property Size (sq.m)',
    y_col='Amount',
    hue_col='Property Sub Type',
    title="Amount vs Property Size (sq.m) by Property Sub Type (Top 20)",
    palette='cool',
)


In [None]:

# Activate matplotlib's 'ggplot' style sheet for the figures created below.
# The same style is already selected in earlier cells; repeating the call lets
# this cell be run on its own.
plt.style.use('ggplot')

# Function to create fancy scatter plots with log scale
def fancy_scatterplot_log(data, x_col, y_col, hue_col, title, palette, figsize=(12, 7)):
    """Draw a styled scatter plot of ``y_col`` against ``x_col`` on log-log axes.

    This cell re-declares a function of the same name defined in the previous cell;
    the definition here is the one in effect for the calls that follow it.

    Args:
        data: Table handed to seaborn as ``data``; must contain ``x_col``, ``y_col``
            and ``hue_col``.
        x_col: Column plotted on the (logarithmic) x-axis.
        y_col: Column plotted on the (logarithmic) y-axis.
        hue_col: Column that determines point colour; also used as the legend title.
        title: Figure title.
        palette: Palette forwarded to ``sns.scatterplot``.
        figsize: Figure size passed to matplotlib, ``(width, height)``.

    Returns:
        None. The figure is rendered through ``plt.show()``.

    NOTE(review): a log axis cannot place zero or negative values, so such rows will
    presumably not appear in the plot — confirm whether the inputs can contain them.
    """
    plt.figure(figsize=figsize)
    ax = sns.scatterplot(
        x=x_col,
        y=y_col,
        data=data,
        hue=hue_col,
        alpha=0.7,  # Slight transparency to reduce overlap
        palette=palette,
        edgecolor='black',
        linewidth=0.6  # Slight edge to points for clarity
    )

    # Both axes are switched to a logarithmic scale (not only the y-axis).
    ax.set_xscale('log')
    ax.set_yscale('log')

    plt.title(title, fontsize=16, fontweight='bold', pad=15)
    # Label both axes as log-scaled so the x-axis is not mistaken for a linear one.
    plt.xlabel(f'Log-Scaled {x_col}', fontsize=14)
    plt.ylabel(f'Log-Scaled {y_col}', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    # Legend is anchored outside the axes, to the right of the plot area.
    plt.legend(title=hue_col, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.6)  # Subtle grid lines
    plt.tight_layout()
    plt.show()

# Categorical columns to colour the log-log scatter plots by
categorical_columns = [
    'Area', 'Property Sub Type', 'Registration type', 'Is Free Hold?', 'Usage',
    'Property Type', 'Room(s)', 'Parking', 'Nearest Metro', 'Nearest Mall', 'Nearest Landmark'
]

# How many of the most frequent categories to keep per column:
# per-column overrides first, then the fallback used for every other column.
top_n = {'Area': 10, 'Property Sub Type': 20}
default_top_n = 10

# One figure per categorical column, restricted to that column's most frequent values
for col in categorical_columns:
    n_keep = top_n.get(col, default_top_n)
    top_categories = Transactions[col].value_counts().nlargest(n_keep).index
    filtered_data = Transactions[Transactions[col].isin(top_categories)]

    # 'tab10' for Area and Property Type, 'coolwarm' for everything else
    if col in ('Area', 'Property Type'):
        palette_name = 'tab10'
    else:
        palette_name = 'coolwarm'

    fancy_scatterplot_log(
        filtered_data,
        'Property Size (sq.m)',
        'Amount',
        col,
        f"Amount vs Property Size (sq.m) by {col} (Top {n_keep})",
        palette=palette_name,
    )


In [None]:
# Columns used, one at a time, to colour the Amount vs Property Size scatter plot
columns_to_plot_by = [
    'Area', 'Usage', 'Nearest Metro', 'Nearest Mall', 'Nearest Landmark',
    'Transaction Type', 'Transaction sub type', 'Registration type',
    'Master Project', 'Project'
]

for column in columns_to_plot_by:
    # Restrict the data to the ten most frequent values of this column
    top_values = Transactions[column].value_counts().nlargest(10).index
    filtered_data = Transactions[Transactions[column].isin(top_values)]

    # Linear-axis scatter of Amount against Property Size, coloured by the column
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(
        data=filtered_data,
        x='Property Size (sq.m)',
        y='Amount',
        hue=column,
        alpha=0.6,
        palette='tab10',
        ax=ax,
    )
    ax.set_title(f"Amount vs Property Size (sq.m) by {column}")
    ax.set_xlabel("Property Size (sq.m)")
    ax.set_ylabel("Amount")
    # Legend is anchored outside the axes, to the right of the plot area.
    ax.legend(title=column, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()


In [None]:
# Time series analysis

# Parse 'Transaction Date' into datetimes; values that cannot be parsed become NaT
# because of errors='coerce'.
Transactions['Transaction Date'] = pd.to_datetime(Transactions['Transaction Date'], errors='coerce')

# Mean 'Amount' per monthly bin of 'Transaction Date'.
# NOTE(review): pandas >= 2.2 deprecates the 'M' offset alias in favour of 'ME' —
# confirm the installed pandas version before changing the alias.
amount_by_month = Transactions.resample('M', on='Transaction Date')['Amount']
monthly_avg_price = amount_by_month.mean()

# Line chart of the monthly averages
fig, ax = plt.subplots(figsize=(12, 6))
monthly_avg_price.plot(ax=ax)
ax.set_title("Monthly Average Transaction Amount")
ax.set_ylabel("Average Amount")
ax.set_xlabel("Date")
plt.xticks(rotation=45)
plt.show()


In [None]:


# Parse 'Transaction Date' (unparseable values become NaT) and discard the rows
# that have no usable date. Note that this rebinds Transactions to the filtered frame.
Transactions['Transaction Date'] = pd.to_datetime(Transactions['Transaction Date'], errors='coerce')
Transactions = Transactions.dropna(subset=['Transaction Date'])

# Mean 'Amount' per calendar month: group on the monthly period of each date,
# then turn the result back into a two-column frame.
month_of_transaction = Transactions['Transaction Date'].dt.to_period('M')
monthly_avg_price = Transactions.groupby(month_of_transaction)['Amount'].mean().reset_index()

# Periods are converted to timestamps so matplotlib can place them on a date axis
monthly_avg_price['Transaction Date'] = monthly_avg_price['Transaction Date'].dt.to_timestamp()

# Line chart with a marker per month
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    monthly_avg_price['Transaction Date'],
    monthly_avg_price['Amount'],
    marker='o',
    linestyle='-',
)

# Formatting
ax.set_title("Monthly Average Transaction Amount", fontsize=14)
ax.set_ylabel("Average Amount", fontsize=12)
ax.set_xlabel("Date", fontsize=12)
plt.xticks(rotation=45)
ax.grid(True)
plt.show()


In [None]:
# Print the column dtypes and non-null counts of Transactions as it stands after
# the date parsing and NaT-row removal in the previous cell.
Transactions.info()