# Tunisian Real Estate - Data Visualization

End-to-end EDA notebook: load processed data, explore distributions, compare sale vs rent, and build static/interactive plots.

In [None]:
# Imports and plotting setup
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
# Global seaborn theme for every static plot below. `set_theme` is the preferred
# entry point; `sns.set` is only an alias for it.
sns.set_theme(style='whitegrid', context='notebook')

## Load data
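Use the processed dataset stored at `ML/data/processed/final_real_estate_dataset.csv`. The loading cell reads it through the relative path `data/processed/final_real_estate_dataset.csv`, which points at that file only when the notebook's working directory is `ML/`.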

In [None]:
# Load the processed listings table and show a quick overview of its contents.
# The path is relative, so it is resolved against the kernel's working directory.
data_path = Path('data/processed/final_real_estate_dataset.csv')
df = pd.read_csv(data_path)
print(f"Rows: {len(df):,}, Columns: {df.shape[1]}")
display(df.head())

# Per-column summary. `describe(include='all')` only emits the statistics that
# apply to the dtypes present (for instance there is no 'unique' entry when every
# column is numeric), so reindex rather than label-select: a statistic that is
# absent becomes a blank cell instead of raising KeyError.
summary_stats = ['count', 'unique', 'mean', 'std', 'min', 'max']
display(df.describe(include='all').T.reindex(columns=summary_stats).fillna(''))

# Fraction of missing values per column, restricted to columns that have any.
df.isna().mean().pipe(lambda s: s[s>0]).to_frame('missing_ratio')

## Basic plots
Price distribution and simple relationships.

In [None]:
# Price distribution: raw values on the left, log1p-transformed values on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(df['price'], bins=50, ax=axes[0], color='steelblue')
axes[0].set_title('Price distribution')
sns.histplot(np.log1p(df['price']), bins=50, ax=axes[1], color='darkorange')
axes[1].set_title('Log(price+1) distribution')
# The transformed Series is still named 'price', so the automatic x-label would
# read 'price' although the axis shows log1p values. Label it explicitly, the
# same way the sale/rent histograms later in this notebook do.
axes[1].set_xlabel('log(price+1)')
plt.tight_layout()
plt.show()

# Scatter of price against surface on log-log axes, one colour per transaction type.
fig_scatter, ax_scatter = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=df,
    x='surface',
    y='price',
    hue='transaction',
    alpha=0.4,
    ax=ax_scatter,
)
ax_scatter.set_title('Price vs Surface')
ax_scatter.set_xscale('log')
ax_scatter.set_yscale('log')
fig_scatter.tight_layout()
plt.show()

# Horizontal bar chart of listing counts for the ten most frequent cities.
city_counts = df['city'].value_counts()
top_cities = city_counts.nlargest(10).index
top_city_rows = df[df['city'].isin(top_cities)]

fig_cities, ax_cities = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=top_city_rows,
    y='city',
    order=top_cities,  # bars follow the frequency ranking, most listings first
    palette='viridis',
    ax=ax_cities,
)
ax_cities.set_title('Top 10 cities by listings')
fig_cities.tight_layout()
plt.show()

## Customized plots
Compare prices across transaction types (sale vs rent) and across property types, with explicit axis labels and log-scaled price axes.

In [None]:
# Box plot of price for each transaction type, drawn on a log-scaled y-axis.
fig_box, ax_box = plt.subplots(figsize=(6, 5))
sns.boxplot(data=df, x='transaction', y='price', palette='Set2', ax=ax_box)
ax_box.set_yscale('log')
ax_box.set_title('Price by transaction (log-scale)')
ax_box.set_xlabel('Transaction type')
ax_box.set_ylabel('Price (log scale)')
fig_box.tight_layout()
plt.show()

# Violin plot of price for each property type, drawn on a log-scaled y-axis.
fig_violin, ax_violin = plt.subplots(figsize=(7, 5))
sns.violinplot(
    data=df,
    x='property_type',
    y='price',
    palette='Set3',
    cut=0,  # do not extend the density past the observed data range
    ax=ax_violin,
)
ax_violin.set_yscale('log')
ax_violin.set_title('Price by property type (log-scale)')
ax_violin.set_xlabel('Property type')
ax_violin.set_ylabel('Price (log scale)')
fig_violin.tight_layout()
plt.show()

## Subplots
Side-by-side histograms of log(price+1) for sale and rent listings, drawn on a shared y-axis.

In [None]:
# One log(price+1) histogram per transaction type, side by side with a shared y-axis.
transaction_types = ['sale', 'rent']
fig, axes = plt.subplots(nrows=1, ncols=len(transaction_types), figsize=(12, 4), sharey=True)
for ax, trx in zip(axes, transaction_types):
    # Rows whose 'transaction' value equals the current label.
    subset = df.loc[df['transaction'].eq(trx)]
    log_price = np.log1p(subset['price'])
    sns.histplot(log_price, bins=40, ax=ax, color='steelblue')
    ax.set_title(f'Log(price+1) - {trx}')
    ax.set_xlabel('log(price+1)')
axes[0].set_ylabel('count')
fig.tight_layout()
plt.show()

## Statistical visualizations
Pairwise correlations between the numeric features and two derived ratios (`surface_per_room`, `rooms_per_bathroom`), shown as a heatmap.

In [None]:
# Correlation heatmap of the numeric columns plus two derived ratios.
numeric_cols = ['price', 'surface', 'bathrooms', 'rooms']
# The +1 in each denominator avoids dividing by zero when the count is 0.
df_corr = df[numeric_cols].assign(
    surface_per_room=lambda d: d['surface'] / (d['rooms'] + 1),
    rooms_per_bathroom=lambda d: d['rooms'] / (d['bathrooms'] + 1),
)
corr = df_corr.corr()

fig_corr, ax_corr = plt.subplots(figsize=(7, 5))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', square=True, ax=ax_corr)
ax_corr.set_title('Correlation heatmap (numeric features)')
fig_corr.tight_layout()
plt.show()

## Interactive visualizations (Plotly)
Zoom, hover, and filter easily.

In [None]:
# Select the 'notebook' renderer for the Plotly figures shown from here on.
import plotly.io as pio
pio.renderers.default = 'notebook'

# Interactive log-log scatter of price against surface, coloured by transaction type.
hover_columns = ['city', 'region', 'property_type']
fig = px.scatter(
    df,
    x='surface',
    y='price',
    color='transaction',
    hover_data=hover_columns,
    title='Price vs Surface (interactive)',
    log_x=True,
    log_y=True,
    opacity=0.5,
)
fig.show()

# Interactive box plot of price per transaction type; only outlier points are drawn.
box_options = dict(
    x='transaction',
    y='price',
    color='transaction',
    points='outliers',
    log_y=True,
    title='Price by transaction (interactive, log scale)',
)
fig = px.box(df, **box_options)
fig.show()

# Top regions by median price.
# Compute the per-region medians once; both the top-15 selection and the plotted
# values come from this single Series, so the frame is not filtered and grouped
# a second time.
median_by_region = df.groupby('region')['price'].median()
top_regions = median_by_region.nlargest(15).index
# Keep the grouped order when selecting the top regions, then sort ascending for the bars.
top_region_medians = median_by_region[median_by_region.index.isin(top_regions)].sort_values()
# NOTE(review): these medians pool every transaction type. If sale and rent prices
# sit on different scales, the ranking may mostly reflect each region's mix of
# listing types -- consider computing it per `transaction`. Confirm intent.
fig = px.bar(top_region_medians, title='Top 15 regions by median price')
fig.update_layout(xaxis_title='Region', yaxis_title='Median price')
fig.show()