# Fix Region Spellings
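This notebook loads `cleaned_real_estate.csv`, manually fixes region spellings, splits ambiguous regions across their sub-areas, drops unwanted or sparsely populated regions, prints the resulting region counts for validation, and saves the revised file as `cleaned_real_estate_fixed.csv` in the processed folder.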

## 1. Import Libraries and Locate Dataset

In [13]:
import pandas as pd
from pathlib import Path



# Absolute location of the cleaned dataset on the author's machine. The raw
# string is split into adjacent literals only for readability; the
# concatenated value is exactly the original path (directory spellings such
# as "Projecy" are part of the real path and must stay as they are).
DATA_PATH = (
    r"C:\Users\user\OneDrive\Bureau\Data Mining Projecy"
    r"\Tunisan-Real-Estate-Price-Prediction-Platform"
    r"\ML\data\processed\cleaned_real_estate.csv"
)

## 2. Load cleaned_real_estate.csv

In [14]:
# Read the cleaned dataset into memory.
df = pd.read_csv(DATA_PATH)

# Report the row count and the column names as a quick sanity check.
print(
    f"Rows: {len(df):,}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

Rows: 23,698
Columns: ['price', 'transaction', 'region', 'surface', 'bathrooms', 'rooms', 'property_type', 'has_piscine', 'has_garage', 'has_jardin', 'has_terrasse', 'has_ascenseur', 'is_meuble', 'has_chauffage', 'has_climatisation']


## 3. Apply Manual Region Spelling Fixes

In [15]:
# Discard listings filed under "Autres Villes" (French for "other cities").
# .copy() gives the filtered frame its own data so the column assignments in
# later cells operate on an independent object.
df = df.loc[~df["region"].eq("Autres Villes")].copy()
print(f"Rows after drop: {len(df):,}")

Rows after drop: 23,611


In [16]:
import re
import numpy as np

# Regex clean-ups for the region column, applied in the order listed. Every
# pattern is matched case-insensitively and anchored on word boundaries (\b).
region_regex_fixes = [
    (r'\bCit\b', 'Cité'),          # "Cit"  -> "Cité"
    (r'\bCite\b', 'Cité'),         # "Cite" -> "Cité"
    (r'\bGabs\b', 'Gabes'),        # "Gabs" -> "Gabes"
    (r'\bBja\b', 'Beja'),          # "Bja"  -> "Beja"
    (r'\bM\s+Saken\b', 'Msaken'),  # "M Saken" (any run of whitespace) -> "Msaken"
]

for pattern, replacement in region_regex_fixes:
    df['region'] = df['region'].str.replace(pattern, replacement, case=False, regex=True)

# Exact-match relabelling of region names. Keys are the labels present in the
# data after the regex clean-up above; values are the labels that replace
# them. Besides spelling fixes, several entries fold a bare city name into one
# specific sub-area (e.g. "Sfax" -> "Sfax Ville") or merge numbered variants
# (e.g. "Charguia 1" / "Charguia 2" -> "Charguia").
# NOTE(review): target spellings such as "Monastire Ville", "Zaghouane Ville",
# "Tozeur ville" and "Kairouan ville" look inconsistent with the other
# entries — presumably they match labels already used in the dataset; confirm
# before changing them.
# NOTE(review): the region listing printed later in this notebook still shows
# both "Beni Khalled" and "Bni Khalled"; consider adding a mapping once the
# canonical spelling is confirmed.
region_fix = {
    "Ennasr": "Cité Ennasr 1",
    "Dar Chaabane El Fehri": "Dar Chabane El Fehri",
    "El Omrane Suprieur": "El Omrane",
    "El Omrane Superieur": "El Omrane",
    "Hammam Ghezze": "Hammam Ghezeze",
    "Kala Kebira": "Kalaa Kebira",
    "Kala Sghira": "Kalaa Sghira",
    "Ksibet El Mdiouni": "Ksibet El Mediouni",
    "Mahrs": "Mahres",
    "Menzah": "El Menzah 4",
    "Bizerte": "Bizerte Sud",
    "Route Menzel Chaker": "Route Manzel Chaker",
    "Route Soukra": "Route Sokra",
    "Sakiet Eddaer": "Sakiet Eddaier",
    "Sfax": "Sfax Ville",
    "Sfax Mdina": "Sfax Ville",
    "Sidi El Bchir": "Sidi El Bechir",
    "Sousse Mdina": "Sousse Medina",
    "Tozeur": "Tozeur ville",
    "Centre Ville Lafayette": "Lafayette",
    "Tunis": "Centre Ville",
    "Tunis Belvedere": "Belvedere",
    "La Manouba": "Manouba Ville",
    "Beja": "Beja Sud",
    "Beni Khiar": "Bni Khiar",
    "Charguia 1": "Charguia",
    "Charguia 2": "Charguia",
    "Gabes": "Gabes Nord",
    "Kairouan": "Kairouan ville",
    "Monastir": "Monastire Ville",
    "Nabeul": "Nabeul Ville",
    "Route De Laroport": "Route De L'Aeroport",
    "Yasmine Hammamet": "Hammamet",
    "Zaghouan": "Zaghouane Ville",
}

# Keep the pre-mapping label in a working column for inspection (this is the
# value after the regex clean-up, not the raw CSV value), then replace every
# region that exactly equals a key of region_fix with its mapped label.
df['region_original'] = df['region']
df['region'] = df['region'].replace(region_fix)

# Seeded generator used by default for every split, so the random assignment
# of rows to sub-regions is reproducible from one run to the next.
rng = np.random.default_rng(42)

def split_equally(mask, targets, frame=None, generator=None, column='region'):
    """Spread the rows selected by ``mask`` evenly across ``targets``.

    The selected rows are shuffled and then labelled round-robin, so the
    number of rows given to any two targets differs by at most one.

    Parameters
    ----------
    mask : boolean Series or array
        Selects the rows of ``frame`` to relabel; it must have one entry per
        row of ``frame``.
    targets : sequence of str
        Labels to distribute over the selected rows.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.
    generator : numpy.random.Generator, optional
        Source of randomness for the shuffle. Defaults to the notebook-level
        ``rng``.
    column : str, optional
        Column that receives the new labels. Defaults to ``'region'``.

    Returns
    -------
    None
        ``frame`` is modified in place. Nothing happens (and no random
        numbers are drawn) when no row is selected or ``targets`` is empty.

    Notes
    -----
    Index labels of ``frame`` are assumed to be unique, as they are for a
    frame read with ``pd.read_csv`` and then filtered.
    """
    # Fall back to the notebook globals so existing two-argument calls behave
    # exactly as before.
    frame = df if frame is None else frame
    generator = rng if generator is None else generator

    idx = frame.index[mask]
    if len(idx) == 0 or len(targets) == 0:
        return

    shuffled = generator.permutation(idx)
    # Round-robin labels in shuffled order: the i-th shuffled row receives
    # targets[i % len(targets)].
    labels = [targets[i % len(targets)] for i in range(len(shuffled))]
    # One label-based assignment instead of a Python-level loop over df.at.
    frame.loc[shuffled, column] = labels

# Bare area names that are spread evenly over their sub-areas. The calls are
# made in this fixed order because each one advances the shared seeded
# generator used by split_equally.
for source, targets in [
    ('Jardins El Menzah', ['Les Jardins El Menzah 1', 'Les Jardins El Menzah 2']),
    ('Manar', ['El Manar 1', 'El Manar 2']),
    ('Ariana', ['Ariana Essoughra', 'Ariana Ville', 'Nouvelle Ariana']),
]:
    split_equally(df['region'] == source, targets)

# Rows labelled just "Sousse" (any letter case) are spread over every other
# region currently in the data whose name contains "sousse", in sorted order.
sousse_targets = sorted(
    r
    for r in df['region'].unique()
    if isinstance(r, str) and 'sousse' in r.lower() and r.lower() != 'sousse'
)
split_equally(df['region'].str.lower() == 'sousse', sousse_targets)

# Remaining splits; "Ain Zaghouen" rows take the "Ain Zaghouan ..." spelling,
# and El Mourouj has no "El Mourouj 2" target.
for source, targets in [
    ('Ain Zaghouen', ['Ain Zaghouan Nord', 'Ain Zaghouan Sud']),
    ('Chotrana', ['Chotrana 1', 'Chotrana 2', 'Chotrana 3']),
    ('El Mourouj', ['El Mourouj 1', 'El Mourouj 3', 'El Mourouj 4', 'El Mourouj 5', 'El Mourouj 6']),
]:
    split_equally(df['region'] == source, targets)

# Regions removed outright, however many listings they have.
drop_regions = {
    'Mareth',
    'Mdenine Nord',
    'Mdenine Sud',
    'Medenine',
    'Medenine Nord',
    'Regueb',
    'Ouerdanine',
    'Mtouia',
}
df = df.loc[~df['region'].isin(drop_regions)].copy()

# Keep only regions that still have more than two listings. Rows whose region
# is missing never appear in value_counts(), so they are removed here too.
region_counts = df['region'].value_counts()
populated_regions = region_counts.index[region_counts > 2]
df = df.loc[df['region'].isin(populated_regions)].copy()

# Summary of the whole normalization cell.
print("Applied normalization, mappings, splits, and drops")
print(f"Unique regions after normalization: {df['region'].nunique()}")
print(f"Rows after drops: {len(df):,}")

Applied normalization, mappings, splits, and drops
Unique regions after normalization: 239
Rows after drops: 23,510


In [17]:
# Lift the display row cap so the full per-region tally is printed instead of
# a truncated preview. This setting stays in effect for later cells.
pd.set_option('display.max_rows', None)

# Listings per region, sorted by region name.
region_counts = df['region'].value_counts().sort_index()

print(
    f"Total unique regions: {len(region_counts)}",
    "\nAll unique regions:",
    region_counts,
    sep="\n",
)

Total unique regions: 239

All unique regions:
region
Agba                               13
Aghir                              22
Ain Draham                          5
Ain Zaghouan Nord                 198
Ain Zaghouan Sud                   88
Akouda                            274
Alain Savary                       12
Ariana Essoughra                   63
Ariana Ville                      796
Bab Souika                         12
Barraket Essahel                   11
Beja Nord                           3
Beja Sud                            9
Bekalta                            10
Bellevue                            4
Belvedere                          22
Ben Arous                          65
Beni Khalled                        6
Bizerte Nord                      114
Bizerte Sud                       122
Bni Khalled                         5
Bni Khiar                          82
Borj Cedria                        41
Borj El Amri                        8
Borj Louzir                       

In [18]:
# Destination for the corrected dataset, in the same folder as the input
# file. The raw string is split into adjacent literals only for readability;
# the concatenated value is exactly the original path.
output_path = (
    r"C:\Users\user\OneDrive\Bureau\Data Mining Projecy"
    r"\Tunisan-Real-Estate-Price-Prediction-Platform"
    r"\ML\data\processed\cleaned_real_estate_fixed.csv"
)

# region_original is only a working column; errors='ignore' lets the save
# succeed even when that column is absent.
export_frame = df.drop(columns=['region_original'], errors='ignore')
export_frame.to_csv(output_path, index=False)

print(f"✅ Saved to: {output_path}")

✅ Saved to: C:\Users\user\OneDrive\Bureau\Data Mining Projecy\Tunisan-Real-Estate-Price-Prediction-Platform\ML\data\processed\cleaned_real_estate_fixed.csv
