In [1]:
import pandas as pd
import random
from faker import Faker

# Faker instance configured for the en_US locale
fake = Faker("en_US")

# Number of synthetic listings to generate
num_rows = 5000

# Town name -> (latitude, longitude) used as the centre point for listings
locations = {
    "Nairobi": (-1.2921, 36.8219),
    "Mombasa": (-4.0435, 39.6682),
    "Kisumu": (-0.0917, 34.7680),
    "Nakuru": (-0.3031, 36.0800),
    "Eldoret": (0.5204, 35.2698),
    "Thika": (-1.0333, 37.0700),
    "Naivasha": (-0.7167, 36.4333),
    "Malindi": (-3.2170, 40.1169),
    "Garissa": (-0.4569, 39.6583),
    "Wajir": (1.7500, 40.0667),
}

# Categorical values drawn at random for each listing
property_types = [
    "Apartment",
    "House",
    "Villa",
    "Townhouse",
    "Bungalow",
]
furnished_statuses = [
    "Furnished",
    "Semi-Furnished",
    "Unfurnished",
]

# Build a comma-separated string of randomly chosen amenities
def generate_amenities():
    """Return one to four distinct amenities joined by a comma and a space.

    The number of amenities is drawn first, then that many distinct entries
    are sampled from the fixed list of options.
    """
    options = ["Pool", "Gym", "Garden", "Parking", "Security", "Playground"]
    how_many = random.randint(1, 4)
    chosen = random.sample(options, how_many)
    return ", ".join(chosen)

# Generate dataset
# The price multipliers and the list of location names do not change between
# rows, so they are built once here rather than on every loop iteration.
location_multiplier = {
    "Nairobi": 1.6, "Mombasa": 1.4, "Kisumu": 1.2, "Nakuru": 1.1, "Eldoret": 1.0,
    "Thika": 0.9, "Naivasha": 0.95, "Malindi": 1.0, "Garissa": 0.7, "Wajir": 0.6
}
location_names = list(locations.keys())

# Each row is a list whose order matches the DataFrame columns:
# id, location, type, bedrooms, bathrooms, square feet, year built, parking,
# furnished status, amenities, latitude, longitude, price.
data = []
for i in range(num_rows):
    location = random.choice(location_names)
    lat, lon = locations[location]
    # Jitter the reference coordinates by up to +/-0.05 degrees so listings
    # in the same location do not all share one point
    latitude = round(lat + random.uniform(-0.05, 0.05), 6)
    longitude = round(lon + random.uniform(-0.05, 0.05), 6)

    property_type = random.choice(property_types)
    bedrooms = random.randint(1, 6)
    bathrooms = random.randint(1, 5)
    square_feet = random.randint(400, 4000)
    year_built = random.randint(2018, 2024)
    parking = random.randint(0, 3)
    furnished_status = random.choice(furnished_statuses)
    amenities = generate_amenities()

    # Price formula: (area * random per-sq-ft rate in KES) scaled by the
    # location multiplier, plus a flat 100,000 per bedroom
    base_price = square_feet * random.randint(5000, 8000)
    price = int(base_price * location_multiplier[location] + (bedrooms * 100000))

    # property_id is 1-based
    data.append([
        i + 1, location, property_type, bedrooms, bathrooms, square_feet,
        year_built, parking, furnished_status, amenities,
        latitude, longitude, price
    ])

# Assemble the generated rows into a DataFrame with named columns
columns = [
    "property_id",
    "location",
    "property_type",
    "bedrooms",
    "bathrooms",
    "square_feet",
    "year_built",
    "parking",
    "furnished_status",
    "amenities",
    "latitude",
    "longitude",
    "price",
]
df = pd.DataFrame(data=data, columns=columns)

# Write the table to disk without the index column
df.to_csv("kenya_real_estate.csv", index=False)

# Report completion and show the first ten rows
print("✅ Dataset generated: kenya_real_estate.csv with", num_rows, "rows")
preview = df.head(10)
print(preview)


✅ Dataset generated: kenya_real_estate.csv with 5000 rows
   property_id  location property_type  bedrooms  bathrooms  square_feet  \
0            1    Nakuru     Townhouse         4          5         2653   
1            2    Nakuru         House         5          4         2506   
2            3     Thika     Apartment         5          5         2473   
3            4   Nairobi     Apartment         6          1          483   
4            5   Garissa     Apartment         1          3         1546   
5            6    Nakuru     Apartment         5          4         1783   
6            7  Naivasha      Bungalow         4          2          506   
7            8   Eldoret         Villa         5          4         1467   
8            9   Nairobi         House         5          5         1671   
9           10    Nakuru         House         6          4         1314   

   year_built  parking furnished_status                           amenities  \
0        2024        1   S