# Real Estate Price Prediction - Part 1: Data Exploration & Preprocessing

This notebook explores the real estate dataset and prepares it for modeling.

## Dataset Features:
- **price**: Target variable (price of property)
- **transaction**: Type of transaction (rent or sale)
- **city**: City location
- **region**: Region within city
- **surface**: Surface area of property
- **bathrooms**: Number of bathrooms
- **rooms**: Number of rooms
- **property_type**: Type of property (villa, studio, apartment)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Load and Inspect Data

In [None]:
# Load the dataset - adjust the path to your CSV file
!pwd
!ls sample_data/
from google.colab import files
files.upload('final_real_estate_dataset.csv')


df = pd.read_csv('"/content/final_real_estate_dataset.csv')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()

/content
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md


KeyboardInterrupt: 

In [None]:
# Basic information
print("Dataset Info:")
df.info()
print("\n" + "="*50)
print("\nBasic Statistics:")
df.describe()

In [None]:
# Check for missing values
print("Missing Values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})
print(missing_df[missing_df['Missing Count'] > 0])

## 2. Exploratory Data Analysis

In [None]:
# Distribution of target variable (price)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Original price distribution
axes[0].hist(df['price'], bins=50, edgecolor='black')
axes[0].set_title('Price Distribution')
axes[0].set_xlabel('Price')
axes[0].set_ylabel('Frequency')

# Log-transformed price distribution
axes[1].hist(np.log1p(df['price']), bins=50, edgecolor='black', color='orange')
axes[1].set_title('Log-Transformed Price Distribution')
axes[1].set_xlabel('Log(Price)')
axes[1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print(f"Price Statistics:")
print(f"Mean: {df['price'].mean():.2f}")
print(f"Median: {df['price'].median():.2f}")
print(f"Std: {df['price'].std():.2f}")
print(f"Min: {df['price'].min():.2f}")
print(f"Max: {df['price'].max():.2f}")

In [None]:
# Categorical features distribution
categorical_features = ['transaction', 'city', 'region', 'property_type']

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

for idx, col in enumerate(categorical_features):
    value_counts = df[col].value_counts()
    axes[idx].bar(range(len(value_counts)), value_counts.values)
    axes[idx].set_xticks(range(len(value_counts)))
    axes[idx].set_xticklabels(value_counts.index, rotation=45, ha='right')
    axes[idx].set_title(f'{col.capitalize()} Distribution')
    axes[idx].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Numerical features distribution
numerical_features = ['surface', 'bathrooms', 'rooms']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, col in enumerate(numerical_features):
    axes[idx].hist(df[col], bins=30, edgecolor='black', color='skyblue')
    axes[idx].set_title(f'{col.capitalize()} Distribution')
    axes[idx].set_xlabel(col.capitalize())
    axes[idx].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Price by categorical features
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

for idx, col in enumerate(categorical_features):
    df.groupby(col)['price'].mean().plot(kind='bar', ax=axes[idx], color='coral')
    axes[idx].set_title(f'Average Price by {col.capitalize()}')
    axes[idx].set_xlabel(col.capitalize())
    axes[idx].set_ylabel('Average Price')
    axes[idx].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix for numerical features
numerical_cols = ['price', 'surface', 'bathrooms', 'rooms']
correlation_matrix = df[numerical_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=1, fmt='.2f')
plt.title('Correlation Matrix - Numerical Features')
plt.show()

In [None]:
# Scatter plots: numerical features vs price
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, col in enumerate(numerical_features):
    axes[idx].scatter(df[col], df['price'], alpha=0.5)
    axes[idx].set_xlabel(col.capitalize())
    axes[idx].set_ylabel('Price')
    axes[idx].set_title(f'Price vs {col.capitalize()}')

plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Create a copy for preprocessing
df_processed = df.copy()

# Handle missing values if any
# For numerical features: fill with median
for col in numerical_features:
    if df_processed[col].isnull().sum() > 0:
        df_processed[col].fillna(df_processed[col].median(), inplace=True)

# For categorical features: fill with mode
for col in categorical_features:
    if df_processed[col].isnull().sum() > 0:
        df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)

print("Missing values after handling:")
print(df_processed.isnull().sum())

In [None]:
# Detect and handle outliers (optional - using IQR method)
def remove_outliers_iqr(df, column, multiplier=1.5):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print(f"Original dataset size: {len(df_processed)}")

# Remove outliers from price (you can adjust or skip this)
df_processed = remove_outliers_iqr(df_processed, 'price', multiplier=3)

print(f"Dataset size after outlier removal: {len(df_processed)}")
print(f"Removed {len(df) - len(df_processed)} outliers")

In [None]:
# Encode categorical variables
label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    df_processed[f'{col}_encoded'] = le.fit_transform(df_processed[col])
    label_encoders[col] = le
    
    # Print encoding mapping
    print(f"\n{col} encoding:")
    for idx, label in enumerate(le.classes_):
        print(f"  {label} -> {idx}")

In [None]:
# Prepare feature matrix and target variable
feature_columns = ['surface', 'bathrooms', 'rooms', 
                   'transaction_encoded', 'city_encoded', 
                   'region_encoded', 'property_type_encoded']

X = df_processed[feature_columns]
y = df_processed['price']

print("Feature matrix shape:", X.shape)
print("Target variable shape:", y.shape)
print("\nFeatures used:")
print(feature_columns)

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for easier handling
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_columns, index=X_test.index)

print("Data scaling completed!")

In [None]:
# Save preprocessed data for modeling
import pickle

# Save the preprocessed data
with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump({
        'X_train': X_train,
        'X_test': X_test,
        'X_train_scaled': X_train_scaled,
        'X_test_scaled': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'label_encoders': label_encoders,
        'feature_columns': feature_columns
    }, f)

print("Preprocessed data saved to 'preprocessed_data.pkl'")

## Summary

In this notebook, we:
1. Loaded and explored the real estate dataset
2. Visualized distributions and relationships
3. Handled missing values and outliers
4. Encoded categorical variables
5. Split data into train/test sets
6. Scaled numerical features
7. Saved preprocessed data for modeling

Next steps: Use the preprocessed data in modeling notebooks to test different algorithms!