# Data Exploration
This notebook conducts an exploratory data analysis (EDA) of the House Price Prediction Challenge dataset to establish a foundational understanding of the data characteristics, quality, and structure prior to model development. The primary objectives are to:
- Assess Data Quality
- Characterise Feature Distributions
- Identify Data Anomalies
- Understand Feature Relationships
- Esablish Baseline Understanding

In [1]:
# Setup imports
import config
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
# Load data from csv into dataframe
data_df = pd.read_csv(config.RAW_DATA_FILE, index_col=0)

In [3]:
# View the first few rows of data to check column headers and value types
print("DataFrame")
display(data_df.head())

DataFrame


Unnamed: 0_level_0,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
POSTED_BY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [4]:
def create_data_quality_summary(df: pd.DataFrame, dataset_name: str) -> dict:
  """Create a data quality summary report.

  Parameters
  ----------
  df : pandas.DataFrame
    Input dataframe to analyse
  dataset_name : str
    Name your dataset for display
  
  Returns
  ----------
  dict: Dictionary containing key data quality metrics 
  """
  # Basic dataset characteristics
  total_cells = len(df) * len(df.columns)
  missing_cells = df.isnull().sum().sum()

  # Memory usage calculation
  memory_usage = df.memory_usage(deep=True).sum() / (1024**2) # Convert to MB

  # Feature type breakdown
  numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
  categorical_features = df.select_dtypes(exclude=[np.number]).columns.tolist()

  # Data quality metrics
  summary = {
    'Dataset': dataset_name,
    'Rows': len(df),
    'Columns': len(df.columns),
    'Total Cells': total_cells,
    'Missing Cells': missing_cells,
    'Missing %': round((missing_cells / total_cells) * 100, 2),
    'Complete Rows': len(df.dropna()),
    'Complete %': round((len(df.dropna()) / len(df)) * 100, 2),
    'Duplicate Rows': df.duplicated().sum(),
    'Duplicate %': round((df.duplicated().sum() / len(df)) * 100, 2),
    'Numerical Features': len(numerical_features),
    'Categorical Features': len(categorical_features),
    'Memory Usage (MB)': round(memory_usage, 2),
    'Unique Rows': len(df.drop_duplicates()),
    'Data Density %': round(((total_cells - missing_cells) / total_cells) * 100, 2)
  }

  return summary

In [5]:
# Create a Data Quality summary for each dataset and display
dq_summary = create_data_quality_summary(data_df, 'House Prices Dataset')
quality_summary = pd.DataFrame(dq_summary, index=[0])

print("Data Quality Summary")
display(quality_summary)

Data Quality Summary


Unnamed: 0,Dataset,Rows,Columns,Total Cells,Missing Cells,Missing %,Complete Rows,Complete %,Duplicate Rows,Duplicate %,Numerical Features,Categorical Features,Memory Usage (MB),Unique Rows,Data Density %
0,House Prices Dataset,29451,11,323961,0,0.0,29451,100.0,457,1.55,9,2,7.64,28994,100.0


In [6]:
# Save data quality results to file
quality_summary.to_csv(config.RESULTS_DIR / 'data_quality_summary.csv', index=False)