In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 

In [None]:
# Read the raw housing CSV into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
csv_path = "/Users/sushil0711/Downloads/messy_housing_data.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Display the DataFrame's dimensions as a (rows, columns) tuple.
data.shape

In [None]:
# Normalize header casing (first letter upper-case, the rest lower-case),
# then give the price and construction-year columns labels that carry units.
column_aliases = {
    "Price": "Price($)",
    "Year_built": "Year_built(AD)",
}
data.columns = data.columns.str.capitalize()
data = data.rename(columns=column_aliases)
data.head()

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes, and memory usage.
data.info()

In [None]:
# Show the current dtype of each column.
data.dtypes

In [None]:
# Count fully duplicated rows, report the count, then drop them.
# The count is printed explicitly: a notebook cell only displays its last
# expression, so a bare `data.duplicated().sum()` followed by another
# statement would never be shown.
duplicate_count = int(data.duplicated().sum())
print(f"Duplicate rows removed: {duplicate_count}")

# Reassign instead of using inplace=True so re-running the cell is harmless
# and the statement composes with other pandas calls.
data = data.drop_duplicates()

In [None]:
# Cast the price column to a numeric dtype; any entry that cannot be parsed
# as a number becomes NaN (errors="coerce") instead of raising.
price_column = "Price($)"
data[price_column] = pd.to_numeric(data[price_column], errors="coerce")
data.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Count missing (null) values in each column.
data.isnull().sum()

In [None]:
# Impute missing values in selected numeric columns with the column median.
#
# The filled Series is assigned back to the DataFrame. Calling
# `data[col].fillna(..., inplace=True)` operates on a column selection
# (chained assignment): pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves `data` unchanged.
median_filled_columns = ["Bedrooms", "Year_built(AD)", "Price($)"]
for column in median_filled_columns:
    data[column] = data[column].fillna(data[column].median())
data.head()

In [None]:
# Report how many Location values are missing, then fill them with the most
# frequent location (the mode).
missing_locations = int(data["Location"].isnull().sum())
print(f"Missing Location values: {missing_locations}")

# mode() returns an empty Series when every value is missing; guard so the
# cell does not fail on the [0] lookup in that case.
location_modes = data["Location"].mode()
if not location_modes.empty:
    # Assign back instead of fillna(inplace=True) on a column selection,
    # which is chained assignment and does not update `data` under
    # Copy-on-Write.
    data["Location"] = data["Location"].fillna(location_modes.iloc[0])

In [None]:
# Store bedroom counts and construction years as integers.
# astype(int) truncates any fractional part and raises if NaN values remain.
integer_dtypes = {"Bedrooms": int, "Year_built(AD)": int}
data = data.astype(integer_dtypes)
data.head()

In [None]:
# List the distinct Location values currently present (useful for spotting
# inconsistent spellings and casing).
data["Location"].unique()

In [None]:
# Trim surrounding whitespace and title-case every Location value, then list
# the distinct values that remain.
trimmed_locations = data["Location"].str.strip()
data["Location"] = trimmed_locations.str.title()
data["Location"].unique()

In [None]:
# Map city abbreviations and misspellings to one canonical name each.
#
# Patterns are matched case-insensitively and only as whole tokens. An
# unanchored pattern such as "la" also matches inside unrelated names
# (e.g. "Dallas", "Atlanta", "Portland") and would relabel them as
# "Los Angeles"; an unescaped "." matches any character.
# (?!\w) is used instead of a trailing \b for the "L.A." variants because
# \b does not match after a final "." at the end of the string.
location_patterns = {
    r"\bnyc\b": "New York City",
    r"\b(?:la|l\.a\.?)(?!\w)": "Los Angeles",
    r"\bsf\b": "San Francisco",
    r"\bbostan\b": "Boston",
    r"\bchicgo\b": "Chicago",
}
for pattern, city in location_patterns.items():
    # na=False treats missing locations as non-matching.
    matches = data["Location"].str.contains(pattern, case=False, regex=True, na=False)
    data.loc[matches, "Location"] = city
data.head()


In [None]:
# Remove outliers from every numeric column using the 1.5 * IQR rule.
# A row is kept only if, for each numeric column, its value lies inside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
numeric_cols = data.select_dtypes(include = "number").columns
mask = pd.Series(True, index = data.index)
for col in numeric_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Bounds are inclusive. With strict comparisons a column whose IQR is 0
    # (lower == upper) would reject every row and empty the DataFrame, and
    # values sitting exactly on a fence would be dropped as outliers.
    # between() is False for NaN, so rows with a missing value in a numeric
    # column are removed here as well.
    mask &= data[col].between(lower, upper)

# .copy() makes the result an independent frame so later column assignments
# do not trigger SettingWithCopyWarning.
data = data[mask].copy()
data.shape

In [None]:
# Visualize distribution and scale of key metrics: one notched box plot per
# numeric column, side by side.
label = ["Area_sqft", "Bedrooms", "Bathrooms", "Year_built(AD)", "Price($)"]

# Size the grid from the column list so the two cannot drift apart.
fig, ax = plt.subplots(1, len(label), figsize = (28, 8))

for i, j in enumerate(label):
    ax[i].boxplot(
        data[j],
        notch = True,
        patch_artist = True
    )
    # Label the single box through the axis rather than boxplot(labels=...):
    # that keyword is deprecated since Matplotlib 3.9 (renamed tick_labels),
    # while set_xticklabels works on old and new versions alike.
    ax[i].set_xticklabels([j])
plt.tight_layout()
plt.show()

In [None]:
# Histograms (15 bins each) of the numeric columns, one panel per column,
# each with its own colour and x-axis label.
columns = ["Area_sqft", "Bedrooms", "Bathrooms", "Year_built(AD)", "Price($)"]
colors = ["blue", "red", "green", "skyblue", "orange"]
xlabels = ["Area (sqft)", "Bedrooms", "Bathrooms", "Year Built", "Price"]

fig, ax = plt.subplots(1, 5, figsize=(28, 8))

# Walk the panels and their column / colour / label settings in lockstep.
for panel, column_name, bar_color, axis_title in zip(ax, columns, colors, xlabels):
    panel.hist(data[column_name], bins=15, color=bar_color)
    panel.set_xlabel(axis_title)
    panel.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Pie chart of how the listings are distributed across locations.
counts = data["Location"].value_counts()
fig, ax = plt.subplots(figsize=(10, 5))

ax.pie(
    counts.values,
    labels = counts.index,
    autopct = "%1.1f%%",
    # One offset per wedge. pie() raises ValueError when len(explode) differs
    # from the number of wedges, so derive the length from the data instead
    # of hard-coding a fixed number of entries.
    explode = [0.1] * len(counts),
    shadow = True
)
ax.set_title("Location Distribution")
plt.tight_layout()
plt.show()