# Data Exploration, Pre-processing and Wrangling

BITS F464 Assignment 1

### Installing and Importing Dependencies

In [None]:
%pip install -q numpy
%pip install -q pandas
%pip install -q matplotlib
%pip install -q seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io

### Task 2

In [None]:
# Task 2: load the housing dataset and report how many rows it contains.
df = pd.read_csv('./data/housing.csv')

print("Number of rows: ", len(df))

# First five records, rendered as the cell output.
df.head()

### Task 3

In [None]:
# Task 3: descriptive statistics, then mean / median / mode of the numeric columns.
# Each heading and its table are emitted by one print call; sep="\n" reproduces
# the blank line that separates a heading from its table.
print("Descriptive Statistics: \n", df.describe(), sep="\n")
print("\nMean: \n", df.mean(numeric_only=True), sep="\n")
print("\nMedian: \n", df.median(numeric_only=True), sep="\n")
print("\nMode: \n", df.mode(numeric_only=True), sep="\n")

### Task 4

In [None]:
# Task 4: for every int64/float64 column, plot its min, max, mean, median and
# standard deviation as a small line chart.
for col in df.columns:
    if df[col].dtype not in ['int64', 'float64']:
        continue  # columns of any other dtype are skipped

    # Map each displayed label to the Series method that computes it.
    stats = {
        label: getattr(df[col], method)()
        for label, method in (
            ('min', 'min'),
            ('max', 'max'),
            ('mean', 'mean'),
            ('median', 'median'),
            ('std_dev', 'std'),
        )
    }

    plt.figure(figsize=(4, 2.5))
    plt.plot(
        list(stats),
        list(stats.values()),
        marker='o',
        color='blue',
        linestyle='-',
        linewidth=2,
    )
    plt.title(f'Statistics for {col}')
    plt.ylabel('Value')
    plt.grid(True)
    plt.show()

    print("\n")


# ALTERNATIVELY
# plt.plot(df.describe())
# plt.show()

### Task 5 and 6

In [None]:
def fun(col):
    """Print and plot summary statistics for one column of the global ``df``.

    The minimum, maximum, mean, median and standard deviation of ``df[col]``
    are printed one per line and then drawn as a small line chart.

    Args:
        col: Name of the column in ``df`` to summarise.
    """
    series = df[col]
    stats = {
        'min': series.min(),
        'max': series.max(),
        'mean': series.mean(),
        'median': series.median(),
        'std_dev': series.std(),
    }

    # Two print arguments keep the "label:  value" spacing of the output.
    for label, value in stats.items():
        print(f"{label}: ", value)

    plt.figure(figsize=(4, 2.5))
    plt.plot(
        list(stats),
        list(stats.values()),
        marker='o',
        color='blue',
        linestyle='-',
        linewidth=2,
    )
    plt.title(f'Statistics for {col}')
    plt.ylabel('Value')
    plt.grid(True)
    plt.show()


In [None]:

# Tasks 5 and 6: fill missing values with the column median and show the
# column's statistics before and after, so the effect of imputation is visible.
# df[col].isna().sum() is the number of missing entries in that column.
# print(df.isna().sum())  # per-column missing-value counts before imputing

for col in df.columns:

    if df[col].isna().sum() > 0:
        print("\n"+col)
        print("Before: \n")
        fun(col)  # print stats for column before imputing
        # Assign the filled column back. Calling fillna(..., inplace=True) on
        # df[col] is chained assignment on an intermediate object: pandas
        # deprecates it, and under Copy-on-Write it leaves df unchanged.
        df[col] = df[col].fillna(df[col].median())
        print("After: \n")
        fun(col)  # print stats for column after imputing

# print(df.isna().sum())  # to confirm that our imputation worked

### Task 7

In [None]:
# Task 7: detect duplicate rows and build a de-duplicated copy of the data.
print("Initial number of rows: "+str(df.shape[0]))
has_duplicates = df.duplicated().any()
print(f"Are there any duplicate rows? {has_duplicates}")
# drop_duplicates() returns a new DataFrame rather than modifying its receiver,
# so the result must be captured; df itself is left untouched.
duplicatesRemoved = df.drop_duplicates()
print("Final number of rows: "+str(duplicatesRemoved.shape[0]))

### Task 8

In [None]:
# Task 8: geographic scatter plot with each point coloured by its median house value.
# c= supplies the data that the colorbar maps; without it the colorbar labelled
# 'Median House Value' has no values behind it.
scatterplot = plt.scatter(
    df['longitude'],
    df['latitude'],
    c=df['median_house_value'],
)

colorbar = plt.colorbar(scatterplot)
colorbar.set_label('Median House Value')

plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Scatter Plot of Housing Data: Longitude vs. Latitude')

plt.show()


### Task 9

In [None]:
# Task 9: number of rows below, then at or above, 1500 total bedrooms.
print(len(df.loc[df['total_bedrooms'].lt(1500)]))
print(len(df.loc[df['total_bedrooms'].ge(1500)]))

There are 19779 entries whose total bedrooms value is less than 1500.

In [None]:
# Split the rows at 1500 total bedrooms and overlay one histogram per group.
houses_bedrooms_less_than_1500 = df.loc[df['total_bedrooms'].lt(1500)]
houses_bedrooms_greater_equal_1500 = df.loc[df['total_bedrooms'].ge(1500)]

plt.figure(figsize=(10, 6))

plt.hist(
    houses_bedrooms_less_than_1500['total_bedrooms'],
    bins=20,
    alpha=0.5,
    color='green',
    label='Bedrooms < 1500',
)

plt.hist(
    houses_bedrooms_greater_equal_1500['total_bedrooms'],
    bins=20,
    alpha=0.5,
    color='red',
    label='Bedrooms ≥ 1500',
)

plt.title('Histogram of Houses by Total Bedrooms')
plt.xlabel('Total Bedrooms')
plt.ylabel('Number of Houses')
plt.legend()
plt.show()

Plotting number of houses having median house value > $100000

In [None]:
# Mean of the median_house_value column, shown as the cell output.
k = df.loc[:, 'median_house_value']
k.mean()

In [None]:
# Number of rows with a median house value below, then at or above, $100,000.
house_value_less_than_100000 = df.loc[df['median_house_value'].lt(100000)]
house_value_gr_than_100000 = df.loc[df['median_house_value'].ge(100000)]
print(len(house_value_less_than_100000))
print(len(house_value_gr_than_100000))

In [None]:
# Split the rows at a median house value of $100,000 and overlay one histogram per group.
houses_value_less_than_100000 = df.loc[df['median_house_value'].lt(100000)]
houses_value_greater_equal_100000 = df.loc[df['median_house_value'].ge(100000)]

plt.figure(figsize=(10, 6))

plt.hist(
    houses_value_less_than_100000['median_house_value'],
    bins=20,
    alpha=0.5,
    color='blue',
    label='Value < $100,000',
)

plt.hist(
    houses_value_greater_equal_100000['median_house_value'],
    bins=20,
    alpha=0.5,
    color='orange',
    label='Value ≥ $100,000',
)

plt.title('Histogram of Houses by Median House Value')
plt.xlabel('Median House Value')
plt.ylabel('Number of Houses')
plt.legend()
plt.show()

In [None]:
# Mean of the population column; it is the split point for the cells that follow.
population_mean = df.loc[:, 'population'].mean()
population_mean

In [None]:
# Split the rows at the mean population and report the size of each side.
houses_population_less_than_mean = df.loc[df['population'].lt(population_mean)]
houses_population_greater_equal_mean = df.loc[df['population'].ge(population_mean)]
print(len(houses_population_less_than_mean))
print(len(houses_population_greater_equal_mean))

In [None]:
# Overlay the population histograms of the below-mean and at-or-above-mean groups.
plt.figure(figsize=(10, 6))

plt.hist(
    houses_population_less_than_mean['population'],
    bins=20,
    alpha=0.5,
    color='red',
    label='Population < mean',
)

plt.hist(
    houses_population_greater_equal_mean['population'],
    bins=20,
    alpha=0.5,
    color='blue',
    label='Population ≥ mean',
)

plt.title('Histogram of Houses by Population (< mean vs ≥ mean)')
plt.xlabel('Population')
plt.ylabel('Number of Houses')
plt.legend()
plt.show()

In [None]:
# Bar chart of the number of rows in each ocean_proximity category.
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='ocean_proximity', color='blue')
plt.xlabel('Ocean Proximity')
plt.ylabel('Number of Houses')
plt.title('Distribution of Houses by Ocean Proximity')
plt.show()

### Task 10

In [None]:
from sklearn.preprocessing import LabelEncoder

# Task 10: group the rows by ocean_proximity and compute summary statistics
# for each group.
grouped = df.groupby(by='ocean_proximity').describe()
print(grouped)

In [None]:
# Encode the categorical ocean_proximity values as integer codes for potential
# modelling purposes.
# The guard makes this cell safe to re-run. Once the column holds integer codes,
# refitting a new encoder would learn those codes as its classes and lose the
# original category names that later plots read from ocean_proximity_le.classes_.
if not pd.api.types.is_numeric_dtype(df['ocean_proximity']):
    ocean_proximity_le = LabelEncoder()
    df['ocean_proximity'] = ocean_proximity_le.fit_transform(df['ocean_proximity'])

In [None]:
# Visualise how each numerical feature is distributed across the
# ocean_proximity categories, one boxplot figure per feature.
sns.set(style="whitegrid")

numerical_features = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

for feature in numerical_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x='ocean_proximity', y=feature)
    plt.title(f'Distribution of {feature} by Ocean Proximity')
    plt.xlabel('Ocean Proximity')
    plt.ylabel(feature)
    # Relabel the integer-coded categories with the names the encoder learned.
    plt.xticks(
        ticks=range(len(ocean_proximity_le.classes_)),
        labels=ocean_proximity_le.classes_,
    )
    plt.show()

### Task 11

In [None]:
# Task 11: correlation heatmap of the numerical features, to identify which of
# them are related to each other.
numerical_features = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

correlation_matrix = df.loc[:, numerical_features].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# One boxplot per numerical attribute on a 3x3 grid, to show each distribution
# and its potential outliers.
numerical_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

df[numerical_columns].plot.box(subplots=True, layout=(3, 3), figsize=(15, 10))

plt.suptitle('Boxplots of Numerical Attributes')
plt.show()

### Drawing Insights from the Heatmap and Boxplots

The following features have **strong correlation** with each other:

•	*total_rooms and total_bedrooms*

•	*total_rooms and households*

•	*total_rooms and population*

•	*total_bedrooms and population*

•	*total_bedrooms and households*

•	*population and households*

•	*median_income and median_house_value*





**Outlier Detection:**

**Total Rooms, Total Bedrooms, Population, and Households:**

These attributes have a significant number of outliers, as indicated by the many points plotted above the upper whisker of each boxplot.

**Median Income:**

The distribution of median income seems to be broader, indicating more variability in income levels among the households.

**Median House Value:**

Outliers: The median house value also contains outliers, though there are relatively few of them compared to attributes like total rooms or population.

*Taking the target variable as **median_house_value**, we get the **feature importance** as follows:*

**Median Income (0.69):** This has the highest positive correlation with
median_house_value, suggesting it is the most important feature.

**Total Rooms (0.13):** This has a small positive correlation with median_house_value, indicating only limited importance.

**Housing Median Age (0.11)**: This also has a small positive correlation with median_house_value.

**Households (0.066)**: Slight positive correlation, indicating it might have some relevance.

**Total Bedrooms (0.05)**: Very small positive correlation, suggesting low importance.

**Population (-0.025)**: Near-zero correlation, indicating it is not very important.

In [None]:
# Derived feature: median_house_value divided by total_rooms.
# NOTE(review): despite the column name, no floor area is involved. This is
# value per room, not per square foot.
df['cost_per_sqft'] = df['median_house_value'].div(df['total_rooms'])

In [None]:
# "Premium" rows are those whose median house value is strictly above the
# 90th percentile.
threshold = df['median_house_value'].quantile(0.90)

premium_houses = df.loc[df['median_house_value'].gt(threshold)]
print(len(premium_houses))
premium_houses.head()

In [None]:
# Number of premium houses in each ocean_proximity category.
plt.figure(figsize=(12, 6))
sns.countplot(x='ocean_proximity', data=premium_houses)
plt.title('Distribution of Premium Houses by Ocean Proximity')
plt.show()

In [None]:
# Histogram of median income among the premium houses.
plt.figure(figsize=(12, 6))
sns.histplot(premium_houses, x='median_income', bins=10)
plt.title('Distribution of Median Income for Premium Houses')
plt.show()

In [None]:
# Histogram of the cost_per_sqft column among the premium houses.
plt.figure(figsize=(12, 6))
sns.histplot(premium_houses, x='cost_per_sqft', bins=100)
plt.title('Distribution of Cost per Square Foot for Premium Houses')
plt.show()

In [None]:
# Same distribution after a log1p transform, i.e. log(1 + x).
plt.figure(figsize=(12, 6))
sns.histplot(
    data=np.log1p(premium_houses['cost_per_sqft']),
    bins=30,
)
plt.xlabel('Log(Cost per Square Foot)')
plt.title('Log-Scaled Distribution of Cost per Square Foot for Premium Houses')
plt.show()

### Task 12

In [None]:
# Task 12
def visualize_col(col):
    """Draw three side-by-side views of one column of the global ``df``.

    The panels are a KDE of the column, a boxplot of the column, and a
    scatterplot of the column against ``median_house_value``.

    Args:
        col: Name of the column in ``df`` to visualise.
    """
    fig, (kde_ax, box_ax, scatter_ax) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
    values = df[col]

    sns.kdeplot(values, ax=kde_ax)
    kde_ax.set_title(f'KDE of {col}')

    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    sns.scatterplot(x=values, y=df['median_house_value'], ax=scatter_ax)
    scatter_ax.set_title(f'Scatterplot of {col} vs median_house_value')

    plt.tight_layout()
    plt.show()

In [None]:
# Task 12 example: KDE, boxplot and scatterplot for the total_bedrooms column.
visualize_col('total_bedrooms')

### Task 13

In [None]:
# Task 13
def visualize_outlier(col, n_std=2):
    """Count and highlight the outliers of one column of the global ``df``.

    A value is treated as an outlier when it lies more than ``n_std`` standard
    deviations from the column mean. The count is printed, then a histogram of
    the column is drawn with the outlier regions shaded in red.

    Args:
        col: Name of the column in ``df`` to analyse.
        n_std: Number of standard deviations from the mean beyond which a
            value counts as an outlier. Defaults to 2.
    """
    values = df[col]
    mean = values.mean()
    std_dev = values.std()

    lower_bound = mean - n_std * std_dev
    upper_bound = mean + n_std * std_dev

    outliers = df[(values < lower_bound) | (values > upper_bound)]

    print(f'Total number of outliers in {col}: {outliers.shape[0]}')

    plt.figure(figsize=(12, 5))
    sns.histplot(values, bins=45)

    # Shade a tail only when its bound falls inside the observed range.
    # Otherwise there are no outliers on that side, and the span would cover
    # an empty region beyond the data.
    col_min = values.min()
    col_max = values.max()
    if lower_bound > col_min:
        plt.axvspan(col_min, lower_bound, color='red', alpha=0.3)
    if upper_bound < col_max:
        plt.axvspan(upper_bound, col_max, color='red', alpha=0.3)

    plt.xlabel(col)
    plt.ylabel('Frequency')

    plt.show()


In [None]:
# Task 13 example: count and shade the outliers of median_house_value.
visualize_outlier('median_house_value')

### Task 14

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Task 14: every numeric column currently in df is scaled.
# NOTE(review): if the earlier cells ran in order, this also picks up the
# label-encoded ocean_proximity and the derived cost_per_sqft columns. Confirm
# that scaling them is intended.
numerical_features = df.select_dtypes(include='number').columns
print(numerical_features)

def scaler(df, numerical_features, scaler):
    """Fit a scaler on the selected columns and return the scaled values as a DataFrame.

    Args:
        df: Source DataFrame. It is not modified.
        numerical_features: Labels of the columns of ``df`` to scale.
        scaler: Object exposing ``fit_transform``, e.g. a scikit-learn scaler
            instance. Inside this function the parameter shadows the
            function's own name.

    Returns:
        A new DataFrame with the scaled values, the columns
        ``numerical_features`` and the same index as ``df``.
    """
    features = df[numerical_features]
    scaled_values = scaler.fit_transform(features)
    # Keep the source index so scaled rows stay aligned with the rows of df.
    # Without it the result is renumbered 0..n-1.
    return pd.DataFrame(scaled_values, columns=numerical_features, index=features.index)

In [None]:
# Standard Scaler: scale the numeric columns and preview the first five rows.
df_standard_scaled = scaler(df, numerical_features, scaler=StandardScaler())
df_standard_scaled.head(n=5)

In [None]:
# Per-column minimum and maximum after standard scaling.
print("StandardScaler ranges:")
df_standard_scaled.describe().loc[['min', 'max'], :]

In [None]:
# MinMax Scaler: scale the numeric columns and preview the first five rows.
df_minmax_scaled = scaler(df, numerical_features, scaler=MinMaxScaler())
df_minmax_scaled.head(n=5)

In [None]:
# Per-column minimum and maximum after min-max scaling.
print("MinMaxScaler ranges:")
df_minmax_scaled.describe().loc[['min', 'max'], :]

In [None]:
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, RobustScaler

# Robust Scaler: scale the numeric columns and preview the first five rows.
df_robust_scaled = scaler(df, numerical_features, scaler=RobustScaler())
df_robust_scaled.head(n=5)

In [None]:
# Per-column minimum and maximum after robust scaling.
print("RobustScaler ranges:")
df_robust_scaled.describe().loc[['min', 'max'], :]

In [None]:
# Quantile Transformer: transform the numeric columns and preview the first five rows.
df_quantile_scaled = scaler(df, numerical_features, scaler=QuantileTransformer())
df_quantile_scaled.head(n=5)

In [None]:
# Per-column minimum and maximum after the quantile transform.
print("QuantileTransformer ranges:")
df_quantile_scaled.describe().loc[['min', 'max'], :]

In [None]:
# Power Transformer: transform the numeric columns and preview the first five rows.
df_power_scaled = scaler(df, numerical_features, scaler=PowerTransformer())
df_power_scaled.head(n=5)

In [None]:
# Per-column minimum and maximum after the power transform.
print("PowerTransformer ranges:")
df_power_scaled.describe().loc[['min', 'max'], :]

**StandardScaler:**
- Centers the data to have a mean of 0 and a standard deviation of 1.
- Useful when the data follows a Gaussian distribution.
- Drawback: Sensitive to outliers.

**MinMaxScaler:**
- Scales the data to a fixed range, usually [0, 1].
- Useful when the data does not follow a Gaussian distribution.
- Drawback: Sensitive to outliers.

**RobustScaler:**
- Scales the data using statistics that are robust to outliers (median and IQR).
- Useful when the data contains outliers.

**QuantileTransformer:**
- Transforms the data to follow a uniform or normal distribution.
- Useful for non-Gaussian data.
- Drawback: Can distort correlations between features.

**PowerTransformer:**
- Applies a power transformation to make the data more Gaussian-like.
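- Useful for skewed distributions. The default Yeo-Johnson method (used above) also accepts zero and negative values.
- Drawback: The alternative Box-Cox method (`method='box-cox'`) requires strictly positive data.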
