## Data preparation HW
### Group: 22_LR_JA
### Author: Yauheni Chekan

In [98]:
import warnings

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Render every float in DataFrame/Series output with four decimal places.
pd.options.display.float_format = "{:.4f}".format

# Suppress all warnings for the remainder of the session.
warnings.simplefilter("ignore")

In [99]:
# Load the building-permits table; the path is relative to the working directory.
permits_csv_path = "datasets/Building_Permits.csv"
df = pd.read_csv(permits_csv_path)

In [100]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Existing Construction Type,Existing Construction Type Description,Proposed Construction Type,Proposed Construction Type Description,Site Permit,Supervisor District,Neighborhoods - Analysis Boundaries,Zipcode,Location,Record ID
0,201505065519,4,sign - erect,05/06/2015,326,23,140,,Ellis,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.785719256680785, -122.40852313194863)",1380611233945
1,201604195146,4,sign - erect,04/19/2016,306,7,440,,Geary,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.78733980600732, -122.41063199757738)",1420164406718
2,201605278609,3,additions alterations or repairs,05/27/2016,595,203,1647,,Pacific,Av,...,1.0,constr type 1,1.0,constr type 1,,3.0,Russian Hill,94109.0,"(37.7946573324287, -122.42232562979227)",1424856504716
3,201611072166,8,otc alterations permit,11/07/2016,156,11,1230,,Pacific,Av,...,5.0,wood frame (5),5.0,wood frame (5),,3.0,Nob Hill,94109.0,"(37.79595867909168, -122.41557405519474)",1443574295566
4,201611283529,6,demolitions,11/28/2016,342,1,950,,Market,St,...,3.0,constr type 3,,,,6.0,Tenderloin,94102.0,"(37.78315261897309, -122.40950883997789)",144548169992


### Describe the dataframe:

In [101]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Permit Type,198900.0,7.5223,1.4575,1.0,8.0,8.0,8.0,8.0
Street Number,198900.0,1121.7289,1135.7689,0.0,235.0,710.0,1700.0,8400.0
Unit,29479.0,78.5172,326.9813,0.0,0.0,0.0,1.0,6004.0
Number of Existing Stories,156116.0,5.7058,8.6135,0.0,2.0,3.0,4.0,78.0
Number of Proposed Stories,156032.0,5.745,8.6133,0.0,2.0,3.0,4.0,78.0
Estimated Cost,160834.0,168955.4433,3630385.9537,1.0,3300.0,11000.0,35000.0,537958646.0
Revised Cost,192834.0,132856.1865,3584902.592,0.0,1.0,7000.0,28707.5,780500000.0
Existing Units,147362.0,15.6662,74.4763,0.0,1.0,1.0,4.0,1907.0
Proposed Units,147989.0,16.511,75.2204,0.0,1.0,2.0,4.0,1911.0
Plansets,161591.0,1.2747,22.4073,0.0,0.0,2.0,2.0,9000.0


### Missing values in the "Existing Use" column can be imputed with the mode.

Mode for 'Existing Use' column is `1 family dwelling`

In [102]:
# Series.mode() returns the most frequent value(s); report the first one.
existing_use_mode = df["Existing Use"].mode().iloc[0]
print(f"Mode for 'Existing Use' column: '{existing_use_mode}'")


Mode for 'Existing Use' column: '1 family dwelling'


### Determine the number of missing values in the "Current Status" field:

Amount of missing values in 'Current Status' field: 0 (0.00%)

In [103]:
# Count the NaN entries in "Current Status" and express them as a share of all rows.
current_status_is_missing = df["Current Status"].isna()
missing_values = current_status_is_missing.sum()
missing_percentage = missing_values / len(df) * 100
print(
    f"Amount of missing values in 'Current Status' field: {missing_values} ({missing_percentage:.2f}%)"
)


Amount of missing values in 'Current Status' field: 0 (0.00%)


### A boxplot can be constructed for a numerical feature.

Features that can be plotted:
- "Revised Cost": True
- "Proposed Units": True
- "Existing Construction Type": True

In [104]:
boxplot_features = ["Revised Cost", "Existing Construction Type", "Proposed Units"]

# A boxplot needs numeric data. Testing the dtype family, rather than comparing
# against the literal names 'float64'/'int64', also accepts int32, float32 and
# pandas nullable numeric dtypes. Booleans are excluded explicitly because pandas
# classifies them as numeric.
# NOTE(review): a numeric dtype is necessary but not sufficient -- "Existing
# Construction Type" looks like a category code stored as float; confirm that a
# boxplot is meaningful for it.
for feature in boxplot_features:
    feature_dtype = df[feature].dtype
    can_be_plotted = pd.api.types.is_numeric_dtype(feature_dtype) and not pd.api.types.is_bool_dtype(feature_dtype)
    print(f"Feature: {feature}, Type: {feature_dtype}, Can be plotted: {can_be_plotted}")

Feature: Revised Cost, Type: float64, Can be plotted: True
Feature: Existing Construction Type, Type: float64, Can be plotted: True
Feature: Proposed Units, Type: float64, Can be plotted: True


### One-hot encoding can be applied to categorical features with a relatively small number of unique values (we use at most 5 unique values as the threshold).

Features that can be one-hot encoded:
- "Description": Can be one-hot encoded: False
- "Street Name": Can be one-hot encoded: False
- "Existing Construction Type Description": Can be one-hot encoded: True

In [105]:
ohe_features = ["Description", "Street Name", "Existing Construction Type Description"]

# Largest number of distinct categories for which one-hot encoding is still
# considered acceptable. Defined once, outside the loop, because it does not
# depend on the feature being inspected.
ohe_thld = 5

for feature in ohe_features:
    # nunique() ignores NaN, so missing values are not counted as a category.
    unique_values = df[feature].nunique()
    print(f"Feature: {feature}, Unique values: {unique_values}, Can be one-hot encoded: {unique_values <= ohe_thld}")


Feature: Description, Unique values: 134272, Can be one-hot encoded: False
Feature: Street Name, Unique values: 1704, Can be one-hot encoded: False
Feature: Existing Construction Type Description, Unique values: 5, Can be one-hot encoded: True


### Missing values in the "Proposed Construction Type" feature can be filled with the median.

Median for "Proposed Construction Type" is: `5.0`

In [106]:
# Median of the "Proposed Construction Type" column.
proposed_construction_type = df["Proposed Construction Type"]
median = proposed_construction_type.median()
print(f"Median value for 'Proposed Construction Type' feature is: {median}")

Median value for 'Proposed Construction Type' feature is: 5.0


### Let's apply RobustScaler to the "Proposed Units" feature and compare the median before and after scaling

Original median: `2.0`

RobustScaler transformed median: `0.0`

In [107]:
# Scale "Proposed Units" with a RobustScaler and compare the medians.
robust_scaler = RobustScaler()

# Select the column as a one-column frame so the array passed to the scaler is 2-D.
proposed_units = df[["Proposed Units"]].to_numpy()
proposed_units_scaled = robust_scaler.fit_transform(proposed_units)

# Flatten the scaled column and wrap it in a Series to compute its median.
proposed_units_scaled_series = pd.Series(proposed_units_scaled.ravel())
scaled_median = proposed_units_scaled_series.median()

original_units_median = df["Proposed Units"].median()
print(f"Original median of 'Proposed Units': {original_units_median}")
print(f"Median after RobustScaler transformation: {scaled_median}")



Original median of 'Proposed Units': 2.0
Median after RobustScaler transformation: 0.0


### Let's apply MinMaxScaler to the "Existing Construction Type" feature and compare the median before and after scaling

Original median: `5.0`

MinMaxScaler transformed median: `1.0`

In [108]:
# Scale "Existing Construction Type" with a MinMaxScaler and compare the medians.
min_max_scaler = MinMaxScaler()

# Median of the untouched column, taken before any scaling.
original_median = df["Existing Construction Type"].median()

# Select the column as a one-column frame so the array passed to the scaler is 2-D.
existing_construction_type = df[["Existing Construction Type"]].to_numpy()
existing_construction_type_scaled = min_max_scaler.fit_transform(existing_construction_type)

# Flatten the scaled column and wrap it in a Series to compute its median.
existing_construction_type_scaled_series = pd.Series(existing_construction_type_scaled.ravel())
scaled_median = existing_construction_type_scaled_series.median()

print(f"Original median of 'Existing Construction Type': {original_median}")
print(f"Median after MinMaxScaler transformation: {scaled_median}")


Original median of 'Existing Construction Type': 5.0
Median after MinMaxScaler transformation: 1.0


### The "Street Name" feature contains `1704` unique values, so it is not a good candidate for one-hot encoding (OHE).

In [109]:
# Count distinct street names with nunique(), which skips NaN. len(unique())
# would count NaN as one more "street name" if the column had missing values,
# and would then disagree with the nunique()-based count in the one-hot-encoding check.
unique_street_names = df["Street Name"].nunique()
print(f"Number of unique street names: {unique_street_names}")


Number of unique street names: 1704


#### The best strategy for dealing with missing values in the "Street Number Suffix" column is to drop the column entirely.

*Rationale*:
- The column carries almost no information (only 1.11% of its values are present)
- Any imputation would be extremely unreliable
- The column likely provides very little predictive value
- Keeping it would mostly introduce noise

In [110]:
# Share of rows in which "Street Number Suffix" is NaN, expressed as a percentage.
suffix_is_missing = df["Street Number Suffix"].isna()
missing_values = suffix_is_missing.sum()
missing_percent = missing_values / len(df) * 100

print(f"'Street Number Suffix' column has {missing_percent:.2f}% of missing data.")

'Street Number Suffix' column has 98.89% of missing data.


#### The "Existing Use" feature has 93 unique values which do not seem to have any natural ordering, so we should not apply LabelEncoder to it.

In [111]:
# Distinct "Existing Use" values as returned by Series.unique() (NaN is included).
existing_use = df['Existing Use']
unique_uses = existing_use.unique()
print(unique_uses)

['tourist hotel/motel' 'retail sales' '1 family dwelling' 'apartments' nan
 '2 family dwelling' 'church' 'storage shed' 'office' 'vacant lot'
 'food/beverage hndlng' 'residential hotel' 'filling/service stn'
 'workshop commercial' 'clinics-medic/dental' 'misc group residns.'
 'hospital' 'club' 'barber/beauty salon' 'warehouse,no frnitur' 'school'
 'artist live/work' 'manufacturing' 'garment shops' 'public assmbly other'
 'auto repairs' 'lending institution' 'museum' 'warehouse, furniture'
 'prkng garage/private' 'antenna' 'health studios & gym' 'massage parlor'
 'printing plant' 'parking lot' 'workshop residential' 'power plant'
 'tower' 'mortuary' 'animal sale or care' 'laundry/laundromat' 'nite club'
 'paint store' 'recreation bldg' 'theater' 'prkng garage/public' 'sign'
 'phone xchnge/equip' 'dance hall' 'sfpd or sffd station' 'storage tanks'
 'muni carbarn' 'stadium' 'automobile sales' 'fence/retaining wall'
 'radio & tv stations' 'social care facility' 'amusement center'
 'day car