In [None]:
import pandas as pd

# Load the dataset (you can adjust the file path)
file_path = 'Computer_Assisted_Mass_Appraisal_-_Residential.csv'
dataset = pd.read_csv(file_path)

# Drop columns with high percentage of missing values (like 'YR_RMDL')
dataset_cleaned = dataset.drop(columns=['YR_RMDL'])

# For numerical columns, fill missing values with the median (e.g., for 'PRICE', 'ROOMS', etc.)
dataset_cleaned['PRICE'].fillna(dataset_cleaned['PRICE'].median(), inplace=True)
dataset_cleaned['ROOMS'].fillna(dataset_cleaned['ROOMS'].median(), inplace=True)
dataset_cleaned['BEDRM'].fillna(dataset_cleaned['BEDRM'].median(), inplace=True)
dataset_cleaned['KITCHENS'].fillna(dataset_cleaned['KITCHENS'].median(), inplace=True)
dataset_cleaned['FIREPLACES'].fillna(dataset_cleaned['FIREPLACES'].median(), inplace=True)

# For categorical columns, fill missing values with the mode (e.g., for 'STYLE', 'GRADE', etc.)
dataset_cleaned['STYLE'].fillna(dataset_cleaned['STYLE'].mode()[0], inplace=True)
dataset_cleaned['GRADE'].fillna(dataset_cleaned['GRADE'].mode()[0], inplace=True)
dataset_cleaned['STRUCT'].fillna(dataset_cleaned['STRUCT'].mode()[0], inplace=True)

# You can continue this process for other columns as needed

# Save the cleaned dataset if needed
dataset_cleaned.to_csv('cleaned_dataset.csv', index=False)

# Check if there are any remaining missing values
print(dataset_cleaned.isnull().sum())

In [3]:
dataset_cleaned.head()

Unnamed: 0,SSL,BATHRM,HF_BATHRM,HEAT,HEAT_D,AC,NUM_UNITS,ROOMS,BEDRM,AYB,...,ROOF,ROOF_D,INTWALL,INTWALL_D,KITCHENS,FIREPLACES,USECODE,LANDAREA,GIS_LAST_MOD_DTTM,OBJECTID
0,0153 0069,4.0,1.0,8.0,Ht Pump,Y,1.0,12.0,6.0,1911.0,...,11.0,Slate,6.0,Hardwood,1.0,6.0,11,2104,2024/10/09 09:16:46+00,134166027
1,0153 0094,3.0,1.0,1.0,Forced Air,Y,2.0,13.0,5.0,1912.0,...,2.0,Built Up,6.0,Hardwood,2.0,3.0,24,936,2024/10/09 09:16:46+00,134166028
2,0153 0095,3.0,1.0,7.0,Warm Cool,Y,2.0,6.0,4.0,1910.0,...,6.0,Metal- Sms,6.0,Hardwood,2.0,2.0,24,936,2024/10/09 09:16:46+00,134166029
3,0153 0096,3.0,1.0,7.0,Warm Cool,Y,2.0,11.0,4.0,1912.0,...,6.0,Metal- Sms,6.0,Hardwood,2.0,2.0,24,988,2024/10/09 09:16:46+00,134166030
4,0153 0100,4.0,1.0,1.0,Forced Air,Y,3.0,11.0,5.0,1912.0,...,2.0,Built Up,6.0,Hardwood,3.0,4.0,24,1674,2024/10/09 09:16:46+00,134166031


## One-Hot Encode Categorical Features

In [4]:
categorical_cols = dataset_cleaned.select_dtypes(include=['object', 'category']).columns

# Apply One-Hot Encoding to the categorical features
data_encoded = pd.get_dummies(dataset_cleaned, columns=categorical_cols)

# Display the encoded data
data_encoded.head()

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,NUM_UNITS,ROOMS,BEDRM,AYB,EYB,STORIES,PRICE,...,INTWALL_D_Hardwood/Carp,INTWALL_D_Lt Concrete,INTWALL_D_No Data,INTWALL_D_Parquet,INTWALL_D_Resiliant,INTWALL_D_Terrazo,INTWALL_D_Vinyl Comp,INTWALL_D_Vinyl Sheet,INTWALL_D_Wood Floor,GIS_LAST_MOD_DTTM_2024/10/09 09:16:46+00
0,4.0,1.0,8.0,1.0,12.0,6.0,1911.0,1989,3.75,3275000.0,...,False,False,False,False,False,False,False,False,False,True
1,3.0,1.0,1.0,2.0,13.0,5.0,1912.0,1978,3.0,550000.0,...,False,False,False,False,False,False,False,False,False,True
2,3.0,1.0,7.0,2.0,6.0,4.0,1910.0,1993,3.0,1700000.0,...,False,False,False,False,False,False,False,False,False,True
3,3.0,1.0,7.0,2.0,11.0,4.0,1912.0,1978,3.0,1500000.0,...,False,False,False,False,False,False,False,False,False,True
4,4.0,1.0,1.0,3.0,11.0,5.0,1912.0,1993,3.0,2232500.0,...,False,False,False,False,False,False,False,False,False,True
