In [None]:
import pandas as pd

# Load the dataset (you can adjust the file path)
file_path = 'Computer_Assisted_Mass_Appraisal_-_Residential.csv'
dataset = pd.read_csv(file_path)

# Drop columns with high percentage of missing values (like 'YR_RMDL')
dataset_cleaned = dataset.drop(columns=['YR_RMDL'])

# For numerical columns, fill missing values with the median (e.g., for 'PRICE', 'ROOMS', etc.)
dataset_cleaned['PRICE'].fillna(dataset_cleaned['PRICE'].median(), inplace=True)
dataset_cleaned['ROOMS'].fillna(dataset_cleaned['ROOMS'].median(), inplace=True)
dataset_cleaned['BEDRM'].fillna(dataset_cleaned['BEDRM'].median(), inplace=True)
dataset_cleaned['KITCHENS'].fillna(dataset_cleaned['KITCHENS'].median(), inplace=True)
dataset_cleaned['FIREPLACES'].fillna(dataset_cleaned['FIREPLACES'].median(), inplace=True)

# For categorical columns, fill missing values with the mode (e.g., for 'STYLE', 'GRADE', etc.)
dataset_cleaned['STYLE'].fillna(dataset_cleaned['STYLE'].mode()[0], inplace=True)
dataset_cleaned['GRADE'].fillna(dataset_cleaned['GRADE'].mode()[0], inplace=True)
dataset_cleaned['STRUCT'].fillna(dataset_cleaned['STRUCT'].mode()[0], inplace=True)

# You can continue this process for other columns as needed

# Save the cleaned dataset if needed
# dataset_cleaned.to_csv('cleaned_dataset.csv', index=False)

# Check if there are any remaining missing values
print(dataset_cleaned.isnull().sum())

In [5]:
dataset_cleaned.drop(columns=['BLDG_NUM', 'SALE_NUM', 'HEAT_D', 'STYLE_D', 'STRUCT_D', 'GRADE_D', 'CNDTN_D', 'EXTWALL_D', 'ROOF_D', 'INTWALL_D', 'GIS_LAST_MOD_DTTM', 'OBJECTID'], inplace=True)
dataset_cleaned.head()

Unnamed: 0,SSL,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,EYB,...,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA
0,0153 0069,4.0,1.0,8.0,Y,1.0,12.0,6.0,1911.0,1989,...,6.0,8.0,4.0,20.0,11.0,6.0,1.0,6.0,11,2104
1,0153 0094,3.0,1.0,1.0,Y,2.0,13.0,5.0,1912.0,1978,...,6.0,6.0,4.0,14.0,2.0,6.0,2.0,3.0,24,936
2,0153 0095,3.0,1.0,7.0,Y,2.0,6.0,4.0,1910.0,1993,...,7.0,6.0,4.0,14.0,6.0,6.0,2.0,2.0,24,936
3,0153 0096,3.0,1.0,7.0,Y,2.0,11.0,4.0,1912.0,1978,...,6.0,6.0,4.0,14.0,6.0,6.0,2.0,2.0,24,988
4,0153 0100,4.0,1.0,1.0,Y,3.0,11.0,5.0,1912.0,1993,...,7.0,6.0,5.0,14.0,2.0,6.0,3.0,4.0,24,1674


In [6]:
dataset_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109034 entries, 0 to 109033
Data columns (total 26 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   SSL         109034 non-null  object 
 1   BATHRM      107227 non-null  float64
 2   HF_BATHRM   107227 non-null  float64
 3   HEAT        107227 non-null  float64
 4   AC          107227 non-null  object 
 5   NUM_UNITS   107227 non-null  float64
 6   ROOMS       109034 non-null  float64
 7   BEDRM       109034 non-null  float64
 8   AYB         109010 non-null  float64
 9   EYB         109034 non-null  int64  
 10  STORIES     107059 non-null  float64
 11  SALEDATE    109034 non-null  object 
 12  PRICE       109034 non-null  float64
 13  QUALIFIED   109034 non-null  object 
 14  GBA         109034 non-null  int64  
 15  STYLE       109034 non-null  float64
 16  STRUCT      109034 non-null  float64
 17  GRADE       109034 non-null  float64
 18  CNDTN       107227 non-null  float64
 19  EX

In [7]:
dataset_cleaned.head()

Unnamed: 0,SSL,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,EYB,...,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA
0,0153 0069,4.0,1.0,8.0,Y,1.0,12.0,6.0,1911.0,1989,...,6.0,8.0,4.0,20.0,11.0,6.0,1.0,6.0,11,2104
1,0153 0094,3.0,1.0,1.0,Y,2.0,13.0,5.0,1912.0,1978,...,6.0,6.0,4.0,14.0,2.0,6.0,2.0,3.0,24,936
2,0153 0095,3.0,1.0,7.0,Y,2.0,6.0,4.0,1910.0,1993,...,7.0,6.0,4.0,14.0,6.0,6.0,2.0,2.0,24,936
3,0153 0096,3.0,1.0,7.0,Y,2.0,11.0,4.0,1912.0,1978,...,6.0,6.0,4.0,14.0,6.0,6.0,2.0,2.0,24,988
4,0153 0100,4.0,1.0,1.0,Y,3.0,11.0,5.0,1912.0,1993,...,7.0,6.0,5.0,14.0,2.0,6.0,3.0,4.0,24,1674


## One-Hot Encode Categorical Features

In [8]:
categorical_cols = dataset_cleaned.select_dtypes(include=['object', 'category']).columns

# Apply One-Hot Encoding to the categorical features
data_encoded = pd.get_dummies(dataset_cleaned, columns=categorical_cols)

# Display the encoded data
data_encoded.head()

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,NUM_UNITS,ROOMS,BEDRM,AYB,EYB,STORIES,PRICE,...,SALEDATE_2024/09/19 04:00:00+00,SALEDATE_2024/09/20 04:00:00+00,SALEDATE_2024/09/23 04:00:00+00,SALEDATE_2024/09/24 04:00:00+00,SALEDATE_2024/09/25 04:00:00+00,SALEDATE_2024/09/26 04:00:00+00,SALEDATE_2024/09/27 04:00:00+00,SALEDATE_2024/09/30 04:00:00+00,QUALIFIED_Q,QUALIFIED_U
0,4.0,1.0,8.0,1.0,12.0,6.0,1911.0,1989,3.75,3275000.0,...,False,False,False,False,False,False,False,False,True,False
1,3.0,1.0,1.0,2.0,13.0,5.0,1912.0,1978,3.0,550000.0,...,False,False,False,False,False,False,False,False,False,True
2,3.0,1.0,7.0,2.0,6.0,4.0,1910.0,1993,3.0,1700000.0,...,False,False,False,False,False,False,False,False,True,False
3,3.0,1.0,7.0,2.0,11.0,4.0,1912.0,1978,3.0,1500000.0,...,False,False,False,False,False,False,False,False,True,False
4,4.0,1.0,1.0,3.0,11.0,5.0,1912.0,1993,3.0,2232500.0,...,False,False,False,False,False,False,False,False,True,False
