In [16]:
import pandas as pd

# Load the dataset (you can adjust the file path)
file_path = 'Computer_Assisted_Mass_Appraisal_-_Residential.csv'
dataset = pd.read_csv(file_path)

# Drop columns with high percentage of missing values (like 'YR_RMDL')
dataset_cleaned = dataset.drop(columns=['YR_RMDL'])
dataset_cleaned = dataset_cleaned.dropna(subset=['PRICE'])
dataset_cleaned = dataset_cleaned[(dataset_cleaned['GBA'] > 0) & (dataset_cleaned['GBA'].notna()) & (dataset_cleaned['AC'].notna()) & (dataset_cleaned['PRICE'] > 0)]

# Encode AC Y/N to 1 and 0
dataset_cleaned['AC'] = dataset_cleaned['AC'].map({'Y': True, 'N': False})

# Drop rows with missing values in 'ROOMS', 'BEDRM', 'KITCHENS', and 'FIREPLACES'
columns_to_check = ['ROOMS', 'BEDRM', 'KITCHENS', 'FIREPLACES', 'AC', 'AYB', 'STORIES']
dataset_cleaned = dataset_cleaned.dropna(subset=columns_to_check)

# For categorical columns, fill missing values with the mode (e.g., for 'STYLE', 'GRADE', etc.)
dataset_cleaned['STYLE'].fillna(dataset_cleaned['STYLE'].mode()[0], inplace=True)
dataset_cleaned['GRADE'].fillna(dataset_cleaned['GRADE'].mode()[0], inplace=True)
dataset_cleaned['STRUCT'].fillna(dataset_cleaned['STRUCT'].mode()[0], inplace=True)

# You can continue this process for other columns as needed

# Save the cleaned dataset if needed
# dataset_cleaned.to_csv('cleaned_dataset.csv', index=False)

# Check if there are any remaining missing values
print(dataset_cleaned.isnull().sum())
data = dataset_cleaned

SSL                    0
BATHRM                 0
HF_BATHRM              0
HEAT                   0
HEAT_D                 0
AC                    20
NUM_UNITS              0
ROOMS                  0
BEDRM                  0
AYB                   11
EYB                    0
STORIES              120
SALEDATE               0
PRICE                  0
QUALIFIED              0
SALE_NUM               0
GBA                    0
BLDG_NUM               0
STYLE                  0
STYLE_D                0
STRUCT                 0
STRUCT_D               0
GRADE                  0
GRADE_D                0
CNDTN                  0
CNDTN_D                0
EXTWALL                0
EXTWALL_D              0
ROOF                   0
ROOF_D                 0
INTWALL                0
INTWALL_D              0
KITCHENS               0
FIREPLACES             0
USECODE                0
LANDAREA               0
GIS_LAST_MOD_DTTM      0
OBJECTID               0
dtype: int64


### Drop unnessary column

In [7]:
data.drop(columns=['SSL', 'BLDG_NUM', 'SALE_NUM', 'HEAT_D', 'STYLE_D', 'STRUCT_D', 'GRADE_D', 'CNDTN_D', 'EXTWALL_D', 'ROOF_D', 'INTWALL_D', 'GIS_LAST_MOD_DTTM', 'OBJECTID'], inplace=True)
data.head()

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,EYB,STORIES,...,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA
0,4.0,1.0,8.0,True,1.0,12.0,6.0,1911.0,1989,3.75,...,6.0,8.0,4.0,20.0,11.0,6.0,1.0,6.0,11,2104
1,3.0,1.0,1.0,True,2.0,13.0,5.0,1912.0,1978,3.0,...,6.0,6.0,4.0,14.0,2.0,6.0,2.0,3.0,24,936
2,3.0,1.0,7.0,True,2.0,6.0,4.0,1910.0,1993,3.0,...,7.0,6.0,4.0,14.0,6.0,6.0,2.0,2.0,24,936
3,3.0,1.0,7.0,True,2.0,11.0,4.0,1912.0,1978,3.0,...,6.0,6.0,4.0,14.0,6.0,6.0,2.0,2.0,24,988
4,4.0,1.0,1.0,True,3.0,11.0,5.0,1912.0,1993,3.0,...,7.0,6.0,5.0,14.0,2.0,6.0,3.0,4.0,24,1674


### Modify date column

In [8]:
data['SALEDATE'] = pd.to_datetime(data['SALEDATE'])

# Remove the time part, keeping only the date (this will keep Year, Month, Day)
data['SALEDATE'] = data['SALEDATE'].dt.date

# Optionally, you can extract specific components from the date for model training
data['SALE_YEAR'] = pd.DatetimeIndex(data['SALEDATE']).year
data['SALE_MONTH'] = pd.DatetimeIndex(data['SALEDATE']).month

# Drop the original 'SALEDATE' column if not needed
data = data.drop(columns=['SALEDATE'])

In [9]:
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 62320 entries, 0 to 109033
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   BATHRM      62320 non-null  float64
 1   HF_BATHRM   62320 non-null  float64
 2   HEAT        62320 non-null  float64
 3   AC          62299 non-null  object 
 4   NUM_UNITS   62320 non-null  float64
 5   ROOMS       62320 non-null  float64
 6   BEDRM       62320 non-null  float64
 7   AYB         62309 non-null  float64
 8   EYB         62320 non-null  int64  
 9   STORIES     62192 non-null  float64
 10  PRICE       62320 non-null  float64
 11  QUALIFIED   62320 non-null  object 
 12  GBA         62320 non-null  int64  
 13  STYLE       62320 non-null  float64
 14  STRUCT      62320 non-null  float64
 15  GRADE       62320 non-null  float64
 16  CNDTN       62320 non-null  float64
 17  EXTWALL     62320 non-null  float64
 18  ROOF        62320 non-null  float64
 19  INTWALL     62320 non-null  f

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,EYB,STORIES,...,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA,SALE_YEAR,SALE_MONTH
0,4.0,1.0,8.0,True,1.0,12.0,6.0,1911.0,1989,3.75,...,4.0,20.0,11.0,6.0,1.0,6.0,11,2104,2019,8
1,3.0,1.0,1.0,True,2.0,13.0,5.0,1912.0,1978,3.0,...,4.0,14.0,2.0,6.0,2.0,3.0,24,936,1999,8
2,3.0,1.0,7.0,True,2.0,6.0,4.0,1910.0,1993,3.0,...,4.0,14.0,6.0,6.0,2.0,2.0,24,936,2019,7
3,3.0,1.0,7.0,True,2.0,11.0,4.0,1912.0,1978,3.0,...,4.0,14.0,6.0,6.0,2.0,2.0,24,988,2021,10
4,4.0,1.0,1.0,True,3.0,11.0,5.0,1912.0,1993,3.0,...,5.0,14.0,2.0,6.0,3.0,4.0,24,1674,2023,4


In [10]:
data.columns

Index(['BATHRM', 'HF_BATHRM', 'HEAT', 'AC', 'NUM_UNITS', 'ROOMS', 'BEDRM',
       'AYB', 'EYB', 'STORIES', 'PRICE', 'QUALIFIED', 'GBA', 'STYLE', 'STRUCT',
       'GRADE', 'CNDTN', 'EXTWALL', 'ROOF', 'INTWALL', 'KITCHENS',
       'FIREPLACES', 'USECODE', 'LANDAREA', 'SALE_YEAR', 'SALE_MONTH'],
      dtype='object')

### Feature Selection and Engineering

In [11]:
import numpy as np

# Feature Selection
numerical_features = ['BATHRM', 'HF_BATHRM', 'ROOMS', 'BEDRM', 'AYB', 'EYB', 'STORIES', 'GBA', 'KITCHENS', 'FIREPLACES', 'LANDAREA', 'SALE_YEAR', 'SALE_MONTH']
categorical_features = ['HEAT', 'AC', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN', 'EXTWALL', 'ROOF', 'INTWALL', 'USECODE']

# Feature Engineering
data['AGE'] = data['SALE_YEAR'] - data['AYB']
data['TOTAL_ROOMS'] = data['ROOMS'] + data['BATHRM'] + data['HF_BATHRM']
data['PRICE_PER_SQFT'] = data['PRICE'] / data['GBA']

# Log transform of the target variable (often helps with price predictions)
data['LOG_PRICE'] = np.log(data['PRICE'])

# Update features list
numerical_features += ['AGE', 'TOTAL_ROOMS', 'PRICE_PER_SQFT']

In [12]:
data['PRICE_PER_SQFT']

0         484.109387
1         241.016652
2         843.253968
3         737.463127
4         840.866290
             ...    
109026    355.713651
109028    170.000000
109030    290.949336
109031    247.579530
109033    577.888134
Name: PRICE_PER_SQFT, Length: 62320, dtype: float64

In [13]:
data.to_csv('processed_dataset.csv', index=False)

### Categorical Variable Encoding

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode categorical variables
onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')
ct = ColumnTransformer([("onehot", onehot, categorical_features)], remainder='passthrough')
   
X = data[numerical_features + categorical_features]
y = data['LOG_PRICE']  # Using log-transformed price

X_encoded = ct.fit_transform(X)
feature_names = (ct.named_transformers_['onehot']
                    .get_feature_names(categorical_features)
                    .tolist() + numerical_features)
X_encoded = pd.DataFrame(X_encoded, columns=feature_names)



AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names'

### Feature Scaling

In [19]:
import numpy as np

# Check for infinite values
inf_columns = X_encoded.columns[np.isinf(X_encoded).any()].tolist()
print("Columns with infinite values:", inf_columns)

# Check for very large values
large_columns = X_encoded.columns[(np.abs(X_encoded) > 1e300).any()].tolist()
print("Columns with very large values:", large_columns)

# Display some statistics for these columns
if inf_columns or large_columns:
    problem_columns = set(inf_columns + large_columns)
    for col in problem_columns:
        print(f"\nStatistics for {col}:")
        print(X_encoded[col].describe())

Columns with infinite values: ['PRICE_PER_SQFT']
Columns with very large values: ['PRICE_PER_SQFT']

Statistics for PRICE_PER_SQFT:
count    9.613400e+04
mean              inf
std               NaN
min      0.000000e+00
25%      0.000000e+00
50%      2.185000e+02
75%      4.779412e+02
max               inf
Name: PRICE_PER_SQFT, dtype: float64


In [20]:
# Method 1: Replace infinite values with NaN and then with a large number
X_encoded = X_encoded.replace([np.inf, -np.inf], np.nan)
X_encoded = X_encoded.fillna(X_encoded.max() * 2)  # or another suitable large value

# Method 2: Clip values to a reasonable range
# X_encoded = X_encoded.clip(lower=-1e300, upper=1e300)

# Method 3: Remove rows with problematic values (use cautiously as it may remove a lot of data)
# X_encoded = X_encoded.replace([np.inf, -np.inf], np.nan).dropna()

# After handling, check again for any remaining issues
print("Columns with infinite values after handling:", X_encoded.columns[np.isinf(X_encoded).any()].tolist())
print("Columns with very large values after handling:", X_encoded.columns[(np.abs(X_encoded) > 1e300).any()].tolist())

Columns with infinite values after handling: []
Columns with very large values after handling: []


In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
X_scaled = pd.DataFrame(X_scaled, columns=X_encoded.columns)

print("Standardization completed successfully.")
print(X_scaled.describe())

Standardization completed successfully.
           HEAT_0.0      HEAT_1.0      HEAT_2.0      HEAT_3.0      HEAT_4.0  \
count  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04   
mean  -2.917762e-15 -2.853788e-14 -7.660121e-15 -2.532476e-15  2.900936e-15   
std    1.000005e+00  1.000005e+00  1.000005e+00  1.000005e+00  1.000005e+00   
min   -1.901579e-02 -7.560838e-01 -2.631416e-02 -4.081089e-02 -2.551605e-02   
25%   -1.901579e-02 -7.560838e-01 -2.631416e-02 -4.081089e-02 -2.551605e-02   
50%   -1.901579e-02 -7.560838e-01 -2.631416e-02 -4.081089e-02 -2.551605e-02   
75%   -1.901579e-02  1.322605e+00 -2.631416e-02 -4.081089e-02 -2.551605e-02   
max    5.258789e+01  1.322605e+00  3.800236e+01  2.450326e+01  3.919103e+01   

           HEAT_5.0      HEAT_6.0      HEAT_7.0      HEAT_8.0      HEAT_9.0  \
count  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04   
mean  -8.822975e-16 -2.523788e-15 -4.037938e-14 -4.417981e-14  3.553850e-15   
std    1.00

In [23]:
# First, let's check if 'PRICE_PER_SQFT' is in X_encoded
if 'PRICE_PER_SQFT' in X_encoded.columns:
    print("Current statistics for PRICE_PER_SQFT:")
    print(X_encoded['PRICE_PER_SQFT'].describe())
    
    # Remove the current 'PRICE_PER_SQFT' column
    X_encoded = X_encoded.drop('PRICE_PER_SQFT', axis=1)

# Now, let's recalculate PRICE_PER_SQFT using the original data DataFrame
if 'PRICE' in data.columns and 'GBA' in data.columns:
    data['PRICE_PER_SQFT'] = data['PRICE'] / data['GBA'].replace(0, np.nan)
    data['PRICE_PER_SQFT'] = data['PRICE_PER_SQFT'].fillna(data['PRICE_PER_SQFT'].median())
    
    # Add the recalculated PRICE_PER_SQFT to X_encoded
    X_encoded['PRICE_PER_SQFT'] = data['PRICE_PER_SQFT']
    
    print("\nRecalculated statistics for PRICE_PER_SQFT:")
    print(X_encoded['PRICE_PER_SQFT'].describe())
else:
    print("Error: 'PRICE' or 'GBA' column not found in the original data DataFrame")

# Check for infinite or very large values
inf_check = np.isinf(X_encoded['PRICE_PER_SQFT']).sum()
large_check = (np.abs(X_encoded['PRICE_PER_SQFT']) > 1e300).sum()

print(f"\nNumber of infinite values in PRICE_PER_SQFT: {inf_check}")
print(f"Number of very large values (>1e300) in PRICE_PER_SQFT: {large_check}")

# If there are still issues, we can apply a cap
if inf_check > 0 or large_check > 0:
    upper_limit = X_encoded['PRICE_PER_SQFT'].quantile(0.99)  # 99th percentile
    X_encoded['PRICE_PER_SQFT'] = X_encoded['PRICE_PER_SQFT'].clip(upper=upper_limit)
    
    print("\nAfter applying cap:")
    print(X_encoded['PRICE_PER_SQFT'].describe())

Current statistics for PRICE_PER_SQFT:
count    96827.000000
mean      1244.313753
std       7032.506619
min          0.000000
25%          0.000000
50%        222.148148
75%        484.400067
max      52154.195011
Name: PRICE_PER_SQFT, dtype: float64

Recalculated statistics for PRICE_PER_SQFT:
count    86605.000000
mean       287.148722
std        345.125259
min          0.000000
25%          0.000000
50%        219.594595
75%        491.129032
max      26077.097506
Name: PRICE_PER_SQFT, dtype: float64

Number of infinite values in PRICE_PER_SQFT: 0
Number of very large values (>1e300) in PRICE_PER_SQFT: 0


In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
X_scaled = pd.DataFrame(X_scaled, columns=X_encoded.columns)

print("Standardization completed successfully.")
print(X_scaled.describe())

Standardization completed successfully.
           HEAT_0.0      HEAT_1.0      HEAT_2.0      HEAT_3.0      HEAT_4.0  \
count  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04   
mean  -2.917762e-15 -2.853788e-14 -7.660121e-15 -2.532476e-15  2.900936e-15   
std    1.000005e+00  1.000005e+00  1.000005e+00  1.000005e+00  1.000005e+00   
min   -1.901579e-02 -7.560838e-01 -2.631416e-02 -4.081089e-02 -2.551605e-02   
25%   -1.901579e-02 -7.560838e-01 -2.631416e-02 -4.081089e-02 -2.551605e-02   
50%   -1.901579e-02 -7.560838e-01 -2.631416e-02 -4.081089e-02 -2.551605e-02   
75%   -1.901579e-02  1.322605e+00 -2.631416e-02 -4.081089e-02 -2.551605e-02   
max    5.258789e+01  1.322605e+00  3.800236e+01  2.450326e+01  3.919103e+01   

           HEAT_5.0      HEAT_6.0      HEAT_7.0      HEAT_8.0      HEAT_9.0  \
count  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04  9.682700e+04   
mean  -8.822975e-16 -2.523788e-15 -4.037938e-14 -4.417981e-14  3.553850e-15   
std    1.00

### Data Splitting

In [25]:
from sklearn.model_selection import train_test_split

# Assuming y is your target variable (house prices)
y = data['PRICE']  # Make sure this is the correct column name for your target variable

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (77461, 150)
Testing set shape: (19366, 150)


### Model Training and Evaluation

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

print("Models have been trained successfully.")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').