## Data Load and Augmentation

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
# from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.utils import resample

# Load the dataset
file_path = 'Computer_Assisted_Mass_Appraisal_-_Residential.csv'
data = pd.read_csv(file_path)

# Identify categorical columns for One-Hot Encoding
categorical_features = ['HEAT', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN', 'EXTWALL', 'ROOF', 'INTWALL', 'USECODE']

# Apply One-Hot Encoding
onehot_encoder = ColumnTransformer([("onehot", OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_features)], remainder='passthrough')
data_encoded = onehot_encoder.fit_transform(data)

# Update column names for one-hot encoded features
encoded_feature_names = onehot_encoder.named_transformers_['onehot'].get_feature_names_out(categorical_features)
data_encoded_df = pd.DataFrame(data_encoded, columns=list(encoded_feature_names) + list(data.columns.drop(categorical_features)))

# Handle zero values in certain columns to prevent division errors in interaction features
data_encoded_df['LANDAREA'].replace(0, np.nan, inplace=True)
data_encoded_df['ROOMS'].replace(0, np.nan, inplace=True)

# Create interaction features
data_encoded_df['Rooms_Bathrooms'] = data_encoded_df['ROOMS'] * data_encoded_df['BATHRM']
data_encoded_df['Building_Density'] = data_encoded_df['GBA'] / data_encoded_df['LANDAREA']
data_encoded_df['Bedroom_Room_Ratio'] = data_encoded_df['BEDRM'] / data_encoded_df['ROOMS']

# Fill NaN values after division
data_encoded_df['Building_Density'].fillna(0, inplace=True)
data_encoded_df['Bedroom_Room_Ratio'].fillna(0, inplace=True)

# Convert SALEDATE to datetime format and derive date-based features
data_encoded_df['SALEDATE'] = pd.to_datetime(data_encoded_df['SALEDATE'])
data_encoded_df['Property_Age'] = 2023 - data_encoded_df['AYB']
data_encoded_df['Years_Since_Remodel'] = 2023 - data_encoded_df['YR_RMDL']
data_encoded_df['Years_Between_Built_and_Remodel'] = data_encoded_df['YR_RMDL'] - data_encoded_df['AYB']
data_encoded_df['Sale_Year'] = data_encoded_df['SALEDATE'].dt.year
data_encoded_df['Sale_Month'] = data_encoded_df['SALEDATE'].dt.month

# Define features for clustering (using numeric features only for simplicity)
features_for_clustering = ['ROOMS', 'BATHRM', 'LANDAREA', 'GBA', 'PRICE']
data_cluster = data_encoded_df[features_for_clustering].dropna()

# Preserve original index for later merge
data_cluster = data_cluster.reset_index()  # This adds the original index as a column

# Normalize features for clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_cluster[features_for_clustering])

# Apply K-Means clustering
n_clusters = 5  # Set an appropriate number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
data_cluster['Cluster'] = kmeans.fit_predict(data_scaled)

# Merge cluster labels back to original data
data_encoded_df = data_encoded_df.merge(data_cluster[['index', 'Cluster']], left_index=True, right_on='index', how='left')
data_encoded_df.drop(columns=['index'], inplace=True)

# Remove rows with NaN in Cluster column for SMOTE application
data_with_clusters = data_encoded_df.dropna(subset=['Cluster'])

# Placeholder for augmented data
augmented_data = pd.DataFrame()

# Apply random oversampling within each cluster
for cluster in data_with_clusters['Cluster'].unique():
    # Select data for the current cluster
    cluster_data = data_with_clusters[data_with_clusters['Cluster'] == cluster]
    
    # Set the desired size for oversampling (e.g., double the original size of each cluster)
    target_size = len(cluster_data) * 2
    
    # Perform random oversampling
    cluster_augmented = resample(cluster_data, replace=True, n_samples=target_size, random_state=42)
    
    # Append to augmented data
    augmented_data = pd.concat([augmented_data, cluster_augmented])

# Combine original data with augmented data
final_data = pd.concat([data_encoded_df, augmented_data], ignore_index=True)

# Check final dataset shape and display a sample
print("Original Data Shape (before encoding):", data.shape)
print("Data Encoded Shape:", data_encoded_df.shape)
print("Augmented Data Shape:", augmented_data.shape)
print("Final Data Shape:", final_data.shape)
final_data.head()
final_data.describe()



Original Data Shape (before encoding): (109034, 39)
Data Encoded Shape: (109034, 176)
Augmented Data Shape: (189314, 176)
Final Data Shape: (298348, 176)


Unnamed: 0,ROOMS,LANDAREA,Building_Density,Bedroom_Room_Ratio,Sale_Year,Sale_Month,Cluster
count,296126.0,298310.0,298348.0,298348.0,298348.0,298348.0,283971.0
mean,7.498886,3356.172287,0.742864,0.468994,2005.016424,6.18284,0.888629
std,2.33314,5673.380941,0.459735,0.134558,31.018729,3.537094,0.899906
min,1.0,1.0,0.0,0.0,1900.0,1.0,0.0
25%,6.0,1571.0,0.397614,0.4,2005.0,3.0,0.0
50%,7.0,2313.0,0.635209,0.5,2015.0,6.0,1.0
75%,8.0,4140.0,0.988235,0.5,2020.0,9.0,1.0
max,48.0,942632.0,4.988943,8.0,2024.0,12.0,4.0


## Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import stats

# Assuming final_data is the augmented dataset
data = final_data.copy()

# Step 1: Handle missing values
# Fill missing values in numeric columns with the median
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Fill missing values in categorical columns with the mode
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# Check that missing values have been handled
print("Missing values after processing:", data.isnull().sum().sum())

# Step 2: Standardize numeric features
# Standardize numeric features using StandardScaler
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Step 3: Detect and handle outliers
# Use Z-score method to detect outliers; replace values with NaN if Z-score > 3
z_scores = np.abs(stats.zscore(data[numeric_columns]))
data[numeric_columns] = np.where(z_scores > 3, np.nan, data[numeric_columns])
# Fill the NaN values resulting from outliers with the median
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Step 4: Split the dataset
# Assuming 'PRICE' is the target variable
X = data.drop(columns=['PRICE'])
y = data['PRICE']

# Ensure that only numeric columns are included in X for PCA
X_numeric = X.select_dtypes(include=[np.number])

# Split the data into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

# Check the shape of training and test sets
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

# Optional Step 5: Feature selection using PCA
# Use PCA to reduce features while retaining 95% of the variance
pca = PCA(n_components=0.99)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print("Original feature count:", X_train.shape[1])
print("Reduced feature count with PCA:", X_train_pca.shape[1])

Missing values after processing: 0
Training set shape: (238678, 162)
Test set shape: (59670, 162)
Original feature count: 162
Reduced feature count with PCA: 2
