## Import Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## Data Load and Augmentation

In [None]:
# Load the dataset
file_path = 'Computer_Assisted_Mass_Appraisal_-_Residential.csv'
data = pd.read_csv(file_path)

# Year used both as the upper bound for plausible year values and as the
# reference point for the age-style features below.
REFERENCE_YEAR = 2023

# Identify categorical columns for One-Hot Encoding
categorical_features = ['HEAT', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN', 'EXTWALL', 'ROOF', 'INTWALL', 'USECODE']

# Apply One-Hot Encoding. The encoded columns come first in the output,
# followed by every remaining column passed through unchanged.
onehot_encoder = ColumnTransformer([("onehot", OneHotEncoder(sparse_output=False, handle_unknown='ignore', dtype=int), categorical_features)], remainder='passthrough')
data_encoded = onehot_encoder.fit_transform(data)

# Update column names for one-hot encoded features
encoded_feature_names = onehot_encoder.named_transformers_['onehot'].get_feature_names_out(categorical_features)
data_encoded_df = pd.DataFrame(data_encoded, columns=list(encoded_feature_names) + list(data.columns.drop(categorical_features)))

# The transformer stacks everything into a single NumPy array. Because string
# columns (e.g. SALEDATE) are passed through, that array has dtype=object and
# every column of the rebuilt frame would be object-typed, so later
# select_dtypes(include=[np.number]) calls would skip the real features.
# infer_objects() restores proper numeric dtypes wherever the values allow it.
data_encoded_df = data_encoded_df.infer_objects()

# Handle zero values in certain columns to prevent division errors in interaction features
data_encoded_df = data_encoded_df.replace({'LANDAREA': {0: np.nan}, 'ROOMS': {0: np.nan}})

# Create interaction features
data_encoded_df['Rooms_Bathrooms'] = data_encoded_df['ROOMS'] * data_encoded_df['BATHRM']
data_encoded_df['Building_Density'] = data_encoded_df['GBA'] / data_encoded_df['LANDAREA']
data_encoded_df['Bedroom_Room_Ratio'] = data_encoded_df['BEDRM'] / data_encoded_df['ROOMS']

# Fill NaN values after division
data_encoded_df = data_encoded_df.fillna({'Building_Density': 0, 'Bedroom_Room_Ratio': 0})

# Handle invalid values: keep only rows whose year columns are plausible and
# whose building/land areas are positive. Comparisons with NaN are False, so
# rows missing any of these values are dropped as well.
has_valid_values = (
    data_encoded_df['YR_RMDL'].between(1000, REFERENCE_YEAR)
    & data_encoded_df['AYB'].between(1000, REFERENCE_YEAR)
    & data_encoded_df['EYB'].between(1000, REFERENCE_YEAR)
    & (data_encoded_df['GBA'] > 0)
    & (data_encoded_df['LANDAREA'] > 0)
)
# .copy() detaches the result from the unfiltered frame so the column
# assignments below do not trigger SettingWithCopyWarning.
data_encoded_df = data_encoded_df[has_valid_values].copy()


# Convert SALEDATE to datetime format and derive date-based features
data_encoded_df['SALEDATE'] = pd.to_datetime(data_encoded_df['SALEDATE'])
data_encoded_df['Property_Age'] = REFERENCE_YEAR - data_encoded_df['AYB']
data_encoded_df['Years_Since_Remodel'] = REFERENCE_YEAR - data_encoded_df['YR_RMDL']
data_encoded_df['Years_Between_Built_and_Remodel'] = data_encoded_df['YR_RMDL'] - data_encoded_df['AYB']
data_encoded_df['Sale_Year'] = data_encoded_df['SALEDATE'].dt.year
data_encoded_df['Sale_Month'] = data_encoded_df['SALEDATE'].dt.month

### Expand data by clustering

In [None]:
# Define features for clustering (using numeric features only for simplicity)
# NOTE(review): PRICE is also the prediction target further down, so the
# Cluster label derived here carries target information - confirm intended.
features_for_clustering = ['ROOMS', 'BATHRM', 'LANDAREA', 'GBA', 'PRICE']
data_cluster = data_encoded_df[features_for_clustering].dropna()

# Normalize features for clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_cluster[features_for_clustering])

# Apply K-Means clustering
n_clusters = 5  # Set an appropriate number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)

# Keep the labels keyed by the original row index so they can be attached to
# the full frame by index alignment. This replaces a reset_index/merge
# round-trip that left the merged frame with a scrambled index.
cluster_labels = pd.Series(kmeans.fit_predict(data_scaled), index=data_cluster.index, name='Cluster')
data_cluster['Cluster'] = cluster_labels

# Attach cluster labels to the full data; rows that were excluded from
# clustering (missing clustering features) get NaN.
data_encoded_df['Cluster'] = cluster_labels

# Only rows that received a cluster label take part in the oversampling
data_with_clusters = data_encoded_df.dropna(subset=['Cluster'])

# Apply random oversampling (sampling with replacement) within each cluster
cluster_samples = []
for cluster in data_with_clusters['Cluster'].unique():
    # Select data for the current cluster
    cluster_data = data_with_clusters[data_with_clusters['Cluster'] == cluster]

    # Set the desired size for oversampling (e.g., double the original size of each cluster)
    target_size = len(cluster_data) * 2

    # Perform random oversampling
    cluster_samples.append(resample(cluster_data, replace=True, n_samples=target_size, random_state=42))

# Concatenate once instead of growing a DataFrame inside the loop
augmented_data = pd.concat(cluster_samples) if cluster_samples else pd.DataFrame()

# Combine original data with augmented data
# NOTE(review): the resampled rows are exact duplicates of original rows, so a
# later random train/test split can place copies of the same row on both sides.
final_data = pd.concat([data_encoded_df, augmented_data], ignore_index=True)

# Check final dataset shape and display a sample
print("Original Data Shape (before encoding):", data.shape)
print("Data Encoded Shape:", data_encoded_df.shape)
print("Augmented Data Shape:", augmented_data.shape)
print("Final Data Shape:", final_data.shape)
final_data.head()
final_data.describe()

## ---  Below Unused ---

## Data Preprocessing

In [None]:
# Work on a copy so final_data itself is left untouched.
data = final_data.copy()

# Step 1: Impute missing values
# Numeric columns receive their column median.
numeric_columns = data.select_dtypes(include=[np.number]).columns
numeric_medians = data[numeric_columns].median()
data[numeric_columns] = data[numeric_columns].fillna(numeric_medians)

# Non-numeric columns receive their most frequent value (first mode).
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
most_frequent_values = data[categorical_columns].mode().iloc[0]
data[categorical_columns] = data[categorical_columns].fillna(most_frequent_values)

# Report how many missing values remain in the whole frame.
remaining_missing = data.isnull().sum().sum()
print("Missing values after processing:", remaining_missing)

# Step 2: Standardize all numeric columns (zero mean, unit variance)
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[numeric_columns])
data[numeric_columns] = standardized_values

# Step 3: Detect and handle outliers (currently disabled)
# # Use Z-score method to detect outliers; replace values with NaN if Z-score > 3
# z_scores = np.abs(stats.zscore(data[numeric_columns]))
# data[numeric_columns] = np.where(z_scores > 3, np.nan, data[numeric_columns])
# # Fill the NaN values resulting from outliers with the median
# data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Step 4: Separate target and features ('PRICE' is taken as the target)
y = data['PRICE']
X = data.drop(columns=['PRICE'])

# Model training only uses the numeric feature columns.
X_numeric = X.select_dtypes(include=[np.number])

# 80% training / 20% test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes.
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


## Handling Problematic Data

In [None]:
# Descriptive statistics per numeric feature, one feature per row, including
# the quartiles (25%, 50%, 75%) and the interquartile range derived from them.
numeric_data = data.drop(columns=categorical_columns)
numeric_summary = numeric_data.describe(percentiles=[0.25, 0.5, 0.75]).transpose()
numeric_summary['IQR'] = numeric_summary['75%'].sub(numeric_summary['25%'])

print("\nNumeric Features Summary (including Q1, Q2, Q3, IQR):\n", numeric_summary)

# Share of extreme values per feature: the percentage of non-missing entries
# whose absolute Z-score is greater than 3.
outliers_percentage = {
    feature: (np.abs(stats.zscore(numeric_data[feature].dropna())) > 3).mean() * 100
    for feature in numeric_data.columns
}

outliers_percentage_df = pd.DataFrame.from_dict(outliers_percentage, orient='index', columns=['Outliers (%)'])
print("\nPercentage of Outliers in Each Numeric Feature:\n", outliers_percentage_df)


In [None]:
# Print the distinct values of the one-hot encoded columns. A column is
# selected when its name contains one of these markers (extend the tuple with
# the prefixes of any other encoded feature you want to inspect).
one_hot_markers = ('HEAT_', 'STYLE_', 'STRUCT_')
one_hot_columns = [
    name for name in data.columns
    if any(marker in name for marker in one_hot_markers)
]
for name in one_hot_columns:
    print(f"{name} unique values:", data[name].unique())

In [None]:
# Treat negative values in the year-difference features as invalid and
# replace them with the median of the remaining (non-negative) values.
# NOTE(review): an earlier cell standardizes every numeric column of `data`;
# if that cell has run, "negative" here means "below the mean" rather than a
# negative number of years - confirm this is applied to unscaled values.
for year_feature in ('Years_Since_Remodel', 'Years_Between_Built_and_Remodel'):
    # Vectorised equivalent of mapping x -> x if x >= 0 else NaN
    non_negative = data[year_feature].where(data[year_feature] >= 0)
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is deprecated and does not
    # update the frame under pandas Copy-on-Write.
    data[year_feature] = non_negative.fillna(non_negative.median())


In [None]:
# Show the distinct values of a few columns for a quick sanity check.
for inspected_column in ('Sale_Month', 'Cluster', 'Years_Since_Remodel'):
    print(f"Unique values in {inspected_column}:", data[inspected_column].unique())


## --- Above Unused ---

In [None]:
# Inspection only: when run as a notebook cell, just the last bare expression
# is rendered, so the first line produces no visible output.
data[categorical_columns]
data

In [None]:
# Start again from the augmented dataset.
data = final_data.copy()

# Impute missing values: column median for numeric columns, most frequent
# value (first mode) for all other columns.
numeric_columns = data.select_dtypes(include=[np.number]).columns
numeric_medians = data[numeric_columns].median()
data[numeric_columns] = data[numeric_columns].fillna(numeric_medians)
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
most_frequent_values = data[categorical_columns].mode().iloc[0]
data[categorical_columns] = data[categorical_columns].fillna(most_frequent_values)

# Standardize every numeric column except the ones listed here, which are
# kept on their original scale.
exclude_columns = ['Sale_Month', 'Cluster']
numeric_columns_for_scaling = [name for name in numeric_columns if name not in exclude_columns]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numeric_columns_for_scaling])
data[numeric_columns_for_scaling] = scaled_values

# Confirm the excluded columns still hold their original values.
for untouched_column in exclude_columns:
    print(f"Unique values in {untouched_column} after processing:", data[untouched_column].unique())

In [None]:
# Change data type to int
# astype(int) raises if the column still contains NaN, so this relies on the
# missing cluster labels having been filled beforehand; any non-integer fill
# value (e.g. a fractional median) is truncated by the cast.
data['Cluster'] = data['Cluster'].astype(int)

# Check data
print("Unique values in Cluster after conversion:", data['Cluster'].unique())

## Remove Outliers

In [None]:
def remove_outliers_iqr(df, columns, iqr_multiplier=1.5):
    """Return a copy of *df* without rows that hold IQR outliers.

    For each column in *columns*, a value is an outlier when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``,
    where Q1/Q3 are that column's 25th/75th percentiles. Outliers are set to
    NaN and every row containing a NaN in any column is then dropped, so rows
    with pre-existing missing values are removed as well.

    The input frame is not modified.

    Args:
        df: DataFrame to filter.
        columns: Iterable of numeric column names to check.
        iqr_multiplier: Width of the fences in units of the IQR.

    Returns:
        A new DataFrame containing only the surviving rows.
    """
    # Work on a copy: writing NaN into the caller's frame would silently
    # corrupt data that is reused after this call.
    cleaned = df.copy()
    for column in columns:
        q1 = cleaned[column].quantile(0.25)
        q3 = cleaned[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        is_outlier = (cleaned[column] < lower_bound) | (cleaned[column] > upper_bound)
        cleaned[column] = cleaned[column].mask(is_outlier)
    return cleaned.dropna()

# Remove outliers using IQR method
# Every numeric column is checked with the conventional 1.5 * IQR fences and
# the filtered frame is stored in data_cleaned.
data_cleaned = remove_outliers_iqr(data, numeric_columns, iqr_multiplier=1.5)
print("Data shape after adjusting IQR threshold:", data_cleaned.shape)


In [None]:
def remove_outliers_zscore(df, columns, z_threshold=3):
    """Return a copy of *df* without rows that hold Z-score outliers.

    For each column in *columns*, Z-scores are computed on the column with
    its missing values temporarily replaced by the column median. Values
    whose absolute Z-score exceeds *z_threshold* are set to NaN, and every
    row containing a NaN in any column is then dropped, so rows with
    pre-existing missing values are removed as well.

    The input frame is not modified.

    Args:
        df: DataFrame to filter.
        columns: Iterable of numeric column names to check.
        z_threshold: Absolute Z-score above which a value is an outlier.

    Returns:
        A new DataFrame containing only the surviving rows.
    """
    # Work on a copy: writing NaN into the caller's frame would silently
    # corrupt data that is reused after this call.
    cleaned = df.copy()
    for column in columns:
        # Median-fill only for the Z-score computation so NaN entries do not
        # turn the whole score vector into NaN.
        filled = cleaned[column].fillna(cleaned[column].median())
        z_scores = np.abs(np.asarray(stats.zscore(filled)))
        cleaned[column] = cleaned[column].mask(z_scores > z_threshold)
    # Drop rows with NaN values
    return cleaned.dropna()

# Remove outliers using Z-score method
# Note: this reassigns data_cleaned, so the result of any earlier assignment
# to that name is discarded.
data_cleaned = remove_outliers_zscore(data, numeric_columns, z_threshold=3)
print("Data shape after removing outliers with Z-score:", data_cleaned.shape)


In [None]:
# Remove outliers for key features
# Two passes restricted to the key columns: first the 1.5 * IQR fences, then a
# Z-score filter (threshold 2.5) applied to the output of the first pass.
important_columns = ['PRICE', 'ROOMS', 'GBA']  # Define important columns for outlier removal
data_important_cleaned = remove_outliers_iqr(data, important_columns, iqr_multiplier=1.5)
data_important_cleaned = remove_outliers_zscore(data_important_cleaned, important_columns, z_threshold=2.5)
print("Data shape after stricter outlier removal for key features:", data_important_cleaned.shape)

In [None]:
def quantile_clipping(df, columns, lower_quantile=0.025, upper_quantile=0.975):
    """Return a copy of *df* with the given columns clipped to a quantile range.

    For each column in *columns*, values below the column's *lower_quantile*
    are raised to that quantile and values above its *upper_quantile* are
    lowered to that quantile. No rows are removed.

    The input frame is not modified.

    Args:
        df: DataFrame to clip.
        columns: Iterable of numeric column names to clip.
        lower_quantile: Quantile (0-1) used as the lower clipping bound.
        upper_quantile: Quantile (0-1) used as the upper clipping bound.

    Returns:
        A new DataFrame with the same shape as *df*.
    """
    # Work on a copy so the caller's frame keeps its original values.
    clipped = df.copy()
    for column in columns:
        lower_bound = clipped[column].quantile(lower_quantile)
        upper_bound = clipped[column].quantile(upper_quantile)
        clipped[column] = clipped[column].clip(lower=lower_bound, upper=upper_bound)
    return clipped

# Apply quantile clipping to the entire dataset
# Uses the function's default bounds (2.5th and 97.5th percentiles) on every
# numeric column; clipping changes values only, so the shape is unchanged.
data_clipped = quantile_clipping(data, numeric_columns)
print("Data shape after quantile clipping:", data_clipped.shape)

## Remove NaN

In [None]:
# Step 1: Impute missing values
# Numeric columns: fill with the column median.
numeric_columns = data.select_dtypes(include=[np.number]).columns
numeric_medians = data[numeric_columns].median()
data[numeric_columns] = data[numeric_columns].fillna(numeric_medians)

# All other columns: fill with the most frequent value (first mode).
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
most_frequent_values = data[categorical_columns].mode().iloc[0]
data[categorical_columns] = data[categorical_columns].fillna(most_frequent_values)

# Report how many missing values remain in the whole frame.
remaining_missing = data.isnull().sum().sum()
print("Missing values after initial processing:", remaining_missing)

## Standardize

In [None]:
# Step 2: Bring every numeric column to zero mean and unit variance.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[numeric_columns])
data[numeric_columns] = standardized_values

# Blank out entries whose absolute Z-score exceeds 2.5, then refill the
# resulting gaps with the median of each column.
z_scores = np.abs(stats.zscore(data[numeric_columns]))
is_extreme = z_scores > 2.5
data[numeric_columns] = np.where(is_extreme, np.nan, data[numeric_columns])
column_medians = data[numeric_columns].median()
data[numeric_columns] = data[numeric_columns].fillna(column_medians)

## Split Dataset

In [None]:
# Build the log-transformed target from the RAW prices.
# By this point the numeric columns of `data` (PRICE included, when it is
# numeric) have been standardized and outlier-imputed by the preceding cells.
# log1p of such zero-centred values is undefined below -1 (NaN) and -inf at -1,
# and expm1 of the predictions would not be a price. `data` was created as a
# copy of `final_data` and no rows have been removed from it, so the untouched
# prices can be looked up by index.
raw_price = pd.to_numeric(final_data.loc[data.index, 'PRICE'], errors='coerce')

# Rows without a usable target cannot be used for supervised training;
# drop them instead of inventing a target value.
has_valid_price = raw_price.notna() & (raw_price >= 0)
y = np.log1p(raw_price[has_valid_price])  # Apply log transformation to target variable
X = data.loc[has_valid_price].drop(columns=['PRICE'])

# Ensure that only numeric columns are included in X for model training
X_numeric = X.select_dtypes(include=[np.number])
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

# Fill any remaining NaN values with median in both training and test sets.
# The imputer is fitted on the training split only and reused for the test
# split, so no test-set statistics are used.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

## Use PCA to Reduce Dimensionality

In [None]:
# Project the features onto at most 30 principal components. PCA raises when
# n_components exceeds the number of samples or features, so the requested
# size is capped by the shape of the training matrix.
n_pca_components = min(30, X_train.shape[0], X_train.shape[1])

# A fixed seed keeps the projection reproducible when a randomized SVD solver
# is selected.
pca = PCA(n_components=n_pca_components, random_state=42)

# Fit on the training split only and apply the same projection to the test split.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print("Original feature count:", X_train.shape[1])
print("Reduced feature count with PCA:", X_train_pca.shape[1])

## Check NaN

In [None]:
# Count the missing target values in each split.
for split_name, split_target in (("y_train", y_train), ("y_test", y_test)):
    print(f"NaN values in {split_name}:", split_target.isnull().sum())
# print("NaN values in y_pred_lr:", np.isnan(y_pred_lr).sum())


## Handling NaN Values in the Target

In [None]:
# Replace missing target values with the median of the split they belong to.
y_train = y_train.fillna(y_train.median())
y_test = y_test.fillna(y_test.median())
for split_name, split_target in (("y_train", y_train), ("y_test", y_test)):
    print(f"NaN values in {split_name} after processing:", split_target.isnull().sum())

# # Fill NaN values in y_pred_lr with the mean
# y_pred_lr = np.nan_to_num(y_pred_lr, nan=np.mean(y_pred_lr))
# print("NaN values in y_pred_lr after processing:", np.isnan(y_pred_lr).sum())

## Training/Evaluating Models

In [None]:
# The target was log1p-transformed before the split, so expm1 maps both the
# true values and the predictions back before scoring.
y_test_actual = np.expm1(y_test)

# Linear Regression on the PCA-reduced features
linear_model = LinearRegression()
linear_model.fit(X_train_pca, y_train)
y_pred_lr = np.expm1(linear_model.predict(X_test_pca))

# Random Forest on the same features (fixed seed for reproducibility)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_pca, y_train)
y_pred_rf = np.expm1(rf_model.predict(X_test_pca))

# RMSE and R^2 for both models on the back-transformed values
rmse_lr = np.sqrt(mean_squared_error(y_test_actual, y_pred_lr))
r2_lr = r2_score(y_test_actual, y_pred_lr)
rmse_rf = np.sqrt(mean_squared_error(y_test_actual, y_pred_rf))
r2_rf = r2_score(y_test_actual, y_pred_rf)

print("Linear Regression RMSE:", rmse_lr, "R^2:", r2_lr)
print("Random Forest RMSE:", rmse_rf, "R^2:", r2_rf)