In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data_loc = "house_price_dataset.csv"
df = pd.read_csv(data_loc)

# get list of features
print("Columns: \n",df.columns.tolist())

# Create a binary target: 1 = expensive, 0 = affordable
price_threshold = df["price"].median()
df["is_expensive"] = (df["price"] > price_threshold).astype(int)

# Drop the original price from features
X = df.drop(columns=["price", "is_expensive"])
y = df["is_expensive"]

# One-hot encode categorical columns (like door_color, wallpaper_pattern)
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

Columns: 
 ['square_feet', 'num_rooms', 'num_bathrooms', 'floor_number', 'distance_to_city_km', 'house_age_years', 'lot_area_sqft', 'garage_size', 'nearby_schools_rating', 'crime_rate_index', 'door_color', 'wallpaper_pattern', 'lucky_number', 'favorite_emoji', 'wifi_name_length', 'random_code', 'pet_name_length', 'does_owner_like_coffee', 'car_color', 'weekend_activity_score', 'price']


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))

# Get feature importances
importances = rf.feature_importances_
fi_df = pd.DataFrame({
    "feature": X.columns,
    "importance": importances
}).sort_values(by="importance", ascending=False)

print(fi_df.head(10))

Random Forest Accuracy: 0.94
                feature  importance
0           square_feet    0.563966
6         lot_area_sqft    0.038607
4   distance_to_city_km    0.036506
12          random_code    0.034574
9      crime_rate_index    0.033632
5       house_age_years    0.033394
3          floor_number    0.030672
10         lucky_number    0.030182
1             num_rooms    0.026982
11     wifi_name_length    0.023803


In [8]:
from sklearn.inspection import permutation_importance

# Use the already trained RandomForest model: rf
result = permutation_importance(
    rf, X_test, y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1
)

perm_importances = result.importances_mean
perm_df = pd.DataFrame({
    "feature": X.columns,
    "perm_importance": perm_importances
}).sort_values(by="perm_importance", ascending=False)

print(perm_df.head(10))

                   feature  perm_importance
0              square_feet         0.431333
1                num_rooms         0.006000
12             random_code         0.000667
5          house_age_years         0.000667
2            num_bathrooms         0.000000
7              garage_size         0.000000
10            lucky_number         0.000000
13         pet_name_length         0.000000
11        wifi_name_length         0.000000
14  does_owner_like_coffee         0.000000


In [9]:
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_df = pd.DataFrame({
    "feature": X.columns,
    "mi_score": mi_scores
}).sort_values(by="mi_score", ascending=False)

print(mi_df.head(10))

                        feature  mi_score
0                   square_feet  0.555182
18               door_color_Red  0.056999
7                   garage_size  0.042050
20  wallpaper_pattern_Geometric  0.031641
27               car_color_Blue  0.027811
10                 lucky_number  0.015518
5               house_age_years  0.013739
19             door_color_White  0.012926
14       does_owner_like_coffee  0.009081
8         nearby_schools_rating  0.005328
