In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import joblib
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,bedrooms,bathrooms,square_feet,location,year_built,garage,has_pool,property_type,num_floors,has_basement,price,price_category
0,4,1,3463,Harlem,1993,1,1,Condo,2,1,1062805,High
1,5,3,3016,Staten Island,2019,0,0,Condo,3,1,984235,High
2,3,4,3591,Queens,1994,1,1,Condo,3,0,1090968,High
3,5,2,2017,Brooklyn,2009,0,1,House,2,0,947886,High
4,5,4,2042,Brooklyn,1991,1,1,Condo,2,0,986703,High


In [4]:
df['price_category'].unique()

array(['High', 'Low', 'Medium'], dtype=object)

In [6]:
X = df.drop(['price', 'price_category'], axis=1)  #
y = df['price']

In [8]:
categorical_features = ['location', 'property_type']
label_encoders = {}


In [9]:
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [12]:
y_pred = model.predict(X_test)

In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae*100:.2f}")
print(f"RMSE: {rmse*100:.2f}")
print(f"R² Score: {r2*100:.4f}")


MAE: 2686971.41
RMSE: 3359759.20
R² Score: 96.4430


In [16]:
joblib.dump(model, 'Random_forest_price_regressor.pkl')


['Random_forest_price_regressor.pkl']

In [17]:
model = joblib.load('Random_forest_price_regressor.pkl')

In [18]:
import numpy as np
location_categories = ['Manhattan', 'Brooklyn', 'Bronx', 'Queens', 'Staten Island', 'Harlem']
property_type_categories = ['Apartment', 'Condo', 'House']

location_le = LabelEncoder()
location_le.classes_ = np.array(location_categories)

property_type_le = LabelEncoder()
property_type_le.classes_ = np.array(property_type_categories)

In [19]:
def get_int(prompt, min_val=None, max_val=None):
    while True:
        try:
            val = int(input(prompt))
            if (min_val is not None and val < min_val) or (max_val is not None and val > max_val):
                print(f"Please enter a value between {min_val} and {max_val}")
                continue
            return val
        except ValueError:
            print("Invalid input. Please enter an integer.")

In [20]:
def get_category(prompt, categories):
    categories_lower = [c.lower() for c in categories]
    while True:
        val = input(prompt).strip()
        if val.lower() in categories_lower:
            # Return with proper case
            return categories[categories_lower.index(val.lower())]
        else:
            print(f"Invalid input. Choose from: {categories}")

print("Please enter the property details below:")

Please enter the property details below:


In [21]:
bedrooms = get_int("Bedrooms (1-7): ", 1, 7)
bathrooms = get_int("Bathrooms (1-4): ", 1, 4)
square_feet = get_int("Square feet (600-4000): ", 600, 4000)
location = get_category("Location (Manhattan, Brooklyn, Bronx, Queens, Staten Island, Harlem): ", location_categories)
year_built = get_int("Year built (1980-2022): ", 1980, 2022)
garage = get_int("Garage (0 = No, 1 = Yes): ", 0, 1)
has_pool = get_int("Has pool (0 = No, 1 = Yes): ", 0, 1)
property_type = get_category("Property type (Apartment, Condo, House): ", property_type_categories)
num_floors = get_int("Number of floors (1-3): ", 1, 3)
has_basement = get_int("Has basement (0 = No, 1 = Yes): ", 0, 1)

Bedrooms (1-7): 4
Bathrooms (1-4): 1
Square feet (600-4000): 3463
Location (Manhattan, Brooklyn, Bronx, Queens, Staten Island, Harlem): Harlem
Year built (1980-2022): 1993
Garage (0 = No, 1 = Yes): 1
Has pool (0 = No, 1 = Yes): 1
Property type (Apartment, Condo, House): Condo
Number of floors (1-3): 2
Has basement (0 = No, 1 = Yes): 1


In [22]:

import numpy as np

location_encoded = location_le.transform(np.array([location]))[0]
property_type_encoded = property_type_le.transform(np.array([property_type]))[0]

In [23]:
input_df = pd.DataFrame([{
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'square_feet': square_feet,
    'location': location_encoded,
    'year_built': year_built,
    'garage': garage,
    'has_pool': has_pool,
    'property_type': property_type_encoded,
    'num_floors': num_floors,
    'has_basement': has_basement
}])


In [25]:
predicted_price = model.predict(input_df)[0]

print(f"\n Predicted Price : {predicted_price}")


 Predicted Price : 966849.76
