## 1. Import packages

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error

## 2. Load data

In [2]:
df = pd.read_csv('./NY-House-Dataset.csv')

### Show the first 5 rows of the data

In [3]:
df.head()

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,ADDRESS,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY,STREET_NAME,LONG_NAME,FORMATTED_ADDRESS,LATITUDE,LONGITUDE
0,Brokered by Douglas Elliman -111 Fifth Ave,Condo for sale,315000,2,2.0,1400.0,2 E 55th St Unit 803,"New York, NY 10022","2 E 55th St Unit 803New York, NY 10022",New York County,New York,Manhattan,East 55th Street,Regis Residence,"Regis Residence, 2 E 55th St #803, New York, N...",40.761255,-73.974483
1,Brokered by Serhant,Condo for sale,195000000,7,10.0,17545.0,Central Park Tower Penthouse-217 W 57th New Yo...,"New York, NY 10019",Central Park Tower Penthouse-217 W 57th New Yo...,United States,New York,New York County,New York,West 57th Street,"217 W 57th St, New York, NY 10019, USA",40.766393,-73.980991
2,Brokered by Sowae Corp,House for sale,260000,4,2.0,2015.0,620 Sinclair Ave,"Staten Island, NY 10312","620 Sinclair AveStaten Island, NY 10312",United States,New York,Richmond County,Staten Island,Sinclair Avenue,"620 Sinclair Ave, Staten Island, NY 10312, USA",40.541805,-74.196109
3,Brokered by COMPASS,Condo for sale,69000,3,1.0,445.0,2 E 55th St Unit 908W33,"Manhattan, NY 10022","2 E 55th St Unit 908W33Manhattan, NY 10022",United States,New York,New York County,New York,East 55th Street,"2 E 55th St, New York, NY 10022, USA",40.761398,-73.974613
4,Brokered by Sotheby's International Realty - E...,Townhouse for sale,55000000,7,2.373861,14175.0,5 E 64th St,"New York, NY 10065","5 E 64th StNew York, NY 10065",United States,New York,New York County,New York,East 64th Street,"5 E 64th St, New York, NY 10065, USA",40.767224,-73.969856


In [4]:
df['BEDS'].max(), df['BEDS'].min(), df['BEDS'].mean()

(50, 1, 3.3568006665278065)

In [5]:
df['BATH'].max(), df['BATH'].min(), df['BATH'].mean()

(50.0, 0.0, 2.3738608579684373)

In [6]:
df['PROPERTYSQFT'].max(), df['PROPERTYSQFT'].min(), df['PROPERTYSQFT'].mean()


(65535.0, 230.0, 2184.207861758384)

In [7]:
df['LATITUDE'].max(), df['LATITUDE'].min(), df['LATITUDE'].mean()

(40.9127295, 40.4995462, 40.71422708323266)

In [8]:
df['LONGITUDE'].max(), df['LONGITUDE'].min(), df['LONGITUDE'].mean()

(-73.70245, -74.2530332, -73.94160121297645)

In [9]:
df['PRICE'].max(), df['PRICE'].min(), df['PRICE'].mean()

(2147483647, 2494, 2356940.17100604)

### Show information about the data

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4801 entries, 0 to 4800
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   BROKERTITLE                  4801 non-null   object 
 1   TYPE                         4801 non-null   object 
 2   PRICE                        4801 non-null   int64  
 3   BEDS                         4801 non-null   int64  
 4   BATH                         4801 non-null   float64
 5   PROPERTYSQFT                 4801 non-null   float64
 6   ADDRESS                      4801 non-null   object 
 7   STATE                        4801 non-null   object 
 8   MAIN_ADDRESS                 4801 non-null   object 
 9   ADMINISTRATIVE_AREA_LEVEL_2  4801 non-null   object 
 10  LOCALITY                     4801 non-null   object 
 11  SUBLOCALITY                  4801 non-null   object 
 12  STREET_NAME                  4801 non-null   object 
 13  LONG_NAME         

## 3. Data preprocessing

In [11]:
z_scores = stats.zscore(df['PRICE'])

# Define a threshold for z-scores (e.g., 3)
threshold = 3

# Identify and remove outliers based on the threshold
df_no_outliers = df[np.abs(z_scores) <= threshold]

In [12]:
features = ['BEDS', 'BATH', 'PROPERTYSQFT', 'LATITUDE', 'LONGITUDE']
target = 'PRICE'

In [13]:
X = df_no_outliers[features]
y = df_no_outliers[target]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(3839, 5) (3839,) (960, 5) (960,)


## 4. Define the model

In [16]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

In [17]:
rf_regressor.fit(X_train, y_train)

### Prediction

In [18]:
y_pred = rf_regressor.predict(X_test)

In [19]:
# Evaluate the model
mse = root_mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 2662663.357235026
Root Mean Squared Error: 1631.7669432964458
R2 Score: 0.6917918620408712


## Save the model

In [20]:
import pickle

with open("rf_regressor.pkl", "wb") as model_file:
    pickle.dump(rf_regressor, model_file)