In [34]:
import pandas as pd
import os

# Directory that holds the geocoded CSV exports
directory = 'data'

# Collect every geocoded_addresses*.csv file, sorted by name so the row order is deterministic
csv_files = sorted(
    name for name in os.listdir(directory)
    if name.startswith('geocoded_addresses') and name.endswith('.csv')
)

# Fail early with a clear message: pd.concat raises an unhelpful ValueError on an empty list
if not csv_files:
    raise FileNotFoundError(
        f"No 'geocoded_addresses*.csv' files found in {os.path.abspath(directory)!r}"
    )

# Read each file into its own DataFrame
dataframes = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]

# Stack all files into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

# Preview the first few rows of the combined data
print(combined_df.head())

# Work on a separate copy so that in-place edits to df in later cells leave combined_df untouched
df = combined_df.copy()


                                   Address  Property Type  Bedrooms  \
0                   2 Dunman Road (439188)              0         5   
1                            Shelford Road              1         5   
2  60H Kent Ridge Hill Residences (117321)              0         1   
3                               Marina Way              0         1   
4                           Lentor Central              0         1   

   Bathrooms  Asking Price  Size  Age  District  Years_Left  Primary Schools  \
0          3       3764000  1679   54        15        45.0                3   
1          4      10000000  5134   41        11      9958.0                2   
2          1       1030000   474    1         5        98.0                0   
3          1       1630000   700    7         1        92.0                1   
4          1       1358000   527   94        26         5.0                3   

   Secondary Schools  Shopping Malls  Groceries & Supermarts  \
0                  3        

In [35]:
# Total number of rows after combining all CSV files
len(df)

21369

In [36]:
# Keep only rows that have 'Latitude', 'Longitude' and 'Years_Left' values.
# Rebinding df (instead of inplace=True) avoids mutating the frame created by the loading cell.
df = df.dropna(subset=['Latitude', 'Longitude', 'Years_Left'])

In [37]:
import pandas as pd

# Remove the 'Address' column and store 'Years_Left' as float, in one chained step
df = df.drop(columns=['Address']).astype({'Years_Left': float})


In [38]:
# One-hot encode 'District': the column is replaced by one indicator column per
# distinct value, named 'District_<value>'
df = pd.get_dummies(df, columns=['District'])


In [39]:
import pandas as pd
from scipy.spatial import cKDTree
import numpy as np
def add_spatial_density(df, radius=0.0005):
    """Return a copy of ``df`` with a ``spatial_density`` neighbour-count column.

    For every row, counts how many rows (the row itself included) lie within
    ``radius`` of it, based on the 'Latitude' and 'Longitude' columns.

    Coordinates are converted from degrees to radians and compared with the
    plain Euclidean distance of a KD-tree, so ``radius`` is an angle in
    radians. This is a flat approximation, not a great-circle distance;
    longitude differences are not corrected for latitude.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'Latitude' and 'Longitude' columns in degrees.
    radius : float, default 0.0005
        Search radius in radians.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra integer column 'spatial_density'.
        The input frame is left unmodified.
    """
    coords = np.radians(df[['Latitude', 'Longitude']].to_numpy(dtype=float))

    if len(coords) == 0:
        # Nothing to index; return an empty count column instead of building a tree
        counts = np.zeros(0, dtype=int)
    else:
        tree = cKDTree(coords)
        # return_length=True yields the neighbour count per point rather than index lists
        counts = tree.query_ball_point(coords, radius, return_length=True)

    # assign() builds a new frame, so the caller's DataFrame is not mutated
    return df.assign(spatial_density=counts)


In [40]:
# Add the 'spatial_density' neighbour-count column using the default radius
df = add_spatial_density(df)


In [41]:
# Row count after dropping rows with missing 'Latitude', 'Longitude' or 'Years_Left'
len(df)

16431

In [42]:
# How often each neighbour count occurs across the rows
df['spatial_density'].value_counts()

364     240
920     146
980     136
3543    132
1831    121
       ... 
1520      1
2175      1
1337      1
1238      1
2256      1
Name: spatial_density, Length: 1767, dtype: int64

In [43]:
from sklearn.preprocessing import StandardScaler
import joblib
# Numeric feature columns that get standardised
numeric_cols = [
    'Bedrooms',
    'Bathrooms',
    'Size',
    'Age',
    'Years_Left',
    'No. of Amenities',
    'spatial_density',
]

# Fit the scaler on the selected columns, then overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on the whole frame before any train/test split, and
# re-running this cell would scale already-scaled data; confirm this is intended.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# Persist the fitted scaler next to the notebook
joblib.dump(scaler, "scaler.joblib")

['scaler.joblib']

In [44]:
# List the columns after dropping 'Address', one-hot encoding 'District' and adding 'spatial_density'
df.columns

Index(['Property Type', 'Bedrooms', 'Bathrooms', 'Asking Price', 'Size', 'Age',
       'Years_Left', 'Primary Schools', 'Secondary Schools', 'Shopping Malls',
       'Groceries & Supermarts', 'No. of Amenities', 'Latitude', 'Longitude',
       'District_1', 'District_2', 'District_3', 'District_4', 'District_5',
       'District_6', 'District_7', 'District_8', 'District_9', 'District_10',
       'District_11', 'District_12', 'District_13', 'District_14',
       'District_15', 'District_16', 'District_17', 'District_18',
       'District_19', 'District_20', 'District_21', 'District_22',
       'District_23', 'District_24', 'District_25', 'District_26',
       'District_27', 'District_28', 'spatial_density'],
      dtype='object')

In [45]:
# Preview the first rows of the frame after scaling
df.head()

Unnamed: 0,Property Type,Bedrooms,Bathrooms,Asking Price,Size,Age,Years_Left,Primary Schools,Secondary Schools,Shopping Malls,...,District_20,District_21,District_22,District_23,District_24,District_25,District_26,District_27,District_28,spatial_density
0,0,2.132232,0.597814,3764000,0.024462,1.314357,-0.731087,3,3,3,...,0,0,0,0,0,0,0,0,0,-0.393894
1,1,2.132232,1.459755,10000000,0.281411,0.803582,1.386332,2,3,1,...,0,0,0,0,0,0,0,0,0,-0.339243
2,0,-1.435366,-1.126068,1030000,-0.065154,-0.768034,-0.719766,0,0,3,...,0,0,0,0,0,0,0,0,0,-0.806122
3,0,-1.435366,-1.126068,1630000,-0.048346,-0.532292,-0.721048,1,0,3,...,0,0,0,0,0,0,0,0,0,0.334791
4,0,-1.435366,-1.126068,1358000,-0.061212,2.885974,-0.739631,3,3,1,...,0,0,0,0,0,0,1,0,0,-1.189722


# Creating the model

In [46]:
# Segment boundaries on 'Asking Price' (example values)
low_price_threshold = 1_300_000
high_price_threshold = 3_000_000

# Partition the rows into three disjoint price bands:
#   low:  price <= low threshold
#   mid:  low threshold < price <= high threshold
#   high: price > high threshold
low_price_data = df.loc[df['Asking Price'].le(low_price_threshold)]
mid_price_data = df.loc[
    df['Asking Price'].gt(low_price_threshold)
    & df['Asking Price'].le(high_price_threshold)
]
high_price_data = df.loc[df['Asking Price'].gt(high_price_threshold)]


In [47]:
import os
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
def train_rf_and_evaluate(data, segment_name, random_state=42):
    """Grid-search a random forest on one price segment, report metrics, save the model.

    The target is 'Asking Price'. The coordinate columns and the four
    per-category amenity count columns are excluded from the features.
    The data is split 80/20 into train and test sets; hyper-parameters are
    chosen by 3-fold cross-validation on the training part using negative
    mean squared error, and the best estimator is evaluated on the test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Preprocessed rows of one price segment; must contain 'Asking Price'
        and every column listed in the drop list below.
    segment_name : str
        Label used in the printed report and in the saved file name.
    random_state : int, default 42
        Seed used for both the train/test split and the random forest, so
        that repeated runs give the same metrics and the same saved model.

    Returns
    -------
    None
        Results are printed, and the best model is written to
        ``best_rf_<segment_name>.joblib`` in the current working directory.
    """
    X = data.drop(['Asking Price', 'Latitude', 'Longitude','Primary Schools', 'Secondary Schools', 'Shopping Malls',
       'Groceries & Supermarts'], axis=1)
    y = data['Asking Price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Seed the forest as well: an unseeded forest makes the search and the saved model non-reproducible
    rf = RandomForestRegressor(random_state=random_state)
    param_grid = {
        'n_estimators': [100, 130, 150, 250, 300],
        'max_depth': [20, 30, 70, 150],
        'min_samples_split': [2, 5, 8]
    }
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_rf = grid_search.best_estimator_

    y_pred = best_rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # Mean absolute percentage error; it grows very large when a target value is close to zero
    relative_error = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    print(f'{segment_name} - Random Forest MSE: {mse}')
    print(f'{segment_name} - Random Forest MAE: {mae}')
    print(f'{segment_name} - Relative Error: {relative_error}%')
    print(f"{segment_name} - Best Parameters:", grid_search.best_params_)

    # Save the best model in the current working directory
    save_path = os.getcwd()
    save_file = os.path.join(save_path, f"best_rf_{segment_name}.joblib")
    joblib.dump(best_rf, save_file)

# Train and report one model per price segment; the three segment frames are
# expected to be defined and preprocessed by earlier cells
segment_runs = (
    ("Random Forest - Low Price Segment:", low_price_data, "Low Price"),
    ("Random Forest - Mid Price Segment:", mid_price_data, "Mid Price"),
    ("Random Forest - High Price Segment:", high_price_data, "High Price"),
)
for banner, segment_frame, segment_label in segment_runs:
    print(banner)
    train_rf_and_evaluate(segment_frame, segment_label)


Random Forest - Low Price Segment:
Low Price - Random Forest MSE: 17372927527.72321
Low Price - Random Forest MAE: 71516.8460978109
Low Price - Relative Error: 251.2317554640509%
Low Price - Best Parameters: {'max_depth': 20, 'min_samples_split': 8, 'n_estimators': 250}
Random Forest - Mid Price Segment:
Mid Price - Random Forest MSE: 31081632701.719547
Mid Price - Random Forest MAE: 118941.05987377919
Mid Price - Relative Error: 6.1257147499184015%
Mid Price - Best Parameters: {'max_depth': 150, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest - High Price Segment:
High Price - Random Forest MSE: 2361476371942.3457
High Price - Random Forest MAE: 592971.2555504533
High Price - Relative Error: 8.135819660813187%
High Price - Best Parameters: {'max_depth': 150, 'min_samples_split': 2, 'n_estimators': 130}


In [50]:
# Feature matrix: everything except the target, the coordinates and the
# per-category amenity count columns
X = df.drop(
    columns=[
        'Asking Price',
        'Latitude',
        'Longitude',
        'Primary Schools',
        'Secondary Schools',
        'Shopping Malls',
        'Groceries & Supermarts',
    ]
)

In [53]:
# Number of rows per 'Property Type' value in the feature matrix
X['Property Type'].value_counts()

0    8843
1    7588
Name: Property Type, dtype: int64

In [15]:
import os
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
def train_rf_and_evaluate(data, segment_name, random_state=42):
    """Grid-search a random forest on one price segment, report metrics, save the model.

    The target is 'Asking Price'; only the target and the coordinate columns
    are excluded from the features. The data is split 80/20 into train and
    test sets; hyper-parameters are chosen by 3-fold cross-validation on the
    training part using negative mean squared error, and the best estimator
    is evaluated on the test part.

    NOTE(review): a function with the same name is defined earlier in this
    notebook with a longer drop list; whichever cell runs last wins.

    Parameters
    ----------
    data : pandas.DataFrame
        Preprocessed rows of one price segment; must contain 'Asking Price',
        'Latitude' and 'Longitude'.
    segment_name : str
        Label used in the printed report and in the saved file name.
    random_state : int, default 42
        Seed used for both the train/test split and the random forest, so
        that repeated runs give the same metrics and the same saved model.

    Returns
    -------
    None
        Results are printed, and the best model is written to
        ``best_rf_<segment_name>.joblib`` in the current working directory.
    """
    X = data.drop(['Asking Price', 'Latitude', 'Longitude'], axis=1)
    y = data['Asking Price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Seed the forest as well: an unseeded forest makes the search and the saved model non-reproducible
    rf = RandomForestRegressor(random_state=random_state)
    param_grid = {
        'n_estimators': [100, 130, 150, 250, 300],
        'max_depth': [20, 30, 70, 150],
        'min_samples_split': [2, 5, 8]
    }
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_rf = grid_search.best_estimator_

    y_pred = best_rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # Mean absolute percentage error; it grows very large when a target value is close to zero
    relative_error = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    print(f'{segment_name} - Random Forest MSE: {mse}')
    print(f'{segment_name} - Random Forest MAE: {mae}')
    print(f'{segment_name} - Relative Error: {relative_error}%')
    print(f"{segment_name} - Best Parameters:", grid_search.best_params_)

    # Save the best model in the current working directory
    save_path = os.getcwd()
    save_file = os.path.join(save_path, f"best_rf_{segment_name}.joblib")
    joblib.dump(best_rf, save_file)
