In [125]:
import pandas as pd
import os

# Define the directory containing your files
directory = 'data'

# List all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.startswith('geocoded_addresses') and f.endswith('.csv')]

# Sort the files by name to maintain the order (optional, depending on your needs)
csv_files.sort()

# Create an empty list to hold DataFrames
dataframes = []

# Loop through the files and read each one into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all the DataFrames in the list into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Now you can use combined_df as your single DataFrame containing all data
print(combined_df.head())  # to display the first few rows of the combined DataFrame
combined_df = df


                                   Address  Property Type  Bedrooms  \
0                   2 Dunman Road (439188)              0         5   
1                            Shelford Road              1         5   
2  60H Kent Ridge Hill Residences (117321)              0         1   
3                               Marina Way              0         1   
4                           Lentor Central              0         1   

   Bathrooms  Asking Price  Size  Age  District  Years_Left  Primary Schools  \
0          3       3764000  1679   54        15        45.0                3   
1          4      10000000  5134   41        11      9958.0                2   
2          1       1030000   474    1         5        98.0                0   
3          1       1630000   700    7         1        92.0                1   
4          1       1358000   527   94        26         5.0                3   

   Secondary Schools  Shopping Malls  Groceries & Supermarts  \
0                  3        

In [129]:
df.dropna(subset=['Latitude', 'Longitude','Years_Left'], inplace=True)

In [130]:
import pandas as pd

# Assuming df is your DataFrame
# Drop the 'Address' column
df = df.drop(columns=['Address'])

# Ensure 'Years_Left' is of type float
df['Years_Left'] = df['Years_Left'].astype(float)


In [131]:
# One-hot encode 'District'
df = pd.get_dummies(df, columns=['District'])


In [132]:
import geohash2

# Define a function to apply Geohash encoding
def encode_geohash(df, lat_col='Latitude', lon_col='Longitude', precision=5):
    """ Encodes latitude and longitude into geohash. """
    df['geohash'] = df.apply(lambda x: geohash2.encode(x[lat_col], x[lon_col], precision=precision), axis=1)
    return df

# Apply the function to your dataframe
df = encode_geohash(df)

In [134]:
def target_encode_geohash(df, target_col):
    """ Target encodes the geohash to find average values of the target variable per geohash. """
    # Calculate mean target per geohash
    geohash_target_mean = df.groupby('geohash')[target_col].mean().reset_index(name='geohash_target_mean')
    
    # Merge this back on the original dataframe
    df = df.merge(geohash_target_mean, how='left', on='geohash')
    return df

# Assuming your target column is named appropriately
df = target_encode_geohash(df, target_col='Asking Price')

In [137]:
from sklearn.preprocessing import StandardScaler

# Assuming df is your DataFrame and selecting numeric columns for scaling
numeric_cols = ['Bedrooms', 'Bathrooms', 'Size', 'Age', 'Years_Left', 
                'Primary Schools', 'Secondary Schools', 'Shopping Malls', 'Groceries & Supermarts', 
                'No. of Amenities','geohash_target_mean']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


# Creating the model

In [139]:
from sklearn.model_selection import train_test_split

# Assume 'df' is your DataFrame and it's ready for splitting
# 'X' contains all features except the target variable 'Asking Price'
# 'y' is the target variable 'Asking Price'# Exclude the 'geohash' column from the feature set
X = df.drop(['Asking Price', 'geohash','Latitude', 'Longitude', 'Primary Schools', 'Secondary Schools', 'Shopping Malls', 'Groceries & Supermarts',], axis=1)  # Assuming 'Asking Price' is your target variable
y = df['Asking Price']

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [104]:
# Example price thresholds
low_price_threshold = 1300000
high_price_threshold = 3000000

# Splitting the dataset
low_price_data = df[df['Asking Price'] <= low_price_threshold]
mid_price_data = df[(df['Asking Price'] > low_price_threshold) & (df['Asking Price'] <= high_price_threshold)]
high_price_data = df[df['Asking Price'] > high_price_threshold]


In [147]:
import os
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
def train_rf_and_evaluate(data, segment_name):
    X = data.drop(['Asking Price', 'geohash', 'Latitude', 'Longitude'], axis=1)
    y = data['Asking Price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf = RandomForestRegressor()
    param_grid = {
        'n_estimators': [100, 130, 150, 250, 300],
        'max_depth': [20, 30, 70, 150],
        'min_samples_split': [2, 5, 8]
    }
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_rf = grid_search.best_estimator_
    
    y_pred = best_rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    relative_error = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    print(f'{segment_name} - Random Forest MSE: {mse}')
    print(f'{segment_name} - Random Forest MAE: {mae}')
    print(f'{segment_name} - Relative Error: {relative_error}%')
    print(f"{segment_name} - Best Parameters:", grid_search.best_params_)
    
    # Save the best model in the current working directory
    save_path = os.getcwd()
    save_file = os.path.join(save_path, f"best_rf_{segment_name}.joblib")
    joblib.dump(best_rf, save_file)

# Assuming low_price_data, mid_price_data, and high_price_data are already defined and appropriately preprocessed
print("Random Forest - Low Price Segment:")
train_rf_and_evaluate(low_price_data, "Low Price")

print("Random Forest - Mid Price Segment:")
train_rf_and_evaluate(mid_price_data, "Mid Price")

print("Random Forest - High Price Segment:")
train_rf_and_evaluate(high_price_data, "High Price")


Random Forest - Low Price Segment:
Low Price - Random Forest MSE: 17910844911.29493
Low Price - Random Forest MAE: 71287.36641296934
Low Price - Relative Error: 259.12877192742553%
Low Price - Best Parameters: {'max_depth': 150, 'min_samples_split': 5, 'n_estimators': 130}
Random Forest - Mid Price Segment:
Mid Price - Random Forest MSE: 30468954669.377533
Mid Price - Random Forest MAE: 116911.53721549566
Mid Price - Relative Error: 6.04629555148975%
Mid Price - Best Parameters: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest - High Price Segment:
High Price - Random Forest MSE: 2558143125667.311
High Price - Random Forest MAE: 595042.6778322255
High Price - Relative Error: 8.186610528423794%
High Price - Best Parameters: {'max_depth': 70, 'min_samples_split': 2, 'n_estimators': 100}
