In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Read the raw dataset from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="datasets/dataset.csv")


In [3]:
# Discard the columns that are treated as irrelevant for the later steps
irrelevant_cols = ['id', 'Date', 'Postal Code', 'Lattitude', 'Longitude']
data.drop(columns=irrelevant_cols, inplace=True)


In [4]:
# Cast the listed columns to integers
# NOTE(review): astype(int) drops any fractional part (e.g. 2.5 -> 2) and raises
# if a column contains NaN -- confirm this is intended for these columns.
float_to_int_cols = [
    'number of bedrooms',
    'number of bathrooms',
    'number of floors',
]
as_int = data[float_to_int_cols].astype(int)
data[float_to_int_cols] = as_int


In [5]:
# Make sure the categorical features are stored as integers.
# Columns that already have an integer dtype are left untouched; every other
# column is label-encoded through pandas' category codes.
categorical_cols = ['waterfront present', 'number of views', 'condition of the house', 'grade of the house', 'Number of schools nearby']
for col in categorical_cols:
    # is_integer_dtype accepts every integer width/signedness (and pandas' nullable
    # integers). Comparing against the literal 'int64' would re-encode e.g. an int32
    # column and silently replace its real values with 0..k-1 codes.
    if not pd.api.types.is_integer_dtype(data[col]):
        # Codes follow the category order; missing values are encoded as -1.
        data[col] = data[col].astype('category').cat.codes
        

In [6]:
# Store both year columns as integers
for year_col in ('Built Year', 'Renovation Year'):
    data[year_col] = data[year_col].astype(int)


In [7]:
# Min-Max scaling of numeric features -- currently disabled: the code below is commented out and MinMaxScaler is not imported in the import cell
#scaler = MinMaxScaler()
#numeric_cols = ['living area', 'lot area', 'Area of the house(excluding basement)', 'Area of the basement', 'living_area_renov', 'lot_area_renov', 'Distance from the airport', 'Price']
#data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


In [8]:
# Separate the target column ("Price") from the remaining feature columns
y = data["Price"]
X = data.drop(columns=["Price"])


In [9]:
# Split data into training and validation sets (80% train / 20% validation).
# A fixed random_state makes the shuffle deterministic, so re-running the notebook
# produces the same train/validation rows every time.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Join each feature split with its target column side by side
train_data, val_data = (
    pd.concat(list(parts), axis="columns")
    for parts in ((X_train, y_train), (X_val, y_val))
)


In [11]:
# Write each split to its own CSV file, leaving out the index column
for split_frame, csv_path in (
    (train_data, "datasets/train_dataset.csv"),
    (val_data, "datasets/val_dataset.csv"),
):
    split_frame.to_csv(csv_path, index=False)
