In [54]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [55]:
# Read the raw dataset from disk into a DataFrame
dataset_path = "datasets/dataset.csv"
df = pd.read_csv(dataset_path)


In [56]:
# Remove the columns that are not used further in this notebook.
# The frame is modified in place, so re-running this cell on the same
# `df` raises a KeyError because the columns are already gone.
columns_to_drop = ['Locality', 'Status', 'Transaction', 'Type', 'Per_Sqft']
df.drop(columns=columns_to_drop, inplace=True)


In [57]:
# Names of the numeric columns (imputed and cast to integer further down)
continuous_features = [
    "Area",
    "BHK",
    "Bathroom",
    "Parking",
    "Price",
]


In [58]:
# Handle missing values in the numeric columns.
# NOTE(review): only NaN is imputed here; +/-inf values are not replaced by
# fillna and would make the integer cast below fail -- confirm the data has none.
# NOTE(review): the median is computed on the full dataset before the
# train/validation split, and 'Price' (the target) is imputed as well --
# confirm both are intended.
for feature in continuous_features:
    # Replace missing values with the column median.
    # The result is assigned back instead of calling fillna(inplace=True) on
    # df[feature]: that chained in-place form acts on an intermediate Series,
    # emits a FutureWarning in pandas 2.x and leaves `df` unchanged under
    # Copy-on-Write.
    median = df[feature].median()
    df[feature] = df[feature].fillna(median)
    # Convert to integer (any fractional part is truncated)
    df[feature] = df[feature].astype(int)
    

In [59]:
# Names of the categorical columns that are label-encoded below
categorical_features = [
    "Furnishing",
]


In [60]:
# Label-encode each categorical column, keeping the fitted encoder per
# column in `label_encoders` so the mapping can be looked up afterwards.
label_encoders = {}
for column in categorical_features:
    encoder = LabelEncoder()
    label_encoders[column] = encoder
    # Fit on the column and overwrite it with the integer codes
    df[column] = encoder.fit_transform(df[column])


In [61]:
# Separate the target column ("Price") from the remaining feature columns
target_column = "Price"
y = df[target_column]
X = df.drop(columns=[target_column])


In [62]:
# Split data into training (80%) and validation (20%) sets.
# A fixed seed makes the shuffle deterministic, so re-running the notebook
# writes the same train/validation partition instead of a different one
# on every run.
RANDOM_STATE = 42
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


In [63]:
# Re-attach the target column to its feature frame for each split, giving
# one combined frame for training and one for validation.
train_data, val_data = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val))
)


In [64]:
# Write each split to its own CSV file; the row index is not written
train_csv_path = "datasets/train_dataset.csv"
val_csv_path = "datasets/val_dataset.csv"
train_data.to_csv(train_csv_path, index=False)
val_data.to_csv(val_csv_path, index=False)
