In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the raw housing records from the CSV file.
file_path = "Bengaluru_House_Data.csv"
house_data = pd.read_csv(file_path)


def _leading_int(raw_size):
    """Return the integer before the first space of a string.

    Non-string values (for example missing entries) are returned unchanged.
    """
    # NOTE(review): presumably the strings look like "<count> <unit>";
    # confirm against the CSV, since int() raises on any other shape.
    if not isinstance(raw_size, str):
        return raw_size
    return int(raw_size.split(' ')[0])


# Replace each textual 'size' entry with its leading integer token.
house_data['size'] = house_data['size'].apply(_leading_int)

# Convert 'total_sqft' to numbers; entries that cannot be parsed become NaN ...
house_data['total_sqft'] = pd.to_numeric(house_data['total_sqft'], errors='coerce')

# ... and every row left without a numeric 'total_sqft' is discarded.
has_numeric_sqft = house_data['total_sqft'].notna()
house_data = house_data[has_numeric_sqft]

# The regression target, and the columns deliberately left out of the features.
target_column = 'price'
unused_columns = ['area_type', 'availability', 'society', 'balcony']

# Features are every remaining column; the target is the 'price' column alone.
inputs = house_data.drop(columns=unused_columns + [target_column])
output = house_data[target_column]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    inputs,
    output,
    random_state=2,
    test_size=0.2,
)

# Columns routed to each preprocessing branch.
numeric_columns = ['size', 'total_sqft', 'bath']
categorical_columns = ['location']

# Numeric columns are standardised; 'location' is one-hot encoded, with the
# encoder told to ignore categories it did not see while fitting.
scaling_step = ('num', StandardScaler(), numeric_columns)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# Chain the preprocessing and a default-configured XGBoost regressor so that
# fit/predict always apply the same transformations.
model = make_pipeline(preprocessor, XGBRegressor())

# Fit the whole pipeline on the training split.
model.fit(X_train, Y_train)

# Run the fitted pipeline over both splits.
training_data_prediction = model.predict(X_train)
test_data_prediction = model.predict(X_test)

# Score each split with the same three metrics, in the order MAE, MSE, R-squared.
metric_functions = (mean_absolute_error, mean_squared_error, r2_score)
mae_train, mse_train, r2_train = [
    score(Y_train, training_data_prediction) for score in metric_functions
]
mae_test, mse_test, r2_test = [
    score(Y_test, test_data_prediction) for score in metric_functions
]

# Report the training scores first, then the test scores; the leading "\n" on
# the first test label separates the two groups with a blank line.
report_lines = (
    ("Mean Absolute Error on Training Data:", mae_train),
    ("Mean Squared Error on Training Data:", mse_train),
    ("R-squared on Training Data:", r2_train),
    ("\nMean Absolute Error on Test Data:", mae_test),
    ("Mean Squared Error on Test Data:", mse_test),
    ("R-squared on Test Data:", r2_test),
)
for label, value in report_lines:
    print(label, value)

# Predictions for user input


def _prompt_number(prompt, convert):
    """Ask with *prompt* until *convert* accepts the reply; return the converted value.

    Only ValueError (an unparseable number) triggers a re-prompt, so a typo
    no longer aborts the run after the model has already been trained.
    """
    while True:
        reply = input(prompt)
        try:
            return convert(reply)
        except ValueError:
            print("Invalid number:", repr(reply), "- please try again.")


# Locations present in the training split, i.e. the ones the encoder can represent.
known_locations = set(X_train['location'].dropna())

# Use the text exactly as typed when it is a known location; otherwise fall
# back to the trimmed text so a stray leading/trailing space does not turn a
# known location into an unknown one.
raw_location = input("Enter the location: ")
user_location = raw_location if raw_location in known_locations else raw_location.strip()

user_size = _prompt_number("Enter the size: ", int)  # size is entered as an integer
user_total_sqft = _prompt_number("Enter the total sqft: ", float)
user_bath = _prompt_number("Enter the number of bathrooms: ", float)

# The pipeline's encoder uses handle_unknown='ignore', so an unseen location is
# silently encoded as all zeros. Tell the user instead of returning a
# location-blind prediction without comment.
if user_location not in known_locations:
    print(
        "Warning: location", repr(user_location),
        "was not seen during training; the prediction ignores location.",
    )

# Create a single-row DataFrame with the feature columns the pipeline selects by name.
user_input = pd.DataFrame({
    'location': [user_location],
    'size': [user_size],
    'total_sqft': [user_total_sqft],
    'bath': [user_bath]
})

# Predict the house price for the user input
user_prediction = model.predict(user_input)
print("Predicted Price for User Input:", user_prediction[0])


Mean Absolute Error on Training Data: 27.10425509652163
Mean Squared Error on Training Data: 2433.5437750832875
R-squared on Training Data: 0.8884735827772904

Mean Absolute Error on Test Data: 32.86631674221688
Mean Squared Error on Test Data: 6026.50070026775
R-squared on Test Data: 0.7563029770380554
Enter the location: Kengeri
Enter the size: 3
Enter the total sqft: 1767
Enter the number of bathrooms: 3
Predicted Price for User Input: 104.304146
