In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

#### Loading the Dataset

In [None]:
# Read the raw house-price records from the CSV file into a DataFrame.
ds = pd.read_csv(filepath_or_buffer='House Price Prediction Dataset.csv')

In [None]:
# Peek at the first two rows to confirm the columns loaded as expected.
ds.head(n=2)

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998


#### Training the Model

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps
# the split reproducible across runs.
train, test = train_test_split(
    ds,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Build the model inputs: one-hot encode the text columns, standardise the
# numeric columns (statistics learned from the training rows only), and keep
# the train and test feature matrices on an identical column layout.
txtcol = ['Location', 'Condition', 'Garage']
numerical_col = ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt']
exclude_col = ['Id', 'Price']  # identifier and target are not features

# ---- Training features ----
train_x = train.drop(columns=exclude_col)

# Location, Condition and Garage hold text, so they are converted to
# indicator columns before fitting. drop_first=True removes the first level
# of each column, which then acts as the implicit baseline category.
train_x_encoded = pd.get_dummies(train_x, columns=txtcol, drop_first=True)

# Fit the scaler on the training rows only; the same fitted scaler is
# reused for the test rows below.
scaler = StandardScaler()
scaler.fit(train_x_encoded[numerical_col])

# Transform the training data
train_scaled = scaler.transform(train_x_encoded[numerical_col])
train_scaled_df = pd.DataFrame(train_scaled, columns=numerical_col, index=train_x_encoded.index)

# Replace the unscaled numeric columns with the scaled ones
# (indicator columns first, numeric columns last).
train_x_final = train_x_encoded.drop(columns=numerical_col)
train_x_final[numerical_col] = train_scaled_df

train_y = train['Price']

# ---- Test features ----
test_x = test.drop(columns=exclude_col)

# Encode the test rows WITHOUT drop_first and then align them to the training
# columns. Running drop_first independently on the test split would drop a
# different level whenever the training baseline level is absent from the
# test rows, silently encoding another category as the baseline.
test_dummies = pd.get_dummies(test_x, columns=txtcol)

# Training columns that have no counterpart in the test split (kept for inspection).
missing_cols = set(train_x_final.columns) - set(test_dummies.columns)

# reindex keeps exactly the training columns, in training order: indicator
# columns missing from the test split are added as 0, and levels the
# training encoding does not know (including the baseline) are discarded.
test_x_encoded = test_dummies.reindex(columns=train_x_final.columns, fill_value=0)

test_scaled = scaler.transform(test_x_encoded[numerical_col])
test_scaled_df = pd.DataFrame(test_scaled, columns=numerical_col, index=test_x_encoded.index)

# Replace unscaled columns with scaled ones
test_x_final = test_x_encoded.drop(columns=numerical_col)
test_x_final[numerical_col] = test_scaled_df

test_y = test['Price']

#### Prediction

In [None]:
# Fit an ordinary least-squares model on the prepared training features.
clf = LinearRegression().fit(train_x_final, train_y)

# Predict prices for the held-out rows.
y_pred = clf.predict(test_x_final)

# Start from the test rows' 'Location' column and attach the actual prices
# (a Series, aligned on the index) and the predictions (an array, assigned
# positionally in test-row order).
location_data = test[['Location']].assign(
    **{
        'Actual Price': test_y,
        'Predicted Price': y_pred,
    }
)

# Mean actual vs. predicted price for each location.
price_columns = ['Actual Price', 'Predicted Price']
average_prices_by_location = location_data.groupby('Location')[price_columns].mean()

# Show the locations ordered from highest to lowest mean predicted price.
ranked_by_prediction = average_prices_by_location.sort_values(
    by='Predicted Price',
    ascending=False,
)
print(ranked_by_prediction.to_markdown(floatfmt=",.2f"))

| Location   |   Actual Price |   Predicted Price |
|:-----------|---------------:|------------------:|
| Suburban   |     591,775.92 |        538,732.60 |
| Downtown   |     532,352.98 |        535,725.24 |
| Rural      |     555,284.10 |        530,654.45 |
| Urban      |     519,342.81 |        515,165.81 |
