# Boston House Price Prediction - Efficient Implementation

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Apply seaborn's "darkgrid" theme to the figures drawn in the cells below
sns.set_style(style="darkgrid")

In [None]:
# Load the housing data from a CSV file in the current working directory
data = pd.read_csv('BostonHousing.csv')

# data.shape[1] counts every column, including the 'price' target that the
# later cells predict, so report columns and derive the feature count from it
# instead of labelling all columns as features.
n_samples, n_columns = data.shape
print(f"Loaded dataset with {n_samples} samples and {n_columns} columns "
      f"({n_columns - 1} features + target)")

# Preview the first rows (rendered as the cell output in a notebook)
data.head()

In [None]:
# Quick data analysis: column schema, missing values and target summary
print("Dataset Info:")
data.info()

# Count NaNs per column; list only the columns that actually contain some
missing = data.isnull().sum()
if not missing.any():
    print("\nNo missing values found")
else:
    print("\nMissing values:\n", missing[missing > 0])

# Descriptive statistics (count, mean, std, quartiles, ...) of the target
print("\nPrice Statistics:")
price_summary = data['price'].describe()
print(price_summary)

In [None]:
# Key visualizations: target distribution (left) and correlation heatmap (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Price distribution with a kernel-density overlay
sns.histplot(data['price'], kde=True, ax=axes[0])
axes[0].set_title('Price Distribution')

# Correlation heatmap of the features most strongly correlated with price.
# Ranking uses the ABSOLUTE correlation so that strong negative relationships
# are not pushed to the bottom and left out; the signed values are kept for
# display. Only numeric columns are considered so a non-numeric column cannot
# make corr() fail.
corr = data.select_dtypes(include='number').corr()
price_corr = corr['price']
strength_order = price_corr.abs().sort_values(ascending=False).index
most_corr = price_corr.loc[strength_order]
# Drop the target itself explicitly rather than assuming it sorts first
top_features = ['price'] + list(most_corr.drop('price').index[:5])
sns.heatmap(data[top_features].corr(), annot=True, cmap='coolwarm', ax=axes[1])
axes[1].set_title('Correlation Heatmap')

plt.tight_layout()
plt.show()

# Signed correlations, ordered from strongest to weakest relationship
print("Top correlations with price:")
print(most_corr.head(6))

In [None]:
# Model training and evaluation
# Separate target from predictors and hold out 20% of the rows for testing;
# the fixed random_state keeps the split reproducible between runs.
y = data['price']
X = data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out split
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Model Performance:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

# Predicted-vs-actual scatter; the dashed red diagonal marks perfect
# predictions and spans the full range of observed prices.
price_range = [y.min(), y.max()]
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot(price_range, price_range, 'r--')
plt.title('Actual vs Predicted Prices')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.tight_layout()
plt.show()

In [None]:
# Feature importance
# Rank features by the magnitude of their fitted regression coefficient.
# A large negative coefficient influences the prediction as much as a large
# positive one, so the ordering uses absolute values while the signed
# coefficient is kept in the 'Importance' column for display.
# NOTE(review): coefficients are expressed per unit of each feature, so their
# magnitudes depend on feature scale; no scaling step is visible in this file,
# which means this ranking is only a rough guide - confirm before relying on it.
coefs = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.coef_,
})
magnitude_order = coefs['Importance'].abs().sort_values(ascending=False).index
coefs = coefs.loc[magnitude_order]

# Horizontal bars, strongest effect at the top (row order drives bar order)
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=coefs)
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

print("Top 5 most important features:")
print(coefs.head(5))

In [None]:
# Persist the fitted model to disk so it can be reloaded without retraining
model_path = 'boston_model.pkl'
with open(model_path, 'wb') as fh:
    pickle.dump(model, fh)
print("Model saved as 'boston_model.pkl'")