# Boston Housing Price Analysis

This notebook explores the Boston housing dataset through visualizations, then fits and evaluates a linear regression model that predicts the median house price (`MEDV`) from the remaining columns.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Notebook-wide plotting defaults: seaborn's white-grid theme and a wide default figure
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(12, 6))

## 2. Load Data

In [None]:
# Read the CSV; its first line is skipped, so the following line is parsed as the header row
df = pd.read_csv(filepath_or_buffer='boston_house_prices.csv', skiprows=1)

# Report (rows, columns), then leave the preview as the cell's rendered output
print("Dataset shape:", df.shape)
df.head()

## 3. Data Overview

In [None]:
# Structural overview of the frame. df.info() writes its report straight to
# the cell output instead of returning it, so it needs no print() wrapper.
print("\nDataset Info:")
df.info()

# Descriptive statistics. describe() is deliberately the final expression so
# the notebook renders the resulting table as this cell's output.
print("\nStatistical Summary:")
df.describe()

In [None]:
# Tally null entries per column before any modeling is attempted
null_counts = df.isna().sum()
print("Missing values:")
print(null_counts)

## 4. Data Visualization

### 4.1 Target Variable Distribution

In [None]:
# Two views of the target column (MEDV) side by side:
# a 30-bin histogram on the left and a box plot on the right
price_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

hist_ax.hist(df['MEDV'], bins=30, edgecolor='black', color='skyblue')
hist_ax.set_xlabel('Median House Price (MEDV)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of House Prices')

box_ax.boxplot(df['MEDV'])
box_ax.set_ylabel('Median House Price (MEDV)')
box_ax.set_title('Box Plot of House Prices')

price_fig.tight_layout()
plt.show()

### 4.2 Correlation Heatmap

In [None]:
# Pairwise correlations between all columns of df
correlation_matrix = df.corr()

# Annotated heatmap; center=0 places the midpoint of the diverging palette at zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap of All Features')
heatmap_fig.tight_layout()
plt.show()

### 4.3 Feature Correlations with Target Variable

In [None]:
# Correlation of every column with MEDV, ordered from highest to lowest
correlations = df.corr()['MEDV'].sort_values(ascending=False)
print("Correlations with House Price (MEDV):")
print(correlations)

# Bar chart of everything after the first entry; with a descending sort the
# first entry is expected to be MEDV's correlation with itself
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
correlations.iloc[1:].plot.barh(color='teal', ax=corr_ax)
corr_ax.set_xlabel('Correlation Coefficient')
corr_ax.set_title('Feature Correlations with House Price')
corr_fig.tight_layout()
plt.show()

### 4.4 Key Feature Relationships

In [None]:
# Scatter plots of four features against MEDV, each with a least-squares trend line
key_features = ['RM', 'LSTAT', 'PTRATIO', 'DIS']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()  # flatten the 2x2 grid so panels can be addressed by a single index

for idx, feature in enumerate(key_features):
    ax = axes[idx]
    ax.scatter(df[feature], df['MEDV'], alpha=0.5, color='navy')
    ax.set_xlabel(feature)
    ax.set_ylabel('MEDV (House Price)')
    ax.set_title(f'{feature} vs House Price')

    # Degree-1 polynomial (straight line) fit of MEDV on this feature
    z = np.polyfit(df[feature], df['MEDV'], 1)
    p = np.poly1d(z)

    # Draw the fitted line between the feature's extremes only. Passing the
    # unsorted column to plot() makes the path run back and forth along the
    # same line, so the dashes overlap and "r--" renders as a ragged solid stroke.
    x_span = np.array([df[feature].min(), df[feature].max()])
    ax.plot(x_span, p(x_span), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

### 4.5 Pairplot of Selected Features

In [None]:
# Pairwise scatter plots for three features plus the target, with KDE curves on the diagonal
selected_features = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
pairplot_data = df[selected_features]
sns.pairplot(pairplot_data, diag_kind='kde', plot_kws=dict(alpha=0.6))

# y slightly above 1 places the title over the top row of panels
plt.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

## 5. Linear Regression Analysis

### 5.1 Prepare Data

In [None]:
# MEDV is the target; every other column of df is used as a predictor
y = df['MEDV']
X = df.drop(columns='MEDV')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

### 5.2 Train Linear Regression Model

In [None]:
# Fit a linear regression on the training split; fit() hands back the estimator itself
model = LinearRegression().fit(X_train, y_train)

print("Model trained successfully!")

### 5.3 Model Coefficients

In [None]:
# One row per feature, ordered by absolute coefficient size (largest first).
# The features are used unscaled here, so magnitudes reflect each column's units.
coef_df = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': model.coef_}
).sort_values(by='Coefficient', key=lambda column: column.abs(), ascending=False)

print("\nModel Intercept:", model.intercept_)
print("\nFeature Coefficients:")
print(coef_df)

# Horizontal bars, one per feature, in the same order as coef_df
coef_fig, coef_ax = plt.subplots(figsize=(10, 6))
coef_ax.barh(coef_df['Feature'], coef_df['Coefficient'], color='steelblue')
coef_ax.set_xlabel('Coefficient Value')
coef_ax.set_title('Linear Regression Coefficients')
coef_fig.tight_layout()
plt.show()

### 5.4 Make Predictions

In [None]:
# Run the fitted model over both splits: training first, then test
y_train_pred, y_test_pred = (model.predict(split) for split in (X_train, X_test))

### 5.5 Model Evaluation

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return (MSE, RMSE, MAE, R²) for one set of actual and predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mse,
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Same four metrics for each split, kept under the names later cells read
train_mse, train_rmse, train_mae, train_r2 = _regression_metrics(y_train, y_train_pred)
test_mse, test_rmse, test_mae, test_r2 = _regression_metrics(y_test, y_test_pred)

# Text report: banner, then one section per split
banner = "=" * 50
print(banner)
print("MODEL PERFORMANCE")
print(banner)

report_sections = (
    ("\nTraining Set:", train_r2, train_rmse, train_mae, train_mse),
    ("\nTest Set:", test_r2, test_rmse, test_mae, test_mse),
)
for heading, split_r2, split_rmse, split_mae, split_mse in report_sections:
    print(heading)
    print(f"  R² Score: {split_r2:.4f}")
    print(f"  RMSE: {split_rmse:.4f}")
    print(f"  MAE: {split_mae:.4f}")
    print(f"  MSE: {split_mse:.4f}")
print(banner)

### 5.6 Prediction vs Actual Plots

In [None]:
# Actual vs predicted price, one panel per split; the dashed diagonal marks
# where predicted equals actual
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

panels = (
    ('Training', y_train, y_train_pred, train_r2, 'blue'),
    ('Test', y_test, y_test_pred, test_r2, 'green'),
)
for panel_ax, (split_name, actual, predicted, split_r2, dot_color) in zip(axes, panels):
    panel_ax.scatter(actual, predicted, alpha=0.5, color=dot_color)

    # Reference line spanning the range of the actual values
    lowest, highest = actual.min(), actual.max()
    panel_ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

    panel_ax.set_xlabel('Actual Price')
    panel_ax.set_ylabel('Predicted Price')
    panel_ax.set_title(f'{split_name} Set: Actual vs Predicted\nR² = {split_r2:.4f}')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 5.7 Residual Analysis

In [None]:
# Residual = actual minus predicted, per split
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

# 2x2 grid: top row plots residuals against predictions, bottom row shows
# their distribution; left column is training, right column is test
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

split_columns = (
    ('Training', y_train_pred, train_residuals, 'blue', 'skyblue'),
    ('Test', y_test_pred, test_residuals, 'green', 'lightgreen'),
)
for col, (split_name, predicted, residuals, dot_color, bar_color) in enumerate(split_columns):
    scatter_ax = axes[0, col]
    scatter_ax.scatter(predicted, residuals, alpha=0.5, color=dot_color)
    scatter_ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference
    scatter_ax.set_xlabel('Predicted Price')
    scatter_ax.set_ylabel('Residuals')
    scatter_ax.set_title(f'{split_name} Set: Residual Plot')
    scatter_ax.grid(True, alpha=0.3)

    hist_ax = axes[1, col]
    hist_ax.hist(residuals, bins=30, edgecolor='black', color=bar_color)
    hist_ax.set_xlabel('Residuals')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'{split_name} Set: Residual Distribution')

plt.tight_layout()
plt.show()

## 6. Summary and Insights

In [None]:
# Final text summary: dataset size, strongest correlations with MEDV, and
# headline model metrics computed in the earlier cells.

# Rank the features by correlation with the target. MEDV's own entry is
# removed by label rather than by position, each list is filtered by sign so
# the headings stay truthful, and each is sorted strongest-first so that
# rank 1 is always the strongest relationship.
feature_corr = correlations.drop('MEDV')
top_positive = feature_corr[feature_corr > 0].sort_values(ascending=False).head(3)
top_negative = feature_corr[feature_corr < 0].sort_values().head(3)

rule = "=" * 70
print(rule)
print("BOSTON HOUSING ANALYSIS SUMMARY")
print(rule)
print("\n1. Dataset:")
print(f"   - Total samples: {len(df)}")
print(f"   - Features: {len(X.columns)}")
print("   - Target variable: MEDV (Median house price)")

print("\n2. Top 3 Positive Correlations with Price:")
for i, (feature, corr) in enumerate(top_positive.items(), 1):
    print(f"   {i}. {feature}: {corr:.4f}")

print("\n3. Top 3 Negative Correlations with Price:")
for i, (feature, corr) in enumerate(top_negative.items(), 1):
    print(f"   {i}. {feature}: {corr:.4f}")

print("\n4. Model Performance:")
print(f"   - Training R² Score: {train_r2:.4f}")
print(f"   - Test R² Score: {test_r2:.4f}")
print(f"   - Test RMSE: {test_rmse:.4f}")
print(f"   - Test MAE: {test_mae:.4f}")

print("\n5. Key Insights:")
print(f"   - The model explains approximately {test_r2 * 100:.1f}% of the variance in house prices")
# NOTE(review): the "$...k" format presumes MEDV is expressed in thousands of dollars - confirm
print(f"   - Average prediction error (MAE): ${test_mae:.2f}k")
# A train/test R² gap under 0.05 is reported as "good" generalization
print("   - The model shows", "good" if abs(train_r2 - test_r2) < 0.05 else "some", "generalization")
print(rule)