In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the California housing dataset into a DataFrame with one column per
# feature plus a 'target' column. (The handle is named for what it actually
# holds: this is the California dataset, not the Boston one.)
housing = datasets.fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df['target'] = housing.target  # Median house value, in units of $100,000

# Display first few rows
print(df.head())

# Check data summary
print("\nSummary Statistics:\n", df.describe())

# Pairplot for feature relationships.
# Only a 200-row subsample is drawn to keep the plot fast; the seed is fixed
# so the figure is identical on every Restart & Run All.
sns.pairplot(df.sample(200, random_state=42), diag_kind="kde")
plt.show()

# Correlation heatmap over all features and the target
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()


In [None]:
# Separate the value being predicted from the predictor columns
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaling statistics are learned from the
# training rows only and then applied unchanged to the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 5-nearest-neighbour regressor using Euclidean distance
knn_regressor = KNeighborsRegressor(metric='euclidean', n_neighbors=5)
knn_regressor.fit(X_train_scaled, y_train)

# Predict the target for the held-out rows
y_pred = knn_regressor.predict(X_test_scaled)


In [None]:
# Compute evaluation metrics for the predictions currently held in `y_pred`
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R² Score): {r2:.2f}")

# Endpoints of the 45-degree reference line. Computed once with the Series'
# vectorised reductions instead of calling Python's built-in min()/max()
# (which iterate element by element) twice each.
actual_min, actual_max = y_test.min(), y_test.max()

# Scatter plot of actual vs predicted; points on the dashed line are perfect predictions
plt.figure(figsize=(7, 5))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue')
plt.plot([actual_min, actual_max], [actual_min, actual_max], '--r')  # 45-degree reference line
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs. Predicted Prices (K-NN Regression)")
# Explicit True: a bare grid() call toggles the current state rather than
# switching the grid on, so the result would depend on the active style.
plt.grid(True)
plt.show()


In [None]:
# Testing different k values: fit one K-NN regressor per k in 1..20 and
# record its R² on the held-out test rows.
# NOTE(review): k is being compared on the test set, so the best score on
# this curve is an optimistic estimate. For real model selection, compare k
# with cross-validation on the training data and keep the test set untouched.
k_values = range(1, 21)
r2_scores = []

for k in k_values:
    knn_reg = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
    knn_reg.fit(X_train_scaled, y_train)
    # Use a loop-local name so the notebook-level `y_pred` (predictions of the
    # k=5 model) is not silently overwritten; otherwise re-running the
    # evaluation cell after this one would report metrics for k=20.
    y_pred_k = knn_reg.predict(X_test_scaled)
    r2_scores.append(r2_score(y_test, y_pred_k))

# Plot k vs R² Score
plt.figure(figsize=(8, 5))
plt.plot(k_values, r2_scores, marker='o', linestyle='dashed', color='blue')
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("R² Score")
plt.title("Finding Optimal k for K-NN Regression")
plt.xticks(k_values)
plt.grid(True)
plt.show()
