In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 1: Load the Dataset (California Housing Dataset)
# NOTE: despite the legacy `boston` name used throughout this notebook, the data
# loaded here is the California Housing dataset (the Boston Housing loader is no
# longer shipped with scikit-learn). `housing` is the accurate name; `boston` is
# kept as an alias to the same object so cells that still reference
# `boston.feature_names` keep working.
housing = datasets.fetch_california_housing()
boston = housing  # backward-compatible alias for existing cells
# One column per feature, plus the regression target appended as 'target'.
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['target'] = housing.target

# Step 2: Exploratory Data Analysis (EDA)
# Heading and table are emitted by one print call each, separated by a newline.
print("First 5 rows of dataset:", df.head(), sep="\n")

print("\nDataset Summary:", df.describe(), sep="\n")

# Correlation Heatmap
# Pairwise correlations of every column (features and target), drawn on an
# explicitly created axes instead of the implicit pyplot "current" axes.
corr_matrix = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

# Step 3: Splitting the Dataset into Train and Test
# Features are every column except 'target'; the target column is the label.
X, y = df.drop('target', axis='columns'), df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Step 4: Train Decision Tree Regressor
# Build a depth-limited tree (max_depth=5, squared-error criterion) and fit it on
# the training split; `fit` returns the estimator, so the two steps are chained.
dt_regressor = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Step 5: Visualizing the Decision Tree
# Draw the fitted tree on an explicitly created axes, labelling the split
# features with the dataset's feature names.
tree_fig, tree_ax = plt.subplots(figsize=(12, 8))
plot_tree(
    dt_regressor,
    feature_names=boston.feature_names,
    filled=True,
    ax=tree_ax,
)
tree_ax.set_title("Decision Tree Regression Structure")
plt.show()

# Step 6: Model Evaluation
# Predict on the held-out test split and compute each error metric up front,
# then report them in a single run of prints.
y_pred = dt_regressor.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("Accuracy metrics on test data")
print("\nMean Squared Error (MSE):", test_mse)
print("Mean Absolute Error (MAE):", test_mae)
print("R-squared Score (R2):", test_r2)


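### Overfitting & Underfitting

Compare the metrics on the training data with those on the test data: a model that scores much better on the training data than on the test data is overfitting, while a model that scores poorly on both is underfitting.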

In [None]:
# Prediction on test data
# Score the fitted tree on the held-out test split; each (label, metric) pair is
# printed in turn, in the same order as the metrics are listed.
y_test_pred = dt_regressor.predict(X_test)
print("Accuracy metrics on test data")
for label, metric in (
    ("\nMean Squared Error (MSE):", mean_squared_error),
    ("Mean Absolute Error (MAE):", mean_absolute_error),
    ("R-squared Score (R2):", r2_score),
):
    print(label, metric(y_test, y_test_pred))


In [None]:
# Prediction on training data
# Score the fitted tree on the same rows it was trained on; the scores are
# collected into a label -> value mapping and printed in insertion order.
y_train_pred = dt_regressor.predict(X_train)
train_scores = {
    "\nMean Squared Error (MSE):": mean_squared_error(y_train, y_train_pred),
    "Mean Absolute Error (MAE):": mean_absolute_error(y_train, y_train_pred),
    "R-squared Score (R2):": r2_score(y_train, y_train_pred),
}
print("Accuracy metrics on training data")
for label, score in train_scores.items():
    print(label, score)
