# Linear Regression on Boston dataset


In [1]:

import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch the Boston housing dataset from the original source
data_url = "http://lib.stat.cmu.edu/datasets/boston"

In [2]:
# prompt: top 10 rows of the data

import pandas as pd
import numpy as np
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Create a Pandas DataFrame for better visualization
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
df = pd.DataFrame(data, columns=feature_names)
df['MEDV'] = target

# Display the top 10 rows
print(df.head(10))

      CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575   65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421   78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185   61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998   45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147   54.2  6.0622  3.0  222.0   
5  0.02985   0.0   2.18   0.0  0.458  6.430   58.7  6.0622  3.0  222.0   
6  0.08829  12.5   7.87   0.0  0.524  6.012   66.6  5.5605  5.0  311.0   
7  0.14455  12.5   7.87   0.0  0.524  6.172   96.1  5.9505  5.0  311.0   
8  0.21124  12.5   7.87   0.0  0.524  5.631  100.0  6.0821  5.0  311.0   
9  0.17004  12.5   7.87   0.0  0.524  6.004   85.9  6.5921  5.0  311.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396

In [3]:
# prompt: how many samples and features does this data have?

print(f"Number of samples: {df.shape[0]}")
print(f"Number of features: {df.shape[1] -1}") # Subtract 1 for the target variable

Number of samples: 506
Number of features: 13


In [4]:
# prompt: Build a linear regression model to predict target variable MEDV

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define features (X) and target (y)
X = df[feature_names]
y = df['MEDV']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Print the model's coefficients and intercept
print(f"\nModel Coefficients:")
for feature, coef in zip(feature_names, model.coef_):
  print(f"{feature}: {coef:.4f}")
print(f"Intercept: {model.intercept_:.4f}")


Model Evaluation:
Mean Squared Error (MSE): 24.29
Root Mean Squared Error (RMSE): 4.93
R-squared (R2): 0.67

Model Coefficients:
CRIM: -0.1131
ZN: 0.0301
INDUS: 0.0404
CHAS: 2.7844
NOX: -17.2026
RM: 4.4388
AGE: -0.0063
DIS: -1.4479
RAD: 0.2624
TAX: -0.0106
PTRATIO: -0.9155
B: 0.0124
LSTAT: -0.5086
Intercept: 30.2468


In [5]:
# prompt: Build a linear regression model to predict target variable MEDV using Keras

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the Keras model
keras_model = Sequential([
    Dense(1, input_shape=(X_train.shape[1],))
])

# Compile the model
keras_model.compile(optimizer='adam', loss='mse')

# Train the model
history = keras_model.fit(X_train, y_train, epochs=100, verbose=0)

# Make predictions on the test set
y_pred_keras = keras_model.predict(X_test).flatten()

# Evaluate the model
mse_keras = mean_squared_error(y_test, y_pred_keras)
rmse_keras = np.sqrt(mse_keras)
r2_keras = r2_score(y_test, y_pred_keras)

print(f"\nKeras Model Evaluation:")
print(f"Mean Squared Error (MSE): {mse_keras:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_keras:.2f}")
print(f"R-squared (R2): {r2_keras:.2f}")

# Print the model's weights and bias (coefficients and intercept)
# The single weight corresponds to the coefficients, and the bias is the intercept
print(f"\nKeras Model Weights and Bias:")
weights, bias = keras_model.layers[0].get_weights()
print(f"Weights (Coefficients): {weights.flatten()}")
print(f"Bias (Intercept): {bias[0]}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

Keras Model Evaluation:
Mean Squared Error (MSE): 563.72
Root Mean Squared Error (RMSE): 23.74
R-squared (R2): -6.69

Keras Model Weights and Bias:
Weights (Coefficients): [-0.78929     0.21063288 -0.40714538  0.28147876 -0.3140664  -0.6321782
 -0.37724945  0.48149294 -0.42810455  0.20073998 -0.37020236 -0.02552201
 -0.56949246]
Bias (Intercept): -0.10965115576982498
