**Import Libraries**

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

**Load Dataset**

In [25]:
file_path = 'Boston-house-price-data.csv'
boston_data = pd.read_csv(file_path)

In [26]:
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


**Missing Values**

In [27]:
missing=boston_data.isnull().sum()
missing

Unnamed: 0,0
CRIM,0
ZN,0
INDUS,0
CHAS,0
NOX,0
RM,0
AGE,0
DIS,0
RAD,0
TAX,0


In [28]:
boston_data = boston_data.drop(columns=['CHAS'], errors='ignore')

boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


**Target Variable**

In [29]:
boston_reg=pd.read_csv('Boston-house-price-data.csv')
boston_reg.dropna(inplace=True)
target = 'MEDV'

**One Hot Coding**

In [30]:
# Import necessary libraries for encoding and scaling
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Identify categorical and numerical columns
categorical_cols = boston_reg.select_dtypes(include=['object', 'category']).columns
numerical_cols = boston_reg.select_dtypes(include=['int64', 'float64']).columns.difference([target])

# Apply One Hot Encoding
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = encoder.fit_transform(boston_reg[categorical_cols])

# Create a DataFrame from the encoded data
encoded_columns = encoder.get_feature_names_out(categorical_cols)
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns, index=boston_reg.index)

# Combine numerical data and encoded categorical data
X = pd.concat([boston_reg[numerical_cols], df_encoded], axis=1)
y = boston_reg[target]  # Replace 'target' with the actual name of your target column

# Scale the features for better performance of regression models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Check the shape of the processed data
print("Shape of X_scaled:", X_scaled.shape)
print("Shape of y:", y.shape)

Shape of X_scaled: (506, 13)
Shape of y: (506,)


**Split data to train and test**

In [31]:
# Split the data into training and testing sets for classification
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

**Linear Regression**

In [32]:
# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_linear = linear_model.predict(X_test)

# Evaluate the Linear Regression model
print("Linear Regression Mean Squared Error:", mean_squared_error(y_test, y_pred_linear))
print("Linear Regression R^2 Score:", r2_score(y_test, y_pred_linear))

Linear Regression Mean Squared Error: 21.517444231177297
Linear Regression R^2 Score: 0.711226005748492


**Random Forest**

In [33]:
# Initialize and train the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest Regressor
print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_rf))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))

Random Forest Regressor Mean Squared Error: 9.552910815789474
Random Forest Regressor R^2 Score: 0.8717955448906479


**Random Forest Regressor**

In [34]:
# Create a RandomForestRegressor with regularization parameters
regressor = RandomForestRegressor(
    n_estimators=100,          # Number of trees
    max_depth=10,              # Limit the depth of each tree
    min_samples_split=5,       # Minimum samples required to split an internal node
    min_samples_leaf=4,        # Minimum samples required to be at a leaf node
    max_features='sqrt',       # Use the square root of the total features at each split
    random_state=42
)

# Fit the model
regressor.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))

Random Forest Regressor Mean Squared Error: 12.997216490812002
Random Forest Regressor R^2 Score: 0.8717955448906479


In [35]:
# Assuming X and y are your features and target variable
# Define the model with regularization parameters
regressor = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=42
)

# Set up k-fold cross-validation (5 folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and calculate MSE for each fold
scores = cross_val_score(
    regressor, X, y, cv=kf, scoring=make_scorer(mean_squared_error)
)

# Calculate the mean and standard deviation of the MSE scores
mean_mse = np.mean(scores)
std_mse = np.std(scores)

print(f"Mean MSE from cross-validation: {mean_mse:.2f}")
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")

Mean MSE from cross-validation: 15.08
R^2 Score: 0.83


**Gradient Boosting Model**

In [36]:
#define the gradient boosting model with different parameters
gbm_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)
gbm_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the Gradient Boosting Regressor
print("Gradient Boosting Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_gbm))
print("Gradient Boosting Regressor R^2 Score:", r2_score(y_test, y_pred_gbm))

Gradient Boosting Regressor Mean Squared Error: 7.567632025395746
Gradient Boosting Regressor R^2 Score: 0.8984388989918813


**Ridge, Lasso, Elastic Regression**

In [37]:
# L2 Regularization: Ridge Regression
ridge = Ridge(alpha=1.0)  # alpha controls the regularization strength; higher means more regularization
ridge.fit(X_train, y_train)
ridge_predictions = ridge.predict(X_test)
print("Ridge MSE:", mean_squared_error(y_test, ridge_predictions))
print("Ridge R^2 Score:", r2_score(y_test, ridge_predictions))

# L1 Regularization: Lasso Regression
lasso = Lasso(alpha=0.1)  # alpha is the regularization parameter
lasso.fit(X_train, y_train)
lasso_predictions = lasso.predict(X_test)
print("Lasso MSE:", mean_squared_error(y_test, lasso_predictions))
print("Lasso R^2 Score:", r2_score(y_test, lasso_predictions))

# L1 + L2 Regularization: Elastic Net
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  # l1_ratio balances between L1 and L2 (0 = pure L2, 1 = pure L1)
elastic_net.fit(X_train, y_train)
elastic_net_predictions = elastic_net.predict(X_test)
print("Elastic Net MSE:", mean_squared_error(y_test, elastic_net_predictions))
print("Elastic Net R^2 Score:", r2_score(y_test, elastic_net_predictions))

Ridge MSE: 21.54850040295822
Ridge R^2 Score: 0.7108092176450823
Lasso MSE: 22.796793202249255
Lasso R^2 Score: 0.6940565543745827
Elastic Net MSE: 22.47915003259627
Elastic Net R^2 Score: 0.698319471748129


Linear Regression - 0.71

Random Forest Regressor - 0.87

Random Forest Regressor with regularization - 0.83

Gradient Boosting Regressor - 0.89

Ridge Regression - 0.71

Lasso Regression - 0.69

Elastic Regression - 0.69

Gradient Boosting Regressor is working good compared to other models.