# 1. LASSO (Least Absolute Shrinkage and Selection Operator)

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Step 1: Load and preprocess the data
df = pd.read_csv("./HousePrices_Bengaluru.csv")

# Encode the "Location" variable using one-hot encoding
df_encoded = pd.get_dummies(df, columns=['Location'])

# Step 2: Split the data into features and target variable
X = df_encoded.drop('House Price', axis=1)
y = df_encoded['House Price']

# Step 3: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Create the LASSO model
lasso = Lasso(alpha=0.1)  # Specify the desired regularization parameter

# Step 6: Perform feature selection with LASSO
lasso.fit(X_train_scaled, y_train)
selected_features = X_train.columns[lasso.coef_ != 0]

# Step 7: Evaluate the performance of the selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Train a regression model on the selected features
# Here, we will use LinearRegression as an example
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
regression_model.fit(X_train_selected, y_train)

# Evaluate the model's performance on the test data
y_pred = regression_model.predict(X_test_selected)
mse = mean_squared_error(y_test, y_pred)
r_squared = regression_model.score(X_test_selected, y_test)

print("Selected Features:", selected_features)
print("Mean Squared Error:", mse)
print("R-squared:", r_squared)


Selected Features: Index(['Bedrooms', 'Square Footage', 'Age of House', 'Bathrooms',
       'Garage Capacity', 'Overall Condition', 'Location_Rural',
       'Location_Suburban', 'Location_Urban'],
      dtype='object')
Mean Squared Error: 284457452880.0291
R-squared: 0.9989371753803745


In [23]:
df

Unnamed: 0,Bedrooms,Square Footage,Location,Age of House,Bathrooms,Garage Capacity,Overall Condition,House Price
0,5,3116,Rural,44,1,3,4,6.927631e+07
1,1,3773,Suburban,22,3,3,6,8.155192e+07
2,4,3504,Rural,15,2,1,4,7.498880e+07
3,4,2788,Suburban,30,2,1,8,6.150543e+07
4,4,1370,Urban,2,1,1,7,3.069910e+07
...,...,...,...,...,...,...,...,...
1195,2,1650,Suburban,29,4,1,5,3.884493e+07
1196,3,2396,Urban,1,1,3,1,5.293365e+07
1197,5,1063,Urban,47,4,3,6,2.879553e+07
1198,5,1274,Rural,29,4,2,1,3.183113e+07


In [25]:
df_encoded

Unnamed: 0,Bedrooms,Square Footage,Age of House,Bathrooms,Garage Capacity,Overall Condition,House Price,Location_Rural,Location_Suburban,Location_Urban
0,5,3116,44,1,3,4,6.927631e+07,1,0,0
1,1,3773,22,3,3,6,8.155192e+07,0,1,0
2,4,3504,15,2,1,4,7.498880e+07,1,0,0
3,4,2788,30,2,1,8,6.150543e+07,0,1,0
4,4,1370,2,1,1,7,3.069910e+07,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1195,2,1650,29,4,1,5,3.884493e+07,0,1,0
1196,3,2396,1,1,3,1,5.293365e+07,0,0,1
1197,5,1063,47,4,3,6,2.879553e+07,0,0,1
1198,5,1274,29,4,2,1,3.183113e+07,1,0,0


In [26]:
X

Unnamed: 0,Bedrooms,Square Footage,Age of House,Bathrooms,Garage Capacity,Overall Condition,Location_Rural,Location_Suburban,Location_Urban
0,5,3116,44,1,3,4,1,0,0
1,1,3773,22,3,3,6,0,1,0
2,4,3504,15,2,1,4,1,0,0
3,4,2788,30,2,1,8,0,1,0
4,4,1370,2,1,1,7,0,0,1
...,...,...,...,...,...,...,...,...,...
1195,2,1650,29,4,1,5,0,1,0
1196,3,2396,1,1,3,1,0,0,1
1197,5,1063,47,4,3,6,0,0,1
1198,5,1274,29,4,2,1,1,0,0


In [27]:
y

0       6.927631e+07
1       8.155192e+07
2       7.498880e+07
3       6.150543e+07
4       3.069910e+07
            ...     
1195    3.884493e+07
1196    5.293365e+07
1197    2.879553e+07
1198    3.183113e+07
1199    7.163227e+07
Name: House Price, Length: 1200, dtype: float64

In [28]:
X_train_scaled

array([[ 0.68244313,  1.63340458, -1.00868261, ..., -0.74053163,
        -0.70214069,  1.47252057],
       [-0.02582217, -1.63204301,  0.03809093, ..., -0.74053163,
         1.42421598, -0.67910766],
       [-1.44235279, -0.30176131,  1.01507956, ..., -0.74053163,
         1.42421598, -0.67910766],
       ...,
       [-0.02582217, -0.21601144,  0.94529466, ..., -0.74053163,
         1.42421598, -0.67910766],
       [-0.73408748, -0.59029802, -0.24104868, ...,  1.35038121,
        -0.70214069, -0.67910766],
       [-0.02582217, -1.38058732,  0.87550976, ..., -0.74053163,
        -0.70214069,  1.47252057]])

In [29]:
X_test_scaled

array([[-1.44235279, -0.0387178 ,  0.94529466, ..., -0.74053163,
        -0.70214069,  1.47252057],
       [ 0.68244313,  1.43872921, -1.28782222, ...,  1.35038121,
        -0.70214069, -0.67910766],
       [ 0.68244313, -0.39794021,  0.03809093, ...,  1.35038121,
        -0.70214069, -0.67910766],
       ...,
       [-0.73408748, -0.14648453,  0.87550976, ..., -0.74053163,
         1.42421598, -0.67910766],
       [-0.73408748, -1.44779668, -0.93889771, ..., -0.74053163,
         1.42421598, -0.67910766],
       [ 1.39070844,  0.87903752,  0.66615505, ..., -0.74053163,
        -0.70214069,  1.47252057]])

In [30]:
selected_features

Index(['Bedrooms', 'Square Footage', 'Age of House', 'Bathrooms',
       'Garage Capacity', 'Overall Condition', 'Location_Rural',
       'Location_Suburban', 'Location_Urban'],
      dtype='object')

# 2. Ridge Regression

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Step 1: Load and preprocess the data
df = pd.read_csv("./HousePrices_Bengaluru.csv")

# Perform one-hot encoding for the 'Location' variable
df_encoded = pd.get_dummies(df, columns=['Location'])

# Step 2: Split the Data
X = df_encoded.drop('House Price', axis=1)
y = df_encoded['House Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Create and Fit the Ridge Regression Model
alpha = 1.0  # Regularization parameter
ridge_model = Ridge(alpha=alpha)
ridge_model.fit(X_train_scaled, y_train)


# Step 5: Perform feature selection with ridge
selected_features = X_train.columns[ridge_model.coef_ != 0]

# Step 6: Retrieve the Coefficient Estimates
coef_estimates = ridge_model.coef_


# Step 7: Evaluate the Model's Performance
y_pred = ridge_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)


print("Selected Features:", selected_features)
print("Coefficient Estimates:")
for feature, coef in zip(X.columns, coef_estimates):
    print(f"{feature}: {coef}")
print("Mean Squared Error:", mse)


Selected Features: Index(['Bedrooms', 'Square Footage', 'Age of House', 'Bathrooms',
       'Garage Capacity', 'Overall Condition', 'Location_Rural',
       'Location_Suburban', 'Location_Urban'],
      dtype='object')
Coefficient Estimates:
Bedrooms: 153498.41656783584
Square Footage: 17238430.51594144
Age of House: 663054.6600651618
Bathrooms: 109137.3679017924
Garage Capacity: 125727.40749214168
Overall Condition: 70924.3268074453
Location_Rural: -69871.31527922773
Location_Suburban: -32180.1792287167
Location_Urban: 104462.85829632645
Mean Squared Error: 284324612687.67255
R-squared: -9.367155974526572




In [38]:
selected_features

Index(['Bedrooms', 'Square Footage', 'Age of House', 'Bathrooms',
       'Garage Capacity', 'Overall Condition', 'Location_Rural',
       'Location_Suburban', 'Location_Urban'],
      dtype='object')

In [40]:
coef_estimates

array([  153498.41656784, 17238430.51594144,   663054.66006516,
         109137.36790179,   125727.40749214,    70924.32680745,
         -69871.31527923,   -32180.17922872,   104462.85829633])

In [43]:
MSE_Ridge = 284324612687.67255
MSE_Lasso = 284457452880.0291

In [44]:
MSE_Lasso - MSE_Ridge

132840192.35656738

# 3. Elastic Net

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

# Step 1: Load and preprocess the data
df = pd.read_csv("./HousePrices_Bengaluru.csv")

# Perform one-hot encoding for the 'Location' variable
df_encoded = pd.get_dummies(df, columns=['Location'])

# Step 2: Split the Data
# Perform preprocessing
# One-hot encode the 'Location' variable
X = df_encoded.drop('House Price', axis=1)
y = df_encoded['House Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of ElasticNet with desired hyperparameters
elastic_net = ElasticNet(alpha=0.5, l1_ratio=0.5)

# Fit the Elastic Net model to the training data
elastic_net.fit(X_train, y_train)

# Step 6: Retrieve the Coefficient Estimates
coef_estimates = elastic_net.coef_

# Make predictions on the testing data
y_pred = elastic_net.predict(X_test)

# Here, we will use LinearRegression as an example
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
r_squared = regression_model.score(X_test, y_test)

# Optional: Perform feature selection
feature_importances = elastic_net.coef_
important_features = X.columns[feature_importances != 0]
print("Important Features:", important_features)
for feature, coef in zip(X.columns, coef_estimates):
    print(f"{feature}: {coef}")

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r_squared)

Important Features: Index(['Bedrooms', 'Square Footage', 'Age of House', 'Bathrooms',
       'Garage Capacity', 'Overall Condition', 'Location_Rural',
       'Location_Suburban', 'Location_Urban'],
      dtype='object')
Bedrooms: 97524.85167325902
Square Footage: 19996.09373617947
Age of House: 45962.023644170156
Bathrooms: 83819.44172110586
Garage Capacity: 112322.23387473043
Overall Condition: 23154.567480293896
Location_Rural: -85782.42791146031
Location_Suburban: -39444.40767285429
Location_Urban: 125227.82628785842
Mean Squared Error: 301977164074.5871
R-squared: 0.998937175380275


In [52]:
MSE_Ridge = 284324612687.67255
MSE_Lasso = 284457452880.0291
MSE_Eastic = 301977164063.93243

In [53]:
MSE_Ridge - MSE_Eastic

-17652551376.259888

In [54]:
MSE_Lasso - MSE_Eastic

-17519711183.90332