In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from autoviz.AutoViz_Class import AutoViz_Class

In [45]:
# Load the housing data; the path is relative to the notebook's working directory.
df = pd.read_csv('housing.csv')

In [46]:
# profile = ProfileReport(df,title='Profiling Report')
# profile.to_file("Report.html")

In [47]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [48]:
# AV = AutoViz_Class().AutoViz(filename='',dfte=df,depVar='median_house_value')
# %matplotlib inline

In [162]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer,TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error,r2_score
from sklearn.preprocessing import OneHotEncoder,FunctionTransformer,StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

In [50]:
def model_scoring(model,X_train,X_test,y_train,y_test):
    """Print and return RMSE and R² of a fitted model on the test and train splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature sets, passed unchanged to ``model.predict``.
    y_train, y_test : true target values for the matching feature set.

    Returns
    -------
    dict
        ``{'test': {'rmse': ..., 'r2': ...}, 'train': {'rmse': ..., 'r2': ...}}``.
        Returning the numbers lets experiments be compared programmatically;
        callers that ignore the return value see the same printed report as before.
    """
    def _score(X, y):
        # One prediction pass per split feeds both metrics.
        y_pred = model.predict(X)
        return {
            'rmse': root_mean_squared_error(y, y_pred),
            'r2': r2_score(y, y_pred),
        }

    scores = {
        'test': _score(X_test, y_test),
        'train': _score(X_train, y_train),
    }

    print(f"Test RMSE: {scores['test']['rmse']:.2f}")
    print(f"Test R² Score: {scores['test']['r2']:.4f}")

    # The leading newline keeps a blank line between the test and train sections.
    print(f"\nTrain RMSE: {scores['train']['rmse']:.2f}")
    print(f"Train R² Score: {scores['train']['r2']:.4f}")

    return scores

#### Linear Regression - 1
##### Imputing with Mean, Standard Scaling & Onehot encoding

In [51]:
def linear_1(data):
    """Baseline linear regression on the raw columns.

    Works on a copy of ``data``. Splits 70/30 with a fixed seed, mean-imputes
    and standardises the numeric columns, one-hot encodes the object columns,
    fits ``LinearRegression`` and prints test/train RMSE and R² through
    ``model_scoring``. ``median_house_value`` is the target.
    """
    frame = data.copy()
    target = 'median_house_value'

    features = frame.drop(columns=target)
    labels = frame[target]
    X_train,X_test,y_train,y_test = train_test_split(
        features,labels,test_size=0.3,random_state=42
    )

    # Column groups come from the full frame; the numeric group excludes the target.
    num_cols = frame.select_dtypes(include=np.number).columns.drop(target)
    cat_cols = frame.select_dtypes(include='object').columns

    num_steps = Pipeline([
        ('imputer',SimpleImputer(strategy='mean')),
        ('scaler',StandardScaler()),
    ])
    cat_steps = Pipeline([
        ('encoder',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')),
    ])

    model = Pipeline([
        ('preprocessor',ColumnTransformer([
            ('num',num_steps,num_cols),
            ('cat',cat_steps,cat_cols),
        ])),
        ('model',LinearRegression()),
    ])
    model.fit(X_train,y_train)

    model_scoring(model,X_train,X_test,y_train,y_test)

In [52]:
# Run the baseline experiment; prints test and train RMSE / R².
linear_1(df)

Test RMSE: 68780.07
Test R² Score: 0.6396

Train RMSE: 68763.97
Train R² Score: 0.6470


#### Linear Regression - 2
##### Imputing with Mean, Standard scaling, Category modification, Onehot encoding & New feature creation (no log transform is applied here — that is tried in Linear Regression - 4)


In [None]:
def linear_2(data):
    """Linear regression with engineered ratio features kept alongside the raw columns.

    Folds the 'ISLAND' category into 'NEAR OCEAN', adds three ratio columns,
    splits 70/30 with a fixed seed, mean-imputes and standardises the numeric
    columns, one-hot encodes the object columns, fits ``LinearRegression`` and
    prints test/train RMSE and R². The caller's frame is not modified.
    """
    target = 'median_house_value'

    # assign() builds a new frame, so ``data`` itself is never mutated.
    frame = data.assign(
        ocean_proximity=lambda d: d['ocean_proximity'].replace('ISLAND','NEAR OCEAN'),
        rooms_per_household=lambda d: d['total_rooms'] / d['households'],
        bedrooms_per_room=lambda d: d['total_bedrooms'] / d['total_rooms'],
        population_per_household=lambda d: d['population'] / d['households'],
    )

    features = frame.drop(columns=target)
    labels = frame[target]
    X_train,X_test,y_train,y_test = train_test_split(
        features,labels,test_size=0.3,random_state=42
    )

    # Numeric group: every numeric column except the target.
    num_cols = frame.select_dtypes(include=np.number).columns.drop(target)
    cat_cols = frame.select_dtypes(include='object').columns

    num_steps = Pipeline([
        ('imputer',SimpleImputer(strategy='mean')),
        ('scaler',StandardScaler()),
    ])
    cat_steps = Pipeline([
        ('encoder',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer([
        ('num',num_steps,num_cols),
        ('cat',cat_steps,cat_cols),
    ])

    model = Pipeline([
        ('preprocessor',preprocessor),
        ('model',LinearRegression()),
    ])
    model.fit(X_train,y_train)

    model_scoring(model,X_train,X_test,y_train,y_test)


In [54]:
# NOTE(review): the cell defining linear_2 is marked `In [None]`, i.e. it has not
# been executed in its current form — re-run both cells before trusting the saved output.
linear_2(df)

Test RMSE: 68016.25
Test R² Score: 0.6475

Train RMSE: 67914.73
Train R² Score: 0.6557


#### Linear Regression - 3
##### Imputing with Mean, Standard scaling, Category modification, Onehot encoding, New feature creation & dropping original columns after creating new features         

In [55]:
def linear_3(data):
    """Linear regression on ratio features with their source columns removed.

    Same preprocessing as the earlier experiments (merge 'ISLAND' into
    'NEAR OCEAN', mean imputation, standard scaling, one-hot encoding), but
    ``total_rooms``, ``total_bedrooms`` and ``population`` are dropped once the
    ratio columns have been derived from them. Prints test/train RMSE and R².
    The caller's frame is not modified.
    """
    frame = data.copy()
    target = 'median_house_value'
    # Target plus the raw columns that the ratio features replace.
    excluded = [target,'total_rooms','total_bedrooms','population']

    frame['ocean_proximity'] = frame['ocean_proximity'].replace('ISLAND','NEAR OCEAN')
    households = frame['households']
    rooms = frame['total_rooms']
    frame['rooms_per_household'] = rooms / households
    frame['bedrooms_per_room'] = frame['total_bedrooms'] / rooms
    frame['population_per_household'] = frame['population'] / households

    features = frame.drop(columns=excluded)
    labels = frame[target]
    X_train,X_test,y_train,y_test = train_test_split(
        features,labels,test_size=0.3,random_state=42
    )

    num_cols = frame.select_dtypes(include=np.number).columns.drop(excluded)
    cat_cols = frame.select_dtypes(include='object').columns

    scale_numeric = Pipeline([
        ('imputer',SimpleImputer(strategy='mean')),
        ('scaler',StandardScaler()),
    ])
    encode_categories = Pipeline([
        ('encoder',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')),
    ])

    model = Pipeline([
        ('preprocessor',ColumnTransformer([
            ('num',scale_numeric,num_cols),
            ('cat',encode_categories,cat_cols),
        ])),
        ('model',LinearRegression()),
    ])
    model.fit(X_train,y_train)

    model_scoring(model,X_train,X_test,y_train,y_test)


In [56]:
# Ratio features with total_rooms / total_bedrooms / population dropped.
linear_3(df)

Test RMSE: 70519.34
Test R² Score: 0.6211

Train RMSE: 70383.91
Train R² Score: 0.6302


#### Linear Regression - 4
##### Imputing with Mean, Standard scaling, Log Transform, Category modification, Onehot encoding, New feature creation & dropping original columns after creating new features    
##### Log transform made it worse, not much of a change with and without dropping the original columns

In [105]:
def linear_4(data):
    """Linear regression on ratio features with a log-transformed target.

    Feature handling matches ``linear_3``: 'ISLAND' is merged into
    'NEAR OCEAN', three ratio columns are added and their source columns
    dropped, numeric columns are mean-imputed and standardised, object columns
    are one-hot encoded. The regressor is wrapped in
    ``TransformedTargetRegressor`` so it is fitted on ``log1p(y)`` and its
    predictions are mapped back with ``expm1``; the printed RMSE / R² are
    therefore on the original target scale. The caller's frame is not modified.
    """
    df = data.copy()

    df['ocean_proximity'] = df['ocean_proximity'].replace('ISLAND','NEAR OCEAN')
    df['rooms_per_household'] = df['total_rooms'] / df['households']
    df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
    df['population_per_household'] = df['population'] / df['households']

    target_and_original = ['median_house_value','total_rooms','total_bedrooms','population']
    target = 'median_house_value'
    X = df.drop(columns=target_and_original)
    y = df[target]

    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

    # Take the column groups from X so they can only name columns the model actually receives.
    numerical_features = X.select_dtypes(include=np.number).columns
    categorical_features = X.select_dtypes(include='object').columns

    numerical_pipeline = Pipeline([
        ('imputer',SimpleImputer(strategy='mean')),
        ('scaler',StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('encoder',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num',numerical_pipeline,numerical_features),
        ('cat',categorical_pipeline,categorical_features)
    ])

    regression_pipeline = Pipeline([
        ('preprocessor',preprocessor),
        ('model',LinearRegression())
    ])

    # Pass the numpy functions directly: TransformedTargetRegressor wraps them
    # itself, so routing them through the bound methods of a separate,
    # never-fitted FunctionTransformer added nothing.
    full_model = TransformedTargetRegressor(
        regressor=regression_pipeline,
        func=np.log1p,
        inverse_func=np.expm1
    )

    full_model.fit(X_train,y_train)

    model_scoring(full_model,X_train,X_test,y_train,y_test)


In [106]:
# Same features as linear_3, but the regressor is fitted on the log1p-transformed target.
linear_4(df)

Test RMSE: 92930.32
Test R² Score: 0.3420

Train RMSE: 93345.42
Train R² Score: 0.3496


#### Regularized Linear Regression

In [108]:
def linear_regularized(data, model_type='ridge', alpha=1.0, l1_ratio=0.5):
    """Fit a regularized linear model on the engineered features and print its metrics.

    Parameters
    ----------
    data : DataFrame with the housing columns; it is copied, not modified.
    model_type : 'ridge', 'lasso' or 'elasticnet'.
    alpha : regularization strength passed to the chosen estimator.
    l1_ratio : passed to ``ElasticNet`` only; has no effect for 'ridge' or 'lasso'.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the three supported names.
    """
    # Resolve the estimator first so a bad model_type fails before any data work.
    if model_type == 'ridge':
        model = Ridge(alpha=alpha)
    elif model_type == 'lasso':
        model = Lasso(alpha=alpha)
    elif model_type == 'elasticnet':
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    else:
        raise ValueError("model_type must be one of 'ridge', 'lasso', or 'elasticnet'")

    df = data.copy()

    # Feature engineering
    df['ocean_proximity'] = df['ocean_proximity'].replace('ISLAND','NEAR OCEAN')
    df['rooms_per_household'] = df['total_rooms'] / df['households']
    df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
    df['population_per_household'] = df['population'] / df['households']

    target_and_original = ['median_house_value','total_rooms','total_bedrooms','population']
    target = 'median_house_value'
    X = df.drop(columns=target_and_original)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Take the column groups from X so they can only name columns the model actually receives.
    numerical_features = X.select_dtypes(include=np.number).columns
    categorical_features = X.select_dtypes(include='object').columns

    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    regression_pipeline = Pipeline([
        ('preprocessor',preprocessor),
        ('model',model)
    ])

    regression_pipeline.fit(X_train,y_train)

    model_scoring(regression_pipeline,X_train,X_test,y_train,y_test)

In [110]:
# Ridge with alpha=1.0; l1_ratio is only read by the 'elasticnet' branch, so it has no effect here.
linear_regularized(df, model_type='ridge', alpha=1.0, l1_ratio=0.5)

Test RMSE: 70519.12
Test R² Score: 0.6211

Train RMSE: 70383.91
Train R² Score: 0.6302


In [111]:
# Lasso with alpha=1.0; l1_ratio is only read by the 'elasticnet' branch, so it has no effect here.
linear_regularized(df, model_type='lasso', alpha=1.0, l1_ratio=0.5)

Test RMSE: 70519.33
Test R² Score: 0.6211

Train RMSE: 70383.91
Train R² Score: 0.6302


#### Regularized Linear Regression - GridSearchCV

In [160]:
def linear_regularized_grid(data, model_type='ridge'):
    """Tune a regularized linear model with 5-fold grid search and print its metrics.

    ``model_type`` selects the estimator and its hyper-parameter grid:
    'ridge' and 'lasso' search over ``alpha``; 'elasticnet' searches over
    ``alpha`` and ``l1_ratio``. The search is scored with negative RMSE on the
    training split; the best parameters, the best CV RMSE and the test/train
    metrics of the refitted best pipeline are printed. Raises ``ValueError``
    for any other ``model_type``. The caller's frame is not modified.
    """
    frame = data.copy()
    target = 'median_house_value'
    # Target plus the raw columns that the ratio features replace.
    excluded = [target,'total_rooms','total_bedrooms','population']

    # Feature engineering
    frame['ocean_proximity'] = frame['ocean_proximity'].replace('ISLAND','NEAR OCEAN')
    frame['rooms_per_household'] = frame['total_rooms'] / frame['households']
    frame['bedrooms_per_room'] = frame['total_bedrooms'] / frame['total_rooms']
    frame['population_per_household'] = frame['population'] / frame['households']

    features = frame.drop(columns=excluded)
    labels = frame[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    num_cols = frame.select_dtypes(include=np.number).columns.drop(excluded)
    cat_cols = frame.select_dtypes(include='object').columns

    preprocessor = ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]), num_cols),
        ('cat', Pipeline([
            ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
        ]), cat_cols),
    ])

    # Estimator class, constructor arguments and search grid for each model type.
    search_space = {
        'ridge': (Ridge, {}, {'model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}),
        'lasso': (Lasso, {'max_iter': 10000}, {'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0]}),
        'elasticnet': (ElasticNet, {'max_iter': 10000}, {
            'model__alpha': [0.01, 0.1, 1.0, 10.0],
            'model__l1_ratio': [0.1, 0.5, 0.9],
        }),
    }
    if model_type not in search_space:
        raise ValueError("model_type must be one of 'ridge', 'lasso', or 'elasticnet'")
    estimator_cls, estimator_kwargs, param_grid = search_space[model_type]

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', estimator_cls(**estimator_kwargs)),
    ])

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error')
    grid_search.fit(X_train, y_train)

    print(f"\nBest Parameters for {model_type}: {grid_search.best_params_}")
    # The scorer is negated RMSE, so flip the sign back for display.
    print(f"Best RMSE (CV): {-grid_search.best_score_:.4f}")

    model_scoring(grid_search.best_estimator_, X_train, X_test, y_train, y_test)


In [161]:
# 5-fold grid search over the Ridge alpha values defined in linear_regularized_grid.
linear_regularized_grid(df,'ridge')


Best Parameters for ridge: {'model__alpha': 10.0}
Best RMSE (CV): 70507.2428
Test RMSE: 70517.44
Test R² Score: 0.6211

Train RMSE: 70384.31
Train R² Score: 0.6302


In [122]:
# NOTE(review): the saved output of this cell includes an array of coefficients that the
# current linear_regularized_grid never prints, so it appears to come from an earlier
# version of the function — re-run to refresh.
linear_regularized_grid(df,'lasso')


Best Parameters for lasso: {'model__alpha': 10.0}
Best RMSE (CV): 70506.5815
[-51513.80613399 -49592.94098594  14471.23076963   9210.09967137
  81175.491992     9257.22359282  19016.52249349  -3883.74692501
 -36346.00828469    171.89416995   8875.58467125]
Test RMSE: 70519.31
Test R² Score: 0.6211

Train RMSE: 70383.95
Train R² Score: 0.6302


In [123]:
# NOTE(review): the saved output of this cell includes an array of coefficients that the
# current linear_regularized_grid never prints, so it appears to come from an earlier
# version of the function — re-run to refresh.
linear_regularized_grid(df,'elasticnet')


Best Parameters for elasticnet: {'model__alpha': 0.01, 'model__l1_ratio': 0.9}
Best RMSE (CV): 70507.2218
[-50390.31449113 -48466.12073846  14490.08523663   9228.73206814
  81120.55332975   9197.98574121  18938.65335981  -3888.66077718
 -36853.20565761    523.39703478   9141.72186679]
Test RMSE: 70516.82
Test R² Score: 0.6211

Train RMSE: 70384.72
Train R² Score: 0.6302
