In [1]:
# Surprise Housing â€“ Ridge and Lasso Regression Assignment
# Part I: Model Building

# 1. Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.filterwarnings('ignore')


# 2. Load the dataset
data =pd.read_csv('https://github.com/tmkotresh/AdvancedRegression-HousePricing/blob/main/train.csv')

# Display basic information
data.head()

# 3. Understanding the problem and the dataset
# The objective is to predict house prices (SalePrice) using available features
# and apply regularization techniques (Ridge and Lasso) to avoid overfitting.

# Shape of the dataset
data.shape

# Check data types and missing values
data.info()

# 4. Data Cleaning
# Checking missing values percentage
missing = data.isnull().mean() * 100
missing.sort_values(ascending=False).head(10)

# Dropping columns with very high missing values (>40%)
high_missing_cols = missing[missing > 40].index
data = data.drop(columns=high_missing_cols)

# Separating numerical and categorical columns
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns

# Imputing numerical columns with median
for col in num_cols:
    data[col].fillna(data[col].median(), inplace=True)

# Imputing categorical columns with mode
for col in cat_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

# 5. Exploratory Data Analysis
# Univariate analysis on target variable
plt.figure()
sns.histplot(data['SalePrice'], kde=True)
plt.show()

# Bivariate analysis between SalePrice and important numerical variables
plt.figure()
sns.scatterplot(x=data['GrLivArea'], y=data['SalePrice'])
plt.show()

# 6. Train-Test Split
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

# One-hot encoding categorical variables
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 7. Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 8. Ridge Regression with Hyperparameter Tuning
ridge = Ridge()
params = {'alpha': np.logspace(-3, 3, 50)}

ridge_cv = GridSearchCV(ridge, params, cv=5, scoring='r2')
ridge_cv.fit(X_train_scaled, y_train)

best_ridge_alpha = ridge_cv.best_params_['alpha']

# Evaluate Ridge model
ridge_best = ridge_cv.best_estimator_
y_pred_ridge = ridge_best.predict(X_test_scaled)

ridge_r2 = r2_score(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))

best_ridge_alpha, ridge_r2, ridge_rmse

# 9. Lasso Regression with Hyperparameter Tuning
lasso = Lasso(max_iter=5000)

lasso_cv = GridSearchCV(lasso, params, cv=5, scoring='r2')
lasso_cv.fit(X_train_scaled, y_train)

best_lasso_alpha = lasso_cv.best_params_['alpha']

# Evaluate Lasso model
lasso_best = lasso_cv.best_estimator_
y_pred_lasso = lasso_best.predict(X_test_scaled)

lasso_r2 = r2_score(y_test, y_pred_lasso)
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))

best_lasso_alpha, lasso_r2, lasso_rmse

# 10. Feature Importance from Lasso
lasso_coef = pd.Series(lasso_best.coef_, index=X.columns)
important_features = lasso_coef[lasso_coef != 0].sort_values(key=abs, ascending=False)

important_features.head(10)

# End of Part I


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'