In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
data = pd.read_csv('data.csv')

In [3]:
# Display the first few rows of the dataset
data.head()

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income
0,56,Master's,Technology,5,Urban,21,Married,Full-time,7,Own,Apartment,Male,Public transit,72510
1,69,High School,Finance,0,Urban,4,Single,Full-time,7,Own,Apartment,Male,Biking,75462
2,46,Bachelor's,Technology,1,Urban,1,Single,Full-time,7,Own,Single-family home,Female,Car,71748
3,32,High School,Others,2,Urban,32,Married,Full-time,1,Own,Apartment,Female,Car,74520
4,60,Bachelor's,Finance,3,Urban,15,Married,Self-employed,4,Own,Townhouse,Male,Walking,640210


In [4]:
data.shape

(10000, 14)

# Task 1: Data Preprocessing

#### a) Properly clean the dataset, handle any missing values, and remove outliers.


In [5]:
# Check for missing values
data.isnull().sum()

Age                               0
Education_Level                   0
Occupation                        0
Number_of_Dependents              0
Location                          0
Work_Experience                   0
Marital_Status                    0
Employment_Status                 0
Household_Size                    0
Homeownership_Status              0
Type_of_Housing                   0
Gender                            0
Primary_Mode_of_Transportation    0
Income                            0
dtype: int64

In [6]:
# Remove outliers (For simplicity, let's remove income outliers using IQR method)
Q1 = data['Income'].quantile(0.25)
Q3 = data['Income'].quantile(0.75)
IQR = Q3 - Q1
data = data[~((data['Income'] < (Q1 - 1.5 * IQR)) | (data['Income'] > (Q3 + 1.5 * IQR)))]

In [7]:
# Display the dataset after removing outliers
data.describe()

Unnamed: 0,Age,Number_of_Dependents,Work_Experience,Household_Size,Income
count,8069.0,8069.0,8069.0,8069.0,8069.0
mean,43.973727,2.510596,25.075474,3.987483,115736.080555
std,15.240684,1.717042,14.723198,2.004202,128023.122248
min,18.0,0.0,0.0,1.0,31044.0
25%,31.0,1.0,12.0,2.0,67472.0
50%,44.0,3.0,25.0,4.0,71214.0
75%,57.0,4.0,38.0,6.0,74826.0
max,70.0,5.0,50.0,7.0,772239.0


### b) Perform feature scaling or normalization.

In [8]:
# Defining numerical and categorical columns
numerical_features = ['Age', 'Number_of_Dependents', 'Work_Experience', 'Household_Size']
categorical_features = ['Education_Level', 'Occupation', 'Location', 'Marital_Status', 
                        'Employment_Status', 'Homeownership_Status', 'Type_of_Housing', 'Gender',
                        'Primary_Mode_of_Transportation']

In [9]:
# Create transformers for preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

### c

In [10]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

### d

In [12]:
# Apply preprocessing
X = data.drop('Income', axis=1)
y = data['Income']
X_preprocessed = preprocessor.fit_transform(X)

In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Task 2: Model Building with Hyperparameter Tuning

### a) Choose any appropriate regression models to predict the target variable. Justify your choice.


###### Linear Regression, Ridge Regression, Lasso Regression

### b) Implement hyperparameter tuning by conducting a grid search or random search.


In [14]:
# Define hyperparameters for tuning
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}

In [15]:
# Initialize grid searches
ridge_grid_search = GridSearchCV(Ridge(), ridge_params, cv=5, scoring='neg_mean_absolute_error')
lasso_grid_search = GridSearchCV(Lasso(), lasso_params, cv=5, scoring='neg_mean_absolute_error')

In [16]:
# Fit grid searches on training data
ridge_grid_search.fit(X_train, y_train)

In [17]:
lasso_grid_search.fit(X_train, y_train)

In [18]:
# Get the best parameters
best_ridge_params = ridge_grid_search.best_params_
best_lasso_params = lasso_grid_search.best_params_

### c) Build the regression models using the training data. Describe the process and provide code snippets.

In [19]:
# Building models with the best hyperparameters
ridge_best = Ridge(**best_ridge_params)
lasso_best = Lasso(**best_lasso_params)

In [20]:
# Fitting models
ridge_best.fit(X_train, y_train)

In [21]:
lasso_best.fit(X_train, y_train)

In [22]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [23]:
# Display the models
print("Best Ridge Regression Model:", ridge_best)

Best Ridge Regression Model: Ridge(alpha=0.01)


In [24]:
print("Best Lasso Regression Model:", lasso_best)

Best Lasso Regression Model: Lasso(alpha=0.01)


In [25]:
print("Linear Regression Model:", linear_model)

Linear Regression Model: LinearRegression()


# Task 3: Model Evaluation and Selection


### a) Evaluate the performance of the regression models using appropriate metrics.

In [26]:
# Predictions
y_pred_linear = linear_model.predict(X_test)
y_pred_ridge = ridge_best.predict(X_test)
y_pred_lasso = lasso_best.predict(X_test)

In [27]:
# Evaluation metrics
mae_linear = mean_absolute_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

In [28]:
# Display metrics
print(f"Linear Regression - MAE: {mae_linear}, R2: {r2_linear}")

Linear Regression - MAE: 70799.83581164808, R2: 0.05071776706976916


In [29]:
print(f"Ridge Regression - MAE: {mae_ridge}, R2: {r2_ridge}")

Ridge Regression - MAE: 70807.86760020079, R2: 0.05073381637728236


In [30]:
print(f"Lasso Regression - MAE: {mae_lasso}, R2: {r2_lasso}")

Lasso Regression - MAE: 70807.86909592929, R2: 0.050733812142002455


### b) Implement k-fold cross-validation to assess the model's generalization performance.


In [31]:
# k-fold cross-validation
k = 5
linear_cv_scores = cross_val_score(linear_model, X_train, y_train, cv=k, scoring='neg_mean_absolute_error')
ridge_cv_scores = cross_val_score(ridge_best, X_train, y_train, cv=k, scoring='neg_mean_absolute_error')
lasso_cv_scores = cross_val_score(lasso_best, X_train, y_train, cv=k, scoring='neg_mean_absolute_error')

In [32]:
# Display cross-validation scores
print(f"Linear Regression CV Mean MAE: {-linear_cv_scores.mean()}")

Linear Regression CV Mean MAE: 74820.70000327482


In [33]:
print(f"Ridge Regression CV Mean MAE: {-ridge_cv_scores.mean()}")

Ridge Regression CV Mean MAE: 74794.51023374617


In [34]:
print(f"Lasso Regression CV Mean MAE: {-lasso_cv_scores.mean()}")

Lasso Regression CV Mean MAE: 74794.5089607905


### c) Select the best-performing regression model based on hyperparameter tuning and cross-validation results.


In [35]:
# Based on the evaluation and cross-validation results, select the model with the best performance
if -linear_cv_scores.mean() < -ridge_cv_scores.mean() and -linear_cv_scores.mean() < -lasso_cv_scores.mean():
    best_model = linear_model
    model_name = 'Linear Regression'
elif -ridge_cv_scores.mean() < -lasso_cv_scores.mean():
    best_model = ridge_best
    model_name = 'Ridge Regression'
else:
    best_model = lasso_best
    model_name = 'Lasso Regression'

print(f"The best performing model is: {model_name}")

The best performing model is: Lasso Regression
