In [14]:
!pip install opendatasets
import opendatasets as od
import pandas as pd
od.download("https://www.kaggle.com/datasets/uciml/autompg-dataset")


Skipping, found downloaded files in "./autompg-dataset" (use force=True to force download)


In [15]:
# Read the CSV from the folder that od.download() created under the current
# working directory, instead of a hard-coded absolute /content path that only
# resolves when the notebook happens to run from /content.
df = pd.read_csv("./autompg-dataset/auto-mpg.csv")
df.head(9)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina


In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer
from sklearn.metrics import mean_squared_error

Data Pre-processing

In [17]:
# Clean the raw frame in one non-mutating chain so the cell can be re-run
# safely (the previous in-place version raised KeyError on a second run
# because 'car name' had already been dropped).
df = (
    df.replace('?', np.nan)                      # '?' marks a missing value in the CSV
      .dropna()                                  # discard rows with any missing value
      .drop(columns='car name', errors='ignore') # Q1.b: exclude the free-text name column
      .apply(pd.to_numeric, errors='coerce')     # remaining columns become numeric dtypes
)

# errors='coerce' silently turns unparseable values into NaN; fail loudly here
# instead of letting NaN reach the model-fitting cells.
if df.isna().any().any():
    raise ValueError("Numeric conversion introduced NaN values; inspect the raw data")

# NOTE: every remaining column is numeric at this point, so pd.get_dummies
# would not create any dummy columns; 'origin' stays a single numeric column.

# Separate predictors and target variable (response)
X = df.drop('mpg', axis=1)  # predictors
y = df['mpg']               # target variable (response)

In [18]:
# Initialize the L2 normalizer
normalizer = Normalizer(norm='l2')

# Normalize the features
X_normalized = normalizer.fit_transform(X)


Training linear regression model:

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=1/3, random_state=42)

# Train the multivariate Linear Regression (LR) model
model = LinearRegression()
model.fit(X_train, y_train)

# Retrieve the coefficient for the "year" attribute (required for Q1.b)
#Assumption that the 'model_year' is the column name for the year attribute in the dataset
year_index = X.columns.get_loc('model year')

#Coefficient for the 'year' attribute
year_coef = model.coef_[year_index]

#Evaluation of the model
# Predict on the test data
y_pred = model.predict(X_test)

#On the test data, calculate the mean squared error (mse)
mse = mean_squared_error(y_test, y_pred)

#Print the coefficient for the "year" attribute and the MSE
print(f'Coefficient for the "year" attribute is {year_coef}')
print(f'Mean Squared Error on the test data obtained is {mse}')

Coefficient for the "year" attribute is 1028.8831105463826
Mean Squared Error on the test data obtained is 11.675987358121692


Q 1 (c): Ridge and Lasso regression with cross-validated alpha, and a comparison of their coefficients

In [25]:
from sklearn.linear_model import RidgeCV, LassoCV

# Candidate regularisation strengths: 13 log-spaced values from 1e-6 to 1e6.
alphavalues = np.logspace(-6, 6, 13)

# Ridge regression with alpha chosen by cross-validation.
# The former store_cv_values=True argument was dropped: the stored values were
# never read, and the parameter is deprecated in scikit-learn 1.5 and removed
# in 1.7, where passing it raises TypeError.
ridge_cv = RidgeCV(alphas=alphavalues)
ridge_cv.fit(X_train, y_train)

# Lasso regression with alpha chosen by 5-fold cross-validation.
lasso_cv = LassoCV(alphas=alphavalues, cv=5, random_state=42)
lasso_cv.fit(X_train, y_train)

# Coefficients of the models refit with the selected alpha.
ridgecoeffs = ridge_cv.coef_
lassocoeffs = lasso_cv.coef_

# Predictor names: the columns of X (the dataset without 'mpg' and 'car name').
feature_names = X.columns

# Side-by-side table of the two coefficient vectors.
coefficientsComparison = pd.DataFrame({
    'Attribute': feature_names,
    'Ridge Coefficients': ridgecoeffs,
    'Lasso Coefficients': lassocoeffs
})

coefficientsComparison


Unnamed: 0,Attribute,Ridge Coefficients,Lasso Coefficients
0,cylinders,-928.305619,-0.0
1,displacement,-111.01245,-62.923862
2,horsepower,-375.19116,-303.843657
3,weight,-1638.845247,-759.126069
4,acceleration,-912.420685,-673.545681
5,model year,1019.498752,992.092226
6,origin,444.682808,0.0


Q 1 (d): Test-set MSE of Linear Regression vs. Ridge vs. Lasso, with the selected alpha values

In [26]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the Ridge model fitted with its cross-validated alpha.
ridge_pred = ridge_cv.predict(X_test)
mse_ridge = mean_squared_error(y_test, ridge_pred)

# Test-set MSE of the Lasso model fitted with its cross-validated alpha.
lasso_pred = lasso_cv.predict(X_test)
mse_lasso = mean_squared_error(y_test, lasso_pred)

# Fit an unregularised linear regression on the same split as the baseline.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, lr_pred)

# Alpha values selected by cross-validation.
ridge_bestalphas = ridge_cv.alpha_
lasso_bestalphas = lasso_cv.alpha_

# Collect the three test MSEs for comparison.
comparison_Of_mse = {
    "Linear Regression MSE": mse_lr,
    "Ridge MSE": mse_ridge,
    "Lasso MSE": mse_lasso
}

# Display the comparison
print(comparison_Of_mse)

# Report the selected alpha for each regularised model.
# (The second message previously said "Ridge" although it prints the Lasso alpha.)
print(f"Using Ridge, the best alpha value is {ridge_bestalphas}")
print(f"Using Lasso, the best alpha value is {lasso_bestalphas}")

# Coefficients of the unregularised baseline.
lrcoeffs = lr_model.coef_

# Add the baseline coefficients as a further column of the comparison table.
coefficientsComparison['Linear Regression Coefficients'] = lrcoeffs

# Show MSEs and coefficients together to see the effect of regularisation.
print(coefficientsComparison)

{'Linear Regression MSE': 11.675987358121692, 'Ridge MSE': 11.667872670005153, 'Lasso MSE': 11.859433549213179}
Using Ridge, the best alpha value is 1e-05
Using Ridge, the best alpha value is 0.0001
      Attribute  Ridge Coefficients  Lasso Coefficients  \
0     cylinders         -928.305619           -0.000000   
1  displacement         -111.012450          -62.923862   
2    horsepower         -375.191160         -303.843657   
3        weight        -1638.845247         -759.126069   
4  acceleration         -912.420685         -673.545681   
5    model year         1019.498752          992.092226   
6        origin          444.682808            0.000000   

   Linear Regression Coefficients  
0                    -1833.927921  
1                     -180.022103  
2                     -452.984607  
3                    -2792.962715  
4                    -1102.508959  
5                     1028.883111  
6                      728.409523  
