In [1]:
# Part A, Task 1, a, Properly clean the dataset, handle any missing values, and remove outliers.

import pandas as pd

# Load the dataset
file_path = 'car.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
df.head()


Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [3]:
df.describe(include='all')

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
count,6019,6019,6019.0,6019.0,6019,6019,6019,6017,5983,5983,5977.0,824,6019.0
unique,1876,11,,,5,2,4,442,146,372,,540,
top,Mahindra XUV500 W8 2WD,Mumbai,,,Diesel,Manual,First,18.9 kmpl,1197 CC,74 bhp,,95.13 Lakh,
freq,49,790,,,3205,4299,4929,172,606,235,,6,
mean,,,2013.358199,58738.38,,,,,,,5.278735,,9.479468
std,,,3.269742,91268.84,,,,,,,0.80884,,11.187917
min,,,1998.0,171.0,,,,,,,0.0,,0.44
25%,,,2011.0,34000.0,,,,,,,5.0,,3.5
50%,,,2014.0,53000.0,,,,,,,5.0,,5.64
75%,,,2016.0,73000.0,,,,,,,5.0,,9.95


In [4]:
import numpy as np

# Handling wrong data type

# Cleaning Mileage, Engine, and Power columns to retain only numeric values
df['Mileage'] = df['Mileage'].str.extract('(\d+\.\d+|\d+)').astype(float)
df['Engine'] = df['Engine'].str.extract('(\d+\.\d+|\d+)').astype(float)
df['Power'] = df['Power'].str.extract('(\d+\.\d+|\d+)').astype(float)

  df['Mileage'] = df['Mileage'].str.extract('(\d+\.\d+|\d+)').astype(float)
  df['Engine'] = df['Engine'].str.extract('(\d+\.\d+|\d+)').astype(float)
  df['Power'] = df['Power'].str.extract('(\d+\.\d+|\d+)').astype(float)


In [5]:
# Converting Seats to integer type
df['Seats'] = df['Seats'].fillna(0).astype(int)

In [6]:
# Handling missing values

# For Mileage, Engine, Power - Imputing missing values with the median
df['Mileage'].fillna(df['Mileage'].median(), inplace=True)
df['Engine'].fillna(df['Engine'].median(), inplace=True)
df['Power'].fillna(df['Power'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Mileage'].fillna(df['Mileage'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Engine'].fillna(df['Engine'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [7]:
# Seats - Imputing missing values with the mode
df['Seats'].replace(0, np.nan, inplace=True)  # replace previously filled 0s with NaN
df['Seats'].fillna(df['Seats'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Seats'].replace(0, np.nan, inplace=True)  # replace previously filled 0s with NaN


In [8]:
# Dropping the New_Price column as it has too many missing values
df.drop(columns=['New_Price'], inplace=True)

In [9]:
# Identifying and removing outliers
# Using IQR method for outliers detection
Q1 = df[['Kilometers_Driven', 'Price']].quantile(0.25)
Q3 = df[['Kilometers_Driven', 'Price']].quantile(0.75)
IQR = Q3 - Q1

# Filtering out the outliers
df_cleaned = df[~((df[['Kilometers_Driven', 'Price']] < (Q1 - 1.5 * IQR)) |(df[['Kilometers_Driven', 'Price']] > (Q3 + 1.5 * IQR))).any(axis=1)]

In [10]:
# Final overview of the cleaned data
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5101 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               5101 non-null   object 
 1   Location           5101 non-null   object 
 2   Year               5101 non-null   int64  
 3   Kilometers_Driven  5101 non-null   int64  
 4   Fuel_Type          5101 non-null   object 
 5   Transmission       5101 non-null   object 
 6   Owner_Type         5101 non-null   object 
 7   Mileage            5101 non-null   float64
 8   Engine             5101 non-null   float64
 9   Power              5101 non-null   float64
 10  Seats              5101 non-null   float64
 11  Price              5101 non-null   float64
dtypes: float64(5), int64(2), object(5)
memory usage: 518.1+ KB


In [11]:
df_cleaned.describe(include='all')

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
count,5101,5101,5101.0,5101.0,5101,5101,5101,5101.0,5101.0,5101.0,5101.0,5101.0
unique,1613,11,,,5,2,4,,,,,
top,Mahindra XUV500 W8 2WD,Mumbai,,,Petrol,Manual,First,,,,,
freq,48,696,,,2628,4085,4168,,,,,
mean,,,2013.233288,55090.53911,,,,18.690759,1489.366203,100.046283,5.251519,6.114303
std,,,3.249386,27113.462795,,,,4.426534,480.01791,37.801912,0.738613,3.993148
min,,,1998.0,171.0,,,,0.0,72.0,34.2,2.0,0.44
25%,,,2011.0,35000.0,,,,16.02,1197.0,74.0,5.0,3.25
50%,,,2014.0,54000.0,,,,18.7,1396.0,88.73,5.0,5.07
75%,,,2016.0,72000.0,,,,21.56,1598.0,118.0,5.0,7.67


In [12]:
# Part A, Task 1, b: Perform Feature Scaling or Normalization

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Selecting the numeric columns to scale
numeric_columns = ['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Price']

# Creating a copy of the cleaned dataframe to apply scaling
df_scaled = df_cleaned.copy()

# Normalization (Min-Max Scaling)
min_max_scaler = MinMaxScaler()
df_scaled[numeric_columns] = min_max_scaler.fit_transform(df_scaled[numeric_columns])

# Standardization (Z-score Scaling):
# scaler = StandardScaler()
# df_scaled[numeric_columns] = scaler.fit_transform(df_scaled[numeric_columns])

# Displaying the first few rows of the scaled data
print("Normalized Data (using Min-Max Scaling):")
print(df_scaled.head())


Normalized Data (using Min-Max Scaling):
                               Name    Location      Year  Kilometers_Driven  \
0            Maruti Wagon R LXI CNG      Mumbai  0.571429           0.549030   
1  Hyundai Creta 1.6 CRDi SX Option        Pune  0.809524           0.312079   
2                      Honda Jazz V     Chennai  0.619048           0.350297   
3                 Maruti Ertiga VDI     Chennai  0.666667           0.663683   
4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  0.714286           0.309557   

  Fuel_Type Transmission Owner_Type   Mileage    Engine     Power  Seats  \
0       CNG       Manual      First  0.793083  0.195606  0.051438  0.375   
1    Diesel       Manual      First  0.586464  0.318969  0.197510  0.375   
2    Petrol       Manual      First  0.542636  0.238065  0.117003  0.375   
3    Diesel       Manual      First  0.619261  0.248416  0.117132  0.625   
4    Diesel    Automatic     Second  0.453190  0.400507  0.228854  0.375   

      Price  
0  0.06

In [13]:
# Part A, Task 1, c, Appropriately encode categorical variables.

# Importing necessary libraries for encoding
from sklearn.preprocessing import OneHotEncoder

# Selecting the categorical columns to encode
categorical_columns = ['Name', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type']

# Applying One-Hot Encoding to the categorical columns
df_encoded = pd.get_dummies(df_scaled, columns=categorical_columns, drop_first=True)

# Displaying the first few rows of the encoded dataset
df_encoded.head()


Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Name_Audi A3 35 TDI Attraction,Name_Audi A3 35 TDI Premium,Name_Audi A3 35 TDI Premium Plus,...,Location_Mumbai,Location_Pune,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Manual,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third
0,0.571429,0.54903,0.793083,0.195606,0.051438,0.375,0.068372,False,False,False,...,True,False,False,False,False,False,True,False,False,False
1,0.809524,0.312079,0.586464,0.318969,0.19751,0.375,0.629436,False,False,False,...,False,True,True,False,False,False,True,False,False,False
2,0.619048,0.350297,0.542636,0.238065,0.117003,0.375,0.2119,False,False,False,...,False,False,False,False,False,True,True,False,False,False
3,0.666667,0.663683,0.619261,0.248416,0.117132,0.625,0.290188,False,False,False,...,False,False,True,False,False,False,True,False,False,False
4,0.714286,0.309557,0.45319,0.400507,0.228854,0.375,0.902923,False,False,False,...,False,False,True,False,False,False,False,False,True,False


In [14]:
#Part A, Task 1, d, Split the dataset into training and testing sets.

# Importing the necessary library for splitting the dataset
from sklearn.model_selection import train_test_split

# Defining the target variable (Price) and the features (all other columns)
X = df_encoded.drop(columns=['Price'])
y = df_encoded['Price']

# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Displaying the shapes of the resulting datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((4080, 1636), (1021, 1636), (4080,), (1021,))

In [15]:
# Part A, Task 2, a, Choose any appropriate regression models (e.g., Linear Regression, Ridge Regression, Lasso Regression) to predict the target variable. Justify your choice.

# Importing necessary libraries for regression models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Initializing the models
linear_reg = LinearRegression()
ridge_reg = Ridge()

# Training the Linear Regression model
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)

# Training the Ridge Regression model
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_test)

# Calculating performance metrics for both models
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

(mse_linear, r2_linear), (mse_ridge, r2_ridge)


((8.40164467186857e+21, -1.8596637917874005e+23),
 (0.006959384863361868, 0.845957350626339))

In [16]:
# Part A, Task 2, b, Implement hyperparameter tuning by conducting a grid search or random search to optimize model parameters. Clearly outline the hyperparameters you tuned and the rationale behind them. (8 Marks)

from sklearn.model_selection import GridSearchCV

# Defining the parameter grid for Ridge regression
param_grid = {'alpha': [0.1, 1, 10, 100, 1000]}

# Initializing the Ridge regression model
ridge_reg = Ridge()

# Performing Grid Search with cross-validation
grid_search = GridSearchCV(ridge_reg, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Getting the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


({'alpha': 0.1}, 0.8421686094952211)

In [17]:
# Part A, Task 2, c, Build the regression models using the training data. Describe the process and provide code snippets.

# Import necessary libraries
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Initializing the Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)  # Train the model on the training data

# Initializing the Ridge Regression model with the best alpha value from Grid Search
ridge_reg = Ridge(alpha=0.1)
ridge_reg.fit(X_train, y_train)  # Train the model on the training data

# Predictions using the test set
y_pred_linear = linear_reg.predict(X_test)
y_pred_ridge = ridge_reg.predict(X_test)

# Evaluating the performance of the models
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Linear Regression - MSE: {mse_linear}, R²: {r2_linear}")
print(f"Ridge Regression - MSE: {mse_ridge}, R²: {r2_ridge}")


Linear Regression - MSE: 8.40164467186857e+21, R²: -1.8596637917874005e+23
Ridge Regression - MSE: 0.006774004679480238, R²: 0.8500606521719742


In [18]:
# Part A, Task 3, a, Evaluate the performance of the regression models using appropriate metrics (e.g., Mean Absolute Error, R-squared). Calculate and interpret these metrics.

# Importing necessary metrics
from sklearn.metrics import mean_absolute_error

# Calculating MAE for both models
mae_linear = mean_absolute_error(y_test, y_pred_linear)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)

# Displaying the performance metrics for both models
{
    'Linear Regression': {
        'MSE': mse_linear,
        'R²': r2_linear,
        'MAE': mae_linear
    },
    'Ridge Regression': {
        'MSE': mse_ridge,
        'R²': r2_ridge,
        'MAE': mae_ridge
    }
}


{'Linear Regression': {'MSE': 8.40164467186857e+21,
  'R²': -1.8596637917874005e+23,
  'MAE': 18406905344.060722},
 'Ridge Regression': {'MSE': 0.006774004679480238,
  'R²': 0.8500606521719742,
  'MAE': 0.04911413031387376}}

In [21]:
# Part A, Task 3, b, Implement k-fold cross-validation (e.g., 5-fold or 10-fold) to assess the model's generalization performance using multiple splits.

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge

# Applying 5-fold cross-validation on both Linear Regression and Ridge Regression models

# Perform 5-fold cross-validation on Linear Regression model
cv_scores_linear = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')

# Perform 5-fold cross-validation on Ridge Regression model (alpha=0.1)
cv_scores_ridge = cross_val_score(Ridge(alpha=0.1), X, y, cv=5, scoring='r2')

# Calculating mean and std for both models
cv_mean_linear = cv_scores_linear.mean()
cv_std_linear = cv_scores_linear.std()

cv_mean_ridge = cv_scores_ridge.mean()
cv_std_ridge = cv_scores_ridge.std()

# Displaying results
print(f"Linear Regression: Mean R² = {cv_mean_linear}, Std R² = {cv_std_linear}")
print(f"Ridge Regression: Mean R² = {cv_mean_ridge}, Std R² = {cv_std_ridge}")


Linear Regression: Mean R² = -3.924449581586473e+23, Std R² = 1.886228712198165e+23
Ridge Regression: Mean R² = 0.8504307065924218, Std R² = 0.012266929590092238


In [None]:
# Part B, Task 1, a, Clean the dataset by handling missing values and removing outliers as needed.

