In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("C:/Users/Prasad/Downloads/car_price_regularization.csv")
df.head()


Unnamed: 0,Mileage,Age,EngineSize,Price
0,126958,5,3143,235006.4
1,151867,9,1137,86127.6
2,136932,7,1678,151942.6
3,108694,2,1876,246786.2
4,124879,4,1591,192098.2


In [2]:
print(df.isnull().sum())     # Missing values
print(df.duplicated().sum()) # Duplicates

# Drop duplicates if any
df = df.drop_duplicates()


Mileage       0
Age           0
EngineSize    0
Price         0
dtype: int64
0


In [5]:
# feature engineering 
df['Price_per_km'] = df['Price'] / df['Mileage']
bins = [0, 3, 7, 15]   # age groups: 0–3 (new), 4–7 (mid), 8+ (old)
labels = ['New', 'MidAge', 'Old']
df['Age_Category'] = pd.cut(df['Age'], bins=bins, labels=labels)
df

Unnamed: 0,Mileage,Age,EngineSize,Price,Price_per_km,Age_Category
0,126958,5,3143,235006.4,1.851056,MidAge
1,151867,9,1137,86127.6,0.567125,Old
2,136932,7,1678,151942.6,1.109621,MidAge
3,108694,2,1876,246786.2,2.270468,New
4,124879,4,1591,192098.2,1.538275,MidAge
5,115268,9,3064,223019.4,1.93479,Old
6,59886,12,1563,181106.8,3.024193,Old
7,142337,14,3035,127319.6,0.894494,Old
8,173266,2,1179,112708.8,0.650496,New
9,92498,10,1292,167429.4,1.810087,Old


In [12]:
from sklearn.preprocessing import StandardScaler

X=df[['Mileage','Age','EngineSize']]
y=df['Price']

scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)



In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [15]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import numpy as np 

models={ 
    "linear":LinearRegression(),
    'ridge':Ridge(alpha=1.0),
    'lasso':Lasso(alpha=0.1)
}

for name,model in models.items():
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)

    mae=mean_absolute_error(y_test,y_pred)
    rmse=np.sqrt(mean_squared_error(y_test,y_pred))
    r2=r2_score(y_test,y_pred)

    print(f'{name} Regression')
    print('mae',mae)
    print('rmse',rmse)
    print('r2 score',r2)


linear Regression
mae 10510.337580548412
rmse 12567.705455357232
r2 score 0.9717591291305512
ridge Regression
mae 10318.13464973359
rmse 11810.014789069613
r2 score 0.9750616922271241
lasso Regression
mae 10510.337699109543
rmse 12567.649968334372
r2 score 0.9717593784996017


In [None]:
# hyperparameter tuning 
from sklearn.model_selection import GridSearchCV

ridge=Ridge()
params_ridge={'alpha':[0.01,0.1,1,10,100]}
grid_ridge=GridSearchCV(ridge,params_ridge,cv=5,scoring='r2')
grid_ridge.fit(X_train,y_train)

print('best ridge alpha',grid_ridge.best_params)
print('best ridge r2',grid_ridge.best_score_)

lasso=Lasso(max_iter=5000)
params_lasso={'alpha':[0.01,0.1,1,10,100]}
grid_lasso=GridSearch(lasso,params_lasso,cv=5,scoring='r2')
grid_lasso.fit(X_train,y_train)
print('best lasso alpha',grid_lasso.best_params_)
print('best lasso r2',grid_lasso.best_score_)
