In [34]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings 
# NOTE(review): called without a category or module filter, this suppresses
# every warning raised from here on, so any deprecation or convergence notices
# emitted by later cells will not be displayed.
warnings.filterwarnings('ignore')

In [35]:
# Load the car listings from a CSV file located relative to the working
# directory, then preview the first five rows.
data = pd.read_csv(filepath_or_buffer='car_price_prediction_updated.csv')
data.head(n=5)

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age
0,13328,1399.0,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,Left wheel,Silver,12,15
1,16621,1018.0,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,14
2,8467,906.838128,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,Right-hand drive,Black,2,19
3,3607,862.0,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,Left wheel,White,0,14
4,11726,446.0,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,Left wheel,Silver,4,11


In [36]:
# Print each column's dtype and non-null count, plus the overall frame size.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             19237 non-null  int64  
 1   Levy              19237 non-null  float64
 2   Manufacturer      19237 non-null  object 
 3   Model             19237 non-null  object 
 4   Category          19237 non-null  object 
 5   Leather interior  19237 non-null  object 
 6   Fuel type         19237 non-null  object 
 7   Engine volume     19237 non-null  object 
 8   Mileage           19237 non-null  int64  
 9   Cylinders         19237 non-null  float64
 10  Gear box type     19237 non-null  object 
 11  Drive wheels      19237 non-null  object 
 12  Wheel             19237 non-null  object 
 13  Color             19237 non-null  object 
 14  Airbags           19237 non-null  int64  
 15  Age               19237 non-null  int64  
dtypes: float64(2), int64(4), object(10)
memo

In [37]:
# Target: the listing price on its original scale.
y = data['Price']

# Features: every column except the target and 'Engine volume'.
X = data.drop(columns=['Price', 'Engine volume'])

In [38]:

# Labels of the columns stored as 64-bit integers or floats.
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Sample skewness of each numeric column, largest value first.
skewness = (
    data.loc[:, numeric_cols]
    .skew()
    .sort_values(ascending=False)
)
skewness

Price        136.470427
Mileage       38.896821
Levy           6.525337
Cylinders      2.091083
Age            2.082261
Airbags        0.082012
dtype: float64

In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Column groups: each list names the features handled by one preprocessing
# pipeline.

# Categorical columns that get one-hot encoded.
one_hot = [
    'Category',
    'Fuel type',
    'Gear box type',
    'Drive wheels',
    'Color',
]

# Numeric columns sent through the power transform.
power_transform = [
    'Levy',
    'Mileage',
    'Cylinders',
    'Age',
]

# Numeric columns that are only standardised.
standard_transform = ['Airbags']

# Two-level column encoded with an explicit category order.
ordinal_tranform = ['Wheel']

# Yes/No column encoded as 1/0.
binary_tranform = ['Leather interior']

# Categorical columns encoded by value frequency instead of one-hot.
high_cardinal = [
    'Manufacturer',
    'Model',
]



# Every pipeline below imputes first, so the step after it never sees missing
# values: the median is used for numeric columns, the most frequent value for
# categorical ones.

# Numeric, scale only: standardise to zero mean and unit variance.
standard_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Numeric, skewed: apply a Yeo-Johnson power transform.
power_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
])

# Categorical: dense one-hot encoding with the first level of each column
# dropped; categories unseen during fit are ignored rather than raising.
one_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

# Binary flag: 'No' -> 0, 'Yes' -> 1.
binary_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[['No', 'Yes']], dtype=int)),
])

# Steering side: 'Left wheel' -> 0, 'Right-hand drive' -> 1.
ordinal_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[['Left wheel', 'Right-hand drive']], dtype=int)),
])


In [40]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode each categorical value as its relative frequency seen in ``fit``.

    Each input column is encoded independently. Values that were not present
    when the encoder was fitted (missing values included) are encoded as 0.

    Attributes
    ----------
    freq_maps : dict
        Maps every column label seen in ``fit`` to a ``pandas.Series`` of
        normalised value counts for that column. Rebuilt on each ``fit`` call.
    """

    def __init__(self):
        # Starts empty; ``fit`` replaces it with a freshly built mapping.
        self.freq_maps = {}

    def fit(self, X, y=None):
        """Learn the relative frequency of every value in every column of ``X``.

        Parameters
        ----------
        X : array-like or DataFrame
            Two-dimensional input; it is wrapped in a DataFrame, so array
            input gets positional integer column labels.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        X = pd.DataFrame(X)
        # Build a new dict rather than updating the old one, so refitting on
        # data with different columns leaves no stale entries behind.
        self.freq_maps = {
            col: X[col].value_counts(normalize=True) for col in X.columns
        }
        return self

    def transform(self, X):
        """Replace every value in ``X`` with the frequency learned in ``fit``.

        Parameters
        ----------
        X : array-like or DataFrame
            Input with the same column labels that were seen in ``fit``.

        Returns
        -------
        numpy.ndarray
            Array with the same shape as ``X`` holding the learned frequencies,
            with 0 for values that have no learned frequency.

        Raises
        ------
        KeyError
            If ``X`` has a column label that was not seen in ``fit``.
        """
        # pd.DataFrame(X) may share data with the caller's object; copy before
        # assigning columns so the input is never modified.
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            X[col] = X[col].map(self.freq_maps[col]).fillna(0)
        return X.values


# Fill missing labels with the most frequent one, then swap each label for
# its frequency as learned by FrequencyEncoder.
high_card_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('freq', FrequencyEncoder()),
])

In [41]:
# Send each column group through its own pipeline. Columns that appear in no
# group are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('standard', standard_pipe, standard_transform),
        ('power', power_pipe, power_transform),
        ('onehot', one_pipe, one_hot),
        ('binary', binary_pipe, binary_tranform),
        ('ordinal', ordinal_pipe, ordinal_tranform),
        ('high_card', high_card_pipe, high_cardinal),
    ],
    remainder='passthrough',
)


In [42]:
# Replace the raw-price target with log(1 + Price). Predictions made on this
# scale are mapped back with np.expm1 before the metrics are computed.
y = np.log1p(data['Price'])

In [43]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge, RidgeCV, LassoCV
from sklearn.svm import SVR

In [44]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [45]:
# Candidate regressors, each constructed with its default hyperparameters.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "Ridge": Ridge(),
    "SVR": SVR(),
    "RidgeCV": RidgeCV(),
    "LassoCV": LassoCV()
}

# The hold-out target on the original price scale is the same for every model,
# so the log1p transform is undone once here rather than in each iteration.
y_test_real = np.expm1(y_test)

for name, model in models.items():

    # Chain preprocessing and the estimator so both are fitted on the training
    # split only. The same `transformer` object is used in every pipeline and
    # is refitted by each `pipe.fit` call.
    pipe = Pipeline([
        ('preprocess', transformer),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)

    # Predictions are on the log1p scale; convert them back to prices so the
    # metrics are reported in the original units.
    y_test_pred_log = pipe.predict(X_test)
    y_test_pred = np.expm1(y_test_pred_log)

    print(name)
    print("Test  MAE:", mean_absolute_error(y_test_real, y_test_pred))
    print("Test  R2 :", r2_score(y_test_real, y_test_pred))
    print("="*40)

LinearRegression
Test  MAE: 10390.781335361442
Test  R2 : 0.0617239869690418
Lasso
Test  MAE: 12462.094899723163
Test  R2 : -0.21570396379569834
ElasticNet
Test  MAE: 12462.094899723163
Test  R2 : -0.21570396379569834
Ridge
Test  MAE: 10387.683927436108
Test  R2 : 0.06362088231899243
SVR
Test  MAE: 6345.261005056854
Test  R2 : 0.44274730250778405
RidgeCV
Test  MAE: 10389.581503983254
Test  R2 : 0.06209445185898521
LassoCV
Test  MAE: 10382.241206286792
Test  R2 : 0.06411370218787449
