In [154]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


Steps to Perform:
1. `km_driven` and `selling_price` are right-skewed, so both are log-transformed (`np.log1p`).
2. Fill `mileage` with random data.

In [155]:
# Load the car dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./data/car_dataset.csv")

In [156]:
# Peek at one randomly chosen row (no seed is set, so the row differs per run).
df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,company,mileage,seats
9162,11612,i20,2014,600000,100000,Diesel,Individual,Manual,Fourth & Above Owner,hyundai,22.54,5


In [157]:
# Remove the "Unnamed: 0" column (presumably a leftover index column from an
# earlier CSV export -- TODO confirm).
# Reassigning instead of inplace=True, together with errors="ignore", keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [158]:
# df["km_driven"] = df["km_driven"].astype("float")

In [159]:
# selling_price is right-skewed, so the model is trained on log1p(price).
# The untouched prices are stored in a side column the first time this cell
# runs and the transform always reads from that copy; re-running the cell
# therefore no longer applies the log a second time.
if "selling_price_raw" not in df.columns:
    df["selling_price_raw"] = df["selling_price"]

func = FunctionTransformer(func=np.log1p)
df["selling_price"] = func.fit_transform(df["selling_price_raw"])

In [160]:
# Display the target column to confirm it now holds log-scaled values.
df.loc[:, "selling_price"]

0        11.002117
1        11.813037
2        13.304687
3        13.017005
4        13.217675
           ...    
10615    12.409018
10616    13.652993
10617    12.388398
10618    15.271798
10619    13.415034
Name: selling_price, Length: 10620, dtype: float64

In [161]:
# Model inputs, in the column order that the positional indices used by the
# encoders further down rely on.
# NOTE(review): "transmission" is present in the data but not used as a
# feature -- confirm this is intentional.
feature_columns = [
    "name", "year", "km_driven", "fuel", "seller_type",
    "owner", "company", "mileage", "seats",
]
X = df.loc[:, feature_columns]
Y = df.loc[:, "selling_price"]  # regression target

In [162]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, Y, random_state=42, test_size=0.2)
xtrain, xtest, ytrain, ytest = split_parts

In [163]:
# Peek at one randomly chosen training row (unseeded, varies per run).
xtrain.sample(n=1)

Unnamed: 0,name,year,km_driven,fuel,seller_type,owner,company,mileage,seats
1126,swift dzire,2015,135000,Diesel,Individual,First Owner,maruti,26.59,5


In [164]:
# Log-transform the right-skewed km_driven feature; every other column is
# passed through unchanged.
# The column is selected by name rather than by position (previously [2]) so
# the step keeps targeting km_driven even if the feature list is reordered.
# ColumnTransformer emits transformed columns first and the remainder after,
# so the output order is:
#   [log1p(km_driven), name, year, fuel, seller_type, owner, company, mileage, seats]
logTransform = ColumnTransformer(
    transformers=[
        ("km_transform", FunctionTransformer(func=np.log1p), ["km_driven"]),
    ],
    remainder="passthrough",
)

In [165]:
# Fit the km_driven log step on the training features and keep the transformed result.
xtrf = logTransform.fit(xtrain).transform(xtrain)

In [166]:
# One-hot encode name, fuel, seller_type and company. The indices refer to the
# output of the km_driven log step, which places km_driven at position 0 and
# shifts the remaining columns accordingly.
# handle_unknown="ignore" encodes categories that were not seen during fit
# (e.g. a car model that only occurs in the test split) as all zeros instead
# of raising an error at predict time.
encode_data = ColumnTransformer(
    transformers=[
        (
            "encode_data",
            OneHotEncoder(
                sparse_output=False,
                dtype="int32",
                drop="first",
                handle_unknown="ignore",
            ),
            [1, 3, 4, 6],
        ),
    ],
    remainder="passthrough",
)


In [167]:
def _owner_column(X):
    """Return the positional index of the owner column in the one-hot step's output.

    The one-hot step emits its encoded columns first, followed by the
    passthrough columns [km_driven, year, owner, mileage, seats]. The owner
    column is therefore always the third column from the end, regardless of
    how many one-hot columns were generated.
    """
    return [X.shape[1] - 3]


# Ordinal-encode the owner column using the explicit category order below.
# The column is located relative to the end of the array instead of through a
# hard-coded index (previously 193), which was only correct for one specific
# number of one-hot columns and would otherwise hit the wrong column or fail
# as soon as the data or the train/test split changed.
orderinal_encode = ColumnTransformer(
    transformers=[
        (
            "encode_owner",
            OrdinalEncoder(
                categories=[[
                    "UnRegistered Car",
                    "Fourth & Above Owner",
                    "Third Owner",
                    "Second Owner",
                    "First Owner",
                    "Test Drive Car",
                ]]
            ),
            _owner_column,
        ),
    ],
    remainder="passthrough",
)

In [168]:
# Linear regression estimator with its default settings spelled out.
lr = LinearRegression(fit_intercept=True)

In [169]:
# Chain the preprocessing stages and the regressor. The order matters: each
# encoder addresses columns by their position in the previous stage's output.
pipeline_steps = [
    ("km_tranform", logTransform),
    ("encode_one_hot", encode_data),
    ("encode_owner", orderinal_encode),
    ("model", lr),
]
pipe = Pipeline(steps=pipeline_steps)

In [170]:
# Fit every preprocessing stage and the regressor on the training split.
pipe.fit(X=xtrain, y=ytrain)

In [171]:

# Score the held-out split. The target fed to the model is the log1p-scaled
# selling_price, so this R2 is measured on the log scale.
y_pred = pipe.predict(xtest)
r2_percent = 100 * r2_score(ytest, y_pred)

print("R2 Score", r2_percent)

R2 Score 90.00123044806769
