In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


Steps to Perform:
1. km_driven and selling_price is right skewed data
2. fill mileage with random data

In [20]:
df = pd.read_csv("./data/car_dataset.csv")

In [21]:
df.sample()

Unnamed: 0.1,Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,company,mileage,seats
10348,13456,others,2020,690000,55000,Petrol,Individual,Manual,First Owner,Toyota,11.2,5


In [4]:
df.drop(columns={"Unnamed: 0"},inplace=True)

In [158]:
# df["km_driven"] = df["km_driven"].astype("float")

In [22]:
func = FunctionTransformer(func=np.log1p)
df["selling_price"] = func.fit_transform(df["selling_price"])

In [23]:
df["selling_price"]

0        11.002117
1        11.813037
2        13.304687
3        13.017005
4        13.217675
           ...    
10615    12.409018
10616    13.652993
10617    12.388398
10618    15.271798
10619    13.415034
Name: selling_price, Length: 10620, dtype: float64

In [24]:

def add_others(feature_name, threshold):
    counts = df[feature_name].value_counts()
    unique = df[feature_name].nunique()

    repl = counts[counts <= threshold].index
    df[feature_name] = df[feature_name].replace(repl,'others')


add_others("company",10)
add_others("name",15) 

In [25]:
df["name"].value_counts()

name
others         860
i20            529
swift dzire    529
swift          514
wagon r        420
              ... 
elantra         17
br-v            17
mobilio         16
freestyle       16
safari          16
Name: count, Length: 97, dtype: int64

In [26]:
X = df[["name","year","km_driven","fuel","seller_type","owner","company","mileage","seats"]]
Y = df["selling_price"]

In [27]:
xtrain,xtest,ytrain,ytest = train_test_split(X, Y , test_size=0.2,random_state=42)

In [11]:
xtrain.sample()

Unnamed: 0,name,year,km_driven,fuel,seller_type,owner,company,mileage,seats
1213,verna,2010,120000,Diesel,Individual,Second Owner,hyundai,22.32,5


In [28]:
logTransform = ColumnTransformer(remainder="passthrough",transformers=[
    ('km_transform',FunctionTransformer(func=np.log1p),[2])
])

In [12]:
xtrf = logTransform.fit_transform(xtrain)

NameError: name 'logTransform' is not defined

In [29]:
encode_data  = ColumnTransformer(remainder="passthrough", transformers=[
    ("encode_data",OneHotEncoder(sparse_output=False, dtype="int32",drop="first") , [1,3,4,6])
])


In [30]:
orderinal_encode = ColumnTransformer(remainder="passthrough",transformers=[
    ("encode_owner",OrdinalEncoder(categories=[["UnRegistered Car","Fourth & Above Owner","Third Owner","Second Owner","First Owner","Test Drive Car"]]),[193])
])

In [31]:
lr = LinearRegression()

In [32]:
pipe = Pipeline([
    ("km_tranform",logTransform),
    ("encode_one_hot", encode_data),
    ("encode_owner",orderinal_encode),
    ("model",lr)
])

In [33]:
pipe.fit(xtrain,ytrain)

ValueError: all features must be in [0, 140] or [-141, 0]

In [23]:

y_pred = pipe.predict(xtest)

print("R2 Score", r2_score(y_true=ytest, y_pred=y_pred) * 100)

R2 Score 90.00123044806769


In [29]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

print("MAE" , mean_absolute_error(y_true=ytest, y_pred=y_pred))

print("MSE" , mean_squared_error(y_true=ytest, y_pred=y_pred))

MAE 0.17563319420384246
MSE 0.06335698741137434


In [39]:
df.sample()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,company,mileage,seats
7437,indica,2007,10.819798,80000,Diesel,Individual,Manual,Third Owner,tata,17.2,5


In [42]:
np.expm1(10.81)

49512.468378402154

In [40]:
input_data = {
    'name': ['indica'],
    'company': ['tata'],
    'year': [2007],
    'km_driven': [80000],
    'fuel': ['Diesel'],
    'transmission': ['Manual'],
    'owner': ['First Owner'],
    'seller_type':"Individual",
    "seats":"5",
    "mileage":"17.2"
} 

 
input_df = pd.DataFrame(input_data) 
res = pipe.predict(input_df)
np.expm1(res)


array([93618.59002391])