In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score


In [25]:
# Load the raw car catalogue and preview the first three rows.
csv_path = "cars.csv"
df = pd.read_csv(csv_path)
df.head(n=3)

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm,...,Leather_Wrapped_Steering,Automatic_Headlamps,Engine_Type,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,Battery,Electric_Range
0,Tata,Nano Genx,Xt,"Rs. 2,92,667",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,,,,,,,,,,
1,Tata,Nano Genx,Xe,"Rs. 2,36,447",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,,,,,,,,,,
2,Tata,Nano Genx,Emax Xm,"Rs. 2,96,661",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,,,,,,,,,,


In [26]:
# Keep only the identifying columns, the price target, and the basic engine
# specification columns; every other column of the loaded frame is discarded.
selected_columns = [
    "Make", "Model", "Variant", "Ex-Showroom_Price", "Displacement", "Cylinders",
    "Valves_Per_Cylinder", "Drivetrain", "Cylinder_Configuration", "Emission_Norm",
]
# .copy() makes df an independent frame rather than a slice of the loaded data,
# so later column assignments on df do not trigger SettingWithCopyWarning.
df = df[selected_columns].copy()
df.head(3)

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm
0,Tata,Nano Genx,Xt,"Rs. 2,92,667",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV
1,Tata,Nano Genx,Xe,"Rs. 2,36,447",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV
2,Tata,Nano Genx,Emax Xm,"Rs. 2,96,661",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV


In [27]:
# Displacement arrives as text such as "624 cc": drop the unit and parse as float.
displacement_text = df["Displacement"].astype(str)
df["Displacement"] = displacement_text.str.replace("cc", "").astype(float)

# Price arrives as text such as "Rs. 2,92,667": drop the currency prefix and
# the digit-group commas, then parse as an integer.
price_text = df["Ex-Showroom_Price"].astype(str)
price_digits = price_text.str.replace("Rs.", "", regex=False).str.replace(",", "", regex=False)
df["Ex-Showroom_Price"] = price_digits.astype(int)


In [28]:
# Separate the regression target (price) from the predictor columns.
target_column = "Ex-Showroom_Price"
x = df.drop(columns=[target_column])
y = df[target_column]


In [29]:
# Inspect the target: the parsed ex-showroom price for each row.
y

0        292667
1        236447
2        296661
3        334768
4        272223
         ...   
1271    1302000
1272    1421000
1273    1431000
1274    1201000
1275    6862560
Name: Ex-Showroom_Price, Length: 1276, dtype: int64

In [30]:
# Inspect the feature frame: every selected column except the price.
x

Unnamed: 0,Make,Model,Variant,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm
0,Tata,Nano Genx,Xt,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV
1,Tata,Nano Genx,Xe,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV
2,Tata,Nano Genx,Emax Xm,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV
3,Tata,Nano Genx,Xta,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV
4,Tata,Nano Genx,Xm,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV
...,...,...,...,...,...,...,...,...,...
1271,Honda,City,Vx Mt Diesel,1498.0,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV
1272,Honda,City,Zx Mt Diesel,1498.0,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV
1273,Honda,City,Zx Cvt Petrol,1497.0,4.0,4.0,FWD (Front Wheel Drive),In-line,BS 6
1274,Honda,City,V Cvt Petrol,1497.0,4.0,4.0,FWD (Front Wheel Drive),In-line,BS 6


In [31]:
# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# shuffle deterministic, so the split and the metrics computed from it are
# reproducible when the notebook is re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [32]:
# Group the feature columns by dtype so each group can get its own preprocessing:
# object (text) columns are treated as categorical, int64/float64 as numerical.
text_features = x_train.select_dtypes(include=["object"])
numeric_features = x_train.select_dtypes(include=["int64", "float64"])
categorical_columns = list(text_features.columns)
numerical_columns = list(numeric_features.columns)

In [33]:
# Numerical features: fill missing values with the column mean, then standardize.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

In [34]:
# Categorical features: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [35]:
# Route each column group through its matching preprocessing pipeline.
column_transformers = [
    ("num", numerical_pipeline, numerical_columns),
    ("cat", categorical_pipeline, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [36]:
# k-nearest-neighbours regressor configured to use 5 neighbours per prediction.
model = KNeighborsRegressor(n_neighbors=5)

In [37]:
# End-to-end estimator: preprocessing followed by the regressor, so the same
# transformations are applied whenever the pipeline is fitted or used to predict.
pipeline_steps = [
    ("preprocessing", preprocessor),
    ("model", model),
]
pipe = Pipeline(steps=pipeline_steps)

In [38]:
# Fit the preprocessing steps and the regressor on the training split only.
pipe.fit(x_train, y_train)


In [39]:
# Predict prices for the held-out test rows; p is compared with y_test below.
p = pipe.predict(x_test)

In [40]:
# Mean squared error on the test split. It is expressed in squared price units,
# so its magnitude is large for prices of this scale.
mean_squared_error(y_test, p)

28287770074710.062

In [41]:
 # R^2 (coefficient of determination) of the predictions on the test split.
 # NOTE(review): this cell starts with a stray leading space; presumably tolerated
 # by IPython, but it would be an IndentationError in a plain .py export - confirm.
 r2_score(y_test, p)

0.8366786460257131