In [121]:
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

%matplotlib inline

In [80]:
# Load the raw car listings from a CSV file located relative to the working directory.
df = pd.read_csv("car details v4.csv")

In [82]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(2059, 20)

In [83]:
# Discard every row that has at least one missing value.
# Note: this rebinds `df`, so the unfiltered frame is only recoverable by re-reading the CSV.
df = df.dropna(how="any")

In [84]:
# (rows, columns) after rows with missing values were removed.
df.shape

(1874, 20)

In [85]:
# Per-column count of missing values; every column should report 0 after the dropna step.
df.isnull().sum()

Make                  0
Model                 0
Price                 0
Year                  0
Kilometer             0
Fuel Type             0
Transmission          0
Location              0
Color                 0
Owner                 0
Seller Type           0
Engine                0
Max Power             0
Max Torque            0
Drivetrain            0
Length                0
Width                 0
Height                0
Seating Capacity      0
Fuel Tank Capacity    0
dtype: int64

In [86]:
def replace_RPM(text):
    """Return the rpm figure embedded in an "<value> @ <n> rpm" style string.

    The string is split on "@", the second segment is taken, the literal
    "rpm" is stripped from it, and the remainder is converted with ``int``
    (which tolerates surrounding whitespace).

    NOTE(review): this keeps the rpm part and discards whatever precedes the
    "@". The notebook applies it to the "Max Torque" column, so that column
    ends up holding rpm values rather than torque -- confirm this is intended.

    Raises:
        IndexError: if ``text`` contains no "@".
        ValueError: if the remaining segment is not a valid integer.
    """
    rpm_segment = text.split("@")[1]
    return int(rpm_segment.replace("rpm", ""))

In [87]:
# Parse every "Max Torque" string with replace_RPM and overwrite the column with the integers.
# NOTE(review): replace_RPM keeps the segment after "@" (the rpm figure), so after this cell
# the column no longer contains the torque value itself -- confirm this is intended.
# Not idempotent: a second run fails because the values are no longer strings.
df["Max Torque"] = df["Max Torque"].apply(replace_RPM)

In [89]:
# Frequency of each raw "Engine" value, to inspect its textual format before parsing it.
df["Engine"].value_counts()

1197 cc    230
1248 cc    116
998 cc     108
1497 cc     83
1968 cc     82
          ... 
3436 cc      1
4806 cc      1
2979 cc      1
3496 cc      1
995 cc       1
Name: Engine, Length: 104, dtype: int64

In [90]:
def replace_cc(text):
    """Convert an engine-displacement string such as "1197 cc" to an integer.

    Every occurrence of the literal "cc" is removed and the remainder is
    passed to ``int`` (which tolerates surrounding whitespace).

    Raises:
        ValueError: if what is left after removing "cc" is not a valid integer.
    """
    return int(text.replace("cc", ""))

In [91]:
# Replace the textual "Engine" values (e.g. "1197 cc") with integers via replace_cc.
# Not idempotent: a second run fails because the values are no longer strings.
df["Engine"] = df["Engine"].apply(replace_cc)

In [92]:
def replace_power(text):
    """Return the leading power figure of a "<p> bhp @ <n> rpm" style string as a float.

    Only the portion before the first "@" is kept (the whole string when no
    "@" is present); the literal "bhp" is stripped from it and the remainder
    is converted with ``float``.

    Raises:
        ValueError: if the remaining text is not a valid float.
    """
    power_segment = text.partition("@")[0]
    return float(power_segment.replace("bhp", ""))

In [93]:
# Replace the textual "Max Power" values with the float that precedes the "@" (see replace_power).
# Not idempotent: a second run fails because the values are no longer strings.
df["Max Power"] = df["Max Power"].apply(replace_power)

In [95]:
# Integer-encode "Fuel Type": fit an encoder on the column, keep an explicit
# label -> code dictionary for later inspection, then rewrite the column with the codes.
le_fueltype = LabelEncoder()
le_fueltype.fit(df["Fuel Type"])
le_nm_fueltype = {
    label: code
    for label, code in zip(le_fueltype.classes_, le_fueltype.transform(le_fueltype.classes_))
}
df["Fuel Type"] = df["Fuel Type"].map(lambda fuel: le_nm_fueltype[fuel])

In [96]:
# Display the "Fuel Type" label -> integer code mapping built in the previous cell.
le_nm_fueltype

{'CNG': 0,
 'CNG + CNG': 1,
 'Diesel': 2,
 'Hybrid': 3,
 'LPG': 4,
 'Petrol': 5,
 'Petrol + CNG': 6}

In [98]:
# Remove the "Make" and "Model" columns; they are not used as features.
df = df.drop(columns=["Make", "Model"])

In [100]:
# Integer-encode "Owner": fit an encoder on the column, keep an explicit
# label -> code dictionary for later inspection, then rewrite the column with the codes.
le_owner = LabelEncoder()
le_owner.fit(df["Owner"])
le_nm_owner = {
    label: code
    for label, code in zip(le_owner.classes_, le_owner.transform(le_owner.classes_))
}
df["Owner"] = df["Owner"].map(lambda owner: le_nm_owner[owner])

In [101]:
# Display the "Owner" label -> integer code mapping built in the previous cell.
le_nm_owner

{'First': 0, 'Second': 1, 'Third': 2, 'UnRegistered Car': 3}

In [103]:
# Integer-encode "Transmission": fit an encoder on the column, keep an explicit
# label -> code dictionary for later inspection, then rewrite the column with the codes.
le_transmission = LabelEncoder()
le_transmission.fit(df["Transmission"])
le_nm_transmission = {
    label: code
    for label, code in zip(
        le_transmission.classes_, le_transmission.transform(le_transmission.classes_)
    )
}
df["Transmission"] = df["Transmission"].map(lambda gearbox: le_nm_transmission[gearbox])

In [104]:
# Display the "Transmission" label -> integer code mapping built in the previous cell.
le_nm_transmission

{'Automatic': 0, 'Manual': 1}

In [106]:
# Remove the "Location" and "Color" columns; they are not used as features.
df = df.drop(columns=["Location", "Color"])

In [108]:
# Integer-encode "Seller Type": fit an encoder on the column, keep an explicit
# label -> code dictionary for later inspection, then rewrite the column with the codes.
le_sellertype = LabelEncoder()
le_sellertype.fit(df["Seller Type"])
le_nm_sellertype = {
    label: code
    for label, code in zip(le_sellertype.classes_, le_sellertype.transform(le_sellertype.classes_))
}
df["Seller Type"] = df["Seller Type"].map(lambda seller: le_nm_sellertype[seller])

In [109]:
# Display the "Seller Type" label -> integer code mapping built in the previous cell.
le_nm_sellertype

{'Commercial Registration': 0, 'Corporate': 1, 'Individual': 2}

In [110]:
# Integer-encode "Drivetrain": fit an encoder on the column, keep an explicit
# label -> code dictionary for later inspection, then rewrite the column with the codes.
le_drivetrain = LabelEncoder()
le_drivetrain.fit(df["Drivetrain"])
le_nm_drivetrain = {
    label: code
    for label, code in zip(le_drivetrain.classes_, le_drivetrain.transform(le_drivetrain.classes_))
}
df["Drivetrain"] = df["Drivetrain"].map(lambda drive: le_nm_drivetrain[drive])

In [111]:
# Display the "Drivetrain" label -> integer code mapping built in the previous cell.
le_nm_drivetrain

{'AWD': 0, 'FWD': 1, 'RWD': 2}

In [113]:
# "Price" is the regression target; every remaining column is a feature.
y = df["Price"]
X = df.drop(columns="Price")

In [114]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

In [115]:
# Sanity-check the split: shape of the training feature matrix and of the test target vector.
X_train.shape, y_test.shape

((1499, 15), (375,))

In [116]:
# Confirm the dtype of the target series.
y.dtypes

dtype('int64')

In [117]:
# Fit a random-forest regressor on the training split.
# The forest is built with randomness (bootstrap samples, feature sub-sampling),
# so random_state is fixed -- reusing the seed of the train/test split -- to make
# the scores below and the model pickled later identical on every fresh run.
rf_model = RandomForestRegressor(random_state=4242)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# score() returns R^2: the first line is the training split, the second the held-out test split.
print("Random Forest: ", rf_model.score(X_train, y_train))
print("Random Forest: ", rf_model.score(X_test, y_test))

Random Forest:  0.9799212345517739
Random Forest:  0.866285227998213


In [118]:
# Fit an ordinary least-squares linear model on the same training split.
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)  # rebinds y_pred, replacing the random-forest predictions
# score() returns R^2: the first line is the training split, the second the held-out test split.
print("Linear Regression: ", linreg.score(X_train, y_train))
print("Linear Regression: ", linreg.score(X_test, y_test))

Linear Regression:  0.680071078006509
Linear Regression:  0.6594717455267607


In [122]:
# Persist the fitted random-forest model to disk.
# The context manager closes (and therefore flushes) the file deterministically,
# instead of leaving an open handle for the garbage collector to clean up.
with open("RFcar.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)