In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# %matplotlib inline
mpl.style.use("ggplot")


In [None]:
car = pd.read_csv("./car_resale_prices.csv")
car.head()


In [None]:
print("Shape: ", car.shape)
car.info()


In [None]:
cars_new = pd.DataFrame()
# car[
#     [
#         "full_name",
#         "resale_price",
#         "registered_year",
#         "engine_capacity",
#         "transmission_type",
#         "kms_driven",
#         "owner_type",
#         "fuel_type",
#         "max_power",
#         "seats",
#         "mileage",
#         "body_type",
#     ]
# ]


## Cleaning data
### Removing year from full_name

In [None]:
print(car["full_name"].isnull().sum())

cars_new["full_name"] = car["full_name"].map(lambda x: x[5:])

cars_new["full_name"]


In [None]:
import re

print(car["resale_price"].isnull().sum())

cars_new["resale_price"] = car["resale_price"].map(
    lambda x: float(re.search(r"\d+(\.\d+)?", x).group(0)) * 100000
)

cars_new["resale_price"]


In [None]:
print(car["registered_year"].isnull().sum())

cars_new["registered_year"] = car["full_name"].map(
    lambda x: int(re.search(r"\d{4}", str(x)).group(0))
)

cars_new["registered_year"]


In [None]:
# car["engine_capacity"].info()
# check for null values
print(car["engine_capacity"].isnull().sum())

cars_new["engine_capacity"] = car["engine_capacity"].map(
    lambda x: pd.NA
    if re.search(r"\d+", str(x)) is None
    else int(re.search(r"\d+", str(x)).group(0))
)

cars_new["engine_capacity"]


In [None]:
print(car["transmission_type"].isnull().sum())

cars_new["transmission_type"] = car["transmission_type"]

cars_new["transmission_type"]


In [None]:
print(car["kms_driven"].isnull().sum())

cars_new["kms_driven"] = car["kms_driven"].map(
    lambda x: pd.NA
    if re.search(r"\d+", str(x).replace(",", "")) is None
    else int(re.search(r"\d+", str(x).replace(",", "")).group(0))
)

cars_new["kms_driven"]


In [None]:
print(car["owner_type"].isnull().sum())

cars_new["owner_type"] = car["owner_type"]

cars_new["owner_type"]


In [None]:
print(car["fuel_type"].isnull().sum())

cars_new["fuel_type"] = car["fuel_type"]

cars_new["fuel_type"]


In [None]:
print(car["seats"].isnull().sum())

cars_new["seats"] = car["seats"].map(
    lambda x: pd.NA
    if re.search(r"\d+", str(x)) is None
    else int(re.search(r"\d+", str(x)).group(0))
)

cars_new["seats"]


In [None]:
print(car["mileage"].isnull().sum())

cars_new["mileage"] = car["mileage"].map(
    lambda x: pd.NA
    if re.search(r"\d+(\.\d+)?", str(x)) is None
    else float(re.search(r"\d+(\.\d+)?", str(x)).group(0))
)

cars_new["mileage"]


In [None]:
print(car["body_type"].isnull().sum())
cars_new["body_type"] = car["body_type"]

cars_new["body_type"].unique()


In [None]:
print(car["full_name"].isnull().sum())

cars_new["brand"] = cars_new["full_name"].map(
    lambda x: pd.NA if str(x).split(" ")[0] is None else str(x).split(" ")[0]
)

cars_new["brand"].unique()


In [None]:
cars_new.dropna(how="any", inplace=True)

cleaned_cars = pd.DataFrame()

cleaned_cars["full_name"] = cars_new["full_name"]
cleaned_cars["brand"] = cars_new["brand"]
cleaned_cars["resale_price"] = cars_new["resale_price"]
cleaned_cars["registered_year"] = cars_new["registered_year"]
cleaned_cars["engine_capacity"] = cars_new["engine_capacity"].astype(float)
cleaned_cars["kms_driven"] = cars_new["kms_driven"].astype(int)
cleaned_cars["owner_type"] = cars_new["owner_type"]
cleaned_cars["transmission_type"] = cars_new["transmission_type"]
cleaned_cars["fuel_type"] = cars_new["fuel_type"]
cleaned_cars["mileage"] = cars_new["mileage"].astype(float)
cleaned_cars["body_type"] = cars_new["body_type"]
cleaned_cars["seats"] = cars_new["seats"].astype(int)

cleaned_cars = cleaned_cars.reset_index(drop=True)

cleaned_cars.info()

# export to csv
cleaned_cars.to_csv("./cleaned_cars.csv")


In [None]:
cleaned_cars.describe(include="all")


In [None]:
import seaborn as sns

label = ["Diesel", "Petrol", "CNG", "Electric"]
col = ["red", "blue", "green", "yellow"]

for i, j in zip(label, col):
    sns.kdeplot(
        cleaned_cars.loc[(cleaned_cars["fuel_type"] == i), "resale_price"],
        color=j,
        fill=True,
        label=i,
        legend=True,
    )

# show legend


In [None]:
plt.subplots(figsize=(15, 7))
# change lin color to blue
ax = sns.boxplot(x="brand", y="resale_price", data=cleaned_cars)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")

plt.show()


In [None]:
plt.subplots(figsize=(15, 7))
ax = sns.stripplot(x="registered_year", y="resale_price", data=cleaned_cars)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")

plt.show()


In [None]:
plot = sns.relplot(
    x="kms_driven", y="resale_price", data=cleaned_cars, height=7, aspect=1.5
)

plot.set(xlim=(0, 250000))


In [None]:
plt.subplots(figsize=(15, 7))
# change lin color to blue
ax = sns.lineplot(x="fuel_type", y="resale_price", data=cleaned_cars)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")

plt.show()


In [None]:
ax = sns.relplot(
    x="brand",
    y="resale_price",
    data=cleaned_cars,
    hue="fuel_type",
    size="registered_year",
    height=7,
    aspect=2,
)
ax.set_xticklabels(rotation=40, ha="right")


In [None]:
sns.heatmap(
    data=cleaned_cars[
        [
            "mileage",
            "engine_capacity",
            "seats",
            "resale_price",
            "registered_year",
            "kms_driven",
        ]
    ].corr(),
    cmap="YlGnBu",
    # square=True,
)


In [None]:
features_list = [
    # "full_name",
    # "brand",
    # "fuel_type",
    # "owner_type",
    # "body_type",
    # "transmission_type",
    "mileage",
    "seats",
    "engine_capacity",
    "kms_driven",
    "registered_year",
]

X = cleaned_cars[features_list]

others = [
    "full_name",
    "brand",
    "fuel_type",
    "owner_type",
    "body_type",
    "transmission_type",
]

for col in others:
    X[col] = pd.factorize(cleaned_cars[col])[0]

Y = cleaned_cars["resale_price"]

X


In [None]:
Y


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


In [31]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

rf.fit(X_train, Y_train)

rf_score = rf.score(X_test, Y_test)

print(rf_score)


0.6573784099945821


In [34]:
scores = []
for i in range(50):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.1, random_state=i
    )
    rf = RandomForestRegressor()
    rf.fit(X_train, Y_train)
    # Y_pred = rf.predict(X_test)
    scores.append(rf.score(X_test, Y_test))


In [35]:
np.argmax(scores)
scores[np.argmax(scores)]


0.7660143567725998