In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

In [3]:
# Read the raw used-car listings into a DataFrame.
raw_csv_path = "C:\\Users\\vidus\\Downloads\\Used_car_RawData.csv"
used_cars = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first five rows of the raw frame.
used_cars.head(n=5)

# raw data 👇

Unnamed: 0,full_name,brand_name,selling_price,new_price,year,seller_type,km_driven,owner_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti Alto Std,Maruti,1.2 Lakh*,,2012,Individual,"1,20,000 kms",First Owner,Petrol,Manual,Mileage19.7 kmpl,Engine796 CC,Max Power46.3 bhp,Seats5
1,Hyundai Grand i10 Asta,Hyundai,5.5 Lakh*,New Car (On-Road Price) : Rs.7.11-7.48 Lakh*,2016,Individual,"20,000 kms",First Owner,Petrol,Manual,Mileage18.9 kmpl,Engine1197 CC,Max Power82 bhp,Seats5
2,Hyundai i20 Asta,Hyundai,2.15 Lakh*,,2010,Individual,"60,000 kms",First Owner,Petrol,Manual,Mileage17.0 kmpl,Engine1197 CC,Max Power80 bhp,Seats5
3,Maruti Alto K10 2010-2014 VXI,Maruti,2.26 Lakh*,,2012,Individual,"37,000 kms",First Owner,Petrol,Manual,Mileage20.92 kmpl,Engine998 CC,Max Power67.1 bhp,Seats5
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,Ford,5.7 Lakh*,New Car (On-Road Price) : Rs.10.14-13.79 Lakh*,2015,Dealer,"30,000 kms",First Owner,Diesel,Manual,Mileage22.77 kmpl,Engine1498 CC,Max Power98.59 bhp,Seats5


*Listed below are some problems present in the raw dataset:*

*  NULL values present in some columns
*  Text present with numerical values in columns
*  No column present with Car brand name

**So, we will overcome these problems by cleaning the raw data, making it suitable for performing various operations on it.**

In [5]:
# Count the missing values in each column and display the result.
null_values = used_cars.isna().sum()
null_values

full_name                0
brand_name               0
selling_price            0
new_price            10410
year                     0
seller_type              0
km_driven                0
owner_type               0
fuel_type                0
transmission_type        0
mileage                160
engine                 110
max_power              337
seats                   87
dtype: int64

**We can see that there are null values present in our dataset, summing to approximately 10,000, i.e. nearly 50% of our dataset.**\
**I checked the data thoroughly, and removing the rows with null values won't affect the overall insights.**

In [6]:
# Keep only the rows that have a value in every column.
used_cars = used_cars.dropna(axis=0, how="any")

In [7]:
# Strip the text from the "new_price" column and collect the numeric values in
# `originals`, which is later saved as the new "original_price" column.

originals = []
for raw_price in used_cars["new_price"]:
    # Only the text after the last ":" is parsed.
    price_text = raw_price.split(":")[-1]
    in_crore = "Cr*" in price_text

    if "-" in price_text:
        # Price range: keep the figure that follows the last dash.
        upper_part = price_text.split("-")[-1]
        amount = float(upper_part.split(" ")[0])
    else:
        # Single price: take the first token and drop its "Rs." prefix.
        first_token = price_text.strip().split(" ")[0]
        amount = float(first_token.replace("Rs.", ""))

    # Figures marked "Cr*" are multiplied by 100; all others are kept as parsed.
    originals.append(amount * 100 if in_crore else amount)
       




In [8]:
# Store the values collected in `originals` as the "original_price" column.
used_cars["original_price"] = originals

In [9]:
# Strip the text from the "selling_price" column and collect the numeric values
# in `sell_pr`, which is later saved as the new "SP" column.

sell_pr = []
for listed_price in used_cars["selling_price"]:
    in_crore = "Cr*" in listed_price
    # Set when the text carries neither the "Cr*" nor the "Lakh*" marker.
    unmarked = not (in_crore or "Lakh*" in listed_price)

    # First space-separated token, with "*" and thousands separators removed.
    number_text = listed_price.split(" ")[0].replace("*", "").replace(",", "")
    value = float(number_text)

    if in_crore:
        value = value * 100
    if unmarked:
        value = value / 100000

    sell_pr.append(value)



In [10]:
# Store the values collected in `sell_pr` as the "SP" column.
used_cars["SP"] = sell_pr

In [11]:
# Strip the text from the "mileage" column and collect the numeric values in
# `mileage_cars`, which is later saved as the new "mileage_cars(kmpl)" column.

mileage_cars = [
    # First space-separated token with its "Mileage" prefix removed.
    float(entry.split(" ")[0].replace("Mileage", ""))
    for entry in used_cars["mileage"]
]


In [12]:
# Store the values collected in `mileage_cars` as the "mileage_cars(kmpl)" column.
used_cars["mileage_cars(kmpl)"] = mileage_cars

In [13]:
# Strip the text from the "engine" column and collect the numeric values in
# `engine_cc`. The list is written to the "engine_cc" column by the assignment
# cell that follows, matching how the other cleaned features are handled.
#
# The original cell also held a bare `engine_cc` expression in the middle of
# the cell (never displayed, since only a cell's last expression is echoed)
# and a column assignment that the following cell repeats; both were removed.

engine_cc = [
    # First space-separated token with its "Engine" prefix removed.
    float(entry.split(" ")[0].replace("Engine", ""))
    for entry in used_cars["engine"]
]

In [14]:
# Store the values collected in `engine_cc` as the "engine_cc" column.
used_cars["engine_cc"] = engine_cc

In [15]:
# Strip the text from the "max_power" column and collect the numeric values in
# `maxx_power`, which is later saved as the new "maxx_power" column.

maxx_power = [
    # The second space-separated token holds the number, prefixed by "Power".
    float(entry.split(" ")[1].replace("Power", ""))
    for entry in used_cars["max_power"]
]

In [16]:
# Store the values collected in `maxx_power` as the "maxx_power" column.
used_cars["maxx_power"] = maxx_power

In [17]:
# Strip the text from the "km_driven" column and collect the numeric values in
# `kilo`, which is later saved as the new "Kmeter_driven" column.

kilo = []
for reading in used_cars["km_driven"]:
    # First space-separated token, with the comma separators removed.
    digits = reading.split(" ")[0].replace(",", "")
    kilo.append(float(digits))


In [18]:
# Store the values collected in `kilo` as the "Kmeter_driven" column.
used_cars["Kmeter_driven"] = kilo

In [19]:
# Remove the "Seats" text from the "seats" column and keep the numbers as floats.
seat = [float(entry.replace("Seats", "")) for entry in used_cars["seats"]]

In [20]:
# Store the values collected in `seat` as the "seat" column.
used_cars["seat"] = seat

In [21]:
# Names of the old columns that are removed from the frame in one go.

colls_to_drops = [
    "mileage",
    "engine",
    "max_power",
    "selling_price",
    "new_price",
    "km_driven",
    "seats",
]

In [22]:
# Drop every column listed in `colls_to_drops`.
used_cars = used_cars.drop(colls_to_drops, axis="columns")

In [23]:
# Preview the first five rows of the cleaned frame.
used_cars.head(n=5)

Unnamed: 0,full_name,brand_name,year,seller_type,owner_type,fuel_type,transmission_type,original_price,SP,mileage_cars(kmpl),engine_cc,maxx_power,Kmeter_driven,seat
1,Hyundai Grand i10 Asta,Hyundai,2016,Individual,First Owner,Petrol,Manual,7.48,5.5,18.9,1197.0,82.0,20000.0,5.0
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,Ford,2015,Dealer,First Owner,Diesel,Manual,13.79,5.7,22.77,1498.0,98.59,30000.0,5.0
5,Maruti Wagon R VXI BS IV,Maruti,2013,Individual,First Owner,Petrol,Manual,6.94,3.5,18.9,998.0,67.1,35000.0,5.0
6,Hyundai i10 Sportz 1.2,Hyundai,2013,Dealer,First Owner,Petrol,Manual,6.63,3.15,20.36,1197.0,78.9,40000.0,5.0
7,Maruti Wagon R VXI,Maruti,2018,Dealer,First Owner,Petrol,Manual,7.01,4.1,20.51,998.0,67.04,17512.0,5.0


In [24]:
# Write the cleaned data to a new CSV file.

cleaned_csv_name = 'used_cars_cleanedd.csv'
used_cars.to_csv(cleaned_csv_name)