## Data Cleaning - Data cleaning is the process of fixing or removing incorrect, incomplete, or duplicate data from a dataset before it's used in data analysis.

#### Loading data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df=pd.read_csv('Car price.csv')
df.head()

Unnamed: 0,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner,Unnamed: 8
0,Maruti 800 AC,2007,60000.0,70000.0,Petrol,Individual,Manual,First Owner,
1,Maruti Wagon R LXI Minor,2007,135000.0,50000.0,Petrol,Individual,Manual,First Owner,
2,Hyundai Verna 1.6 SX,2012,600000.0,100000.0,Diesel,Individual,Manual,First Owner,
3,Datsun RediGO T Option,2017,250000.0,46000.0,Petrol,Individual,Manual,First Owner,
4,Honda Amaze VX i-DTEC,2014,450000.0,141000.0,Diesel,Individual,Manual,Second Owner,


#### Size

In [2]:
df.shape

(4340, 9)

#### Checking for null values

In [3]:
df.isnull()

Unnamed: 0,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner,Unnamed: 8
0,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...
4335,False,False,False,False,False,False,False,False,True
4336,False,False,False,False,False,False,False,False,True
4337,False,False,False,False,False,False,False,False,True
4338,False,False,False,False,False,False,False,False,True


In [4]:
df.isnull().sum()

Model               0
Year                0
Selling_Price       5
KM_Driven           4
Fuel                0
Seller_Type         0
Transmission        2
Owner               0
Unnamed: 8       4340
dtype: int64

#### Dropping unknown column

In [10]:

data=df.drop("Unnamed: 8",axis=1)
data.head()

Unnamed: 0,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner
0,Maruti 800 AC,2007,60000.0,70000.0,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000.0,50000.0,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000.0,100000.0,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000.0,46000.0,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000.0,141000.0,Diesel,Individual,Manual,Second Owner


#### Separating Car name from model name in separate column

In [11]:
def car_name(x):
    return x[x.index(" ")+1:]

data["Car name"]=data["Model"].apply(car_name)
data.head()

Unnamed: 0,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner,Car name
0,Maruti 800 AC,2007,60000.0,70000.0,Petrol,Individual,Manual,First Owner,800 AC
1,Maruti Wagon R LXI Minor,2007,135000.0,50000.0,Petrol,Individual,Manual,First Owner,Wagon R LXI Minor
2,Hyundai Verna 1.6 SX,2012,600000.0,100000.0,Diesel,Individual,Manual,First Owner,Verna 1.6 SX
3,Datsun RediGO T Option,2017,250000.0,46000.0,Petrol,Individual,Manual,First Owner,RediGO T Option
4,Honda Amaze VX i-DTEC,2014,450000.0,141000.0,Diesel,Individual,Manual,Second Owner,Amaze VX i-DTEC


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Model          4340 non-null   object 
 1   Year           4340 non-null   int64  
 2   Selling_Price  4335 non-null   float64
 3   KM_Driven      4336 non-null   float64
 4   Fuel           4340 non-null   object 
 5   Seller_Type    4340 non-null   object 
 6   Transmission   4338 non-null   object 
 7   Owner          4340 non-null   object 
 8   Car name       4340 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 305.3+ KB


#### Function to remove decimal

In [36]:
def float_int(x):
    a=str(x)
    b=a.split(".")
    return (b[0])


In [35]:
data["Selling_Price"]

0        60000.0
1       135000.0
2       600000.0
3       250000.0
4       450000.0
          ...   
4335    409999.0
4336    409999.0
4337    110000.0
4338    865000.0
4339    225000.0
Name: Selling_Price, Length: 4340, dtype: float64

In [42]:
data["Selling_Price"]=data["Selling_Price"].apply(float_int)

In [39]:
data["KM_Driven"]

0        70000.0
1        50000.0
2       100000.0
3        46000.0
4       141000.0
          ...   
4335     80000.0
4336     80000.0
4337     83000.0
4338     90000.0
4339     40000.0
Name: KM_Driven, Length: 4340, dtype: float64

In [43]:
data["KM_Driven"]=data["KM_Driven"].apply(float_int)

In [44]:
data.head()

Unnamed: 0,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner,Car name
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner,800 AC
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner,Wagon R LXI Minor
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner,Verna 1.6 SX
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner,RediGO T Option
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner,Amaze VX i-DTEC


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Model          4340 non-null   object
 1   Year           4340 non-null   int64 
 2   Selling_Price  4340 non-null   object
 3   KM_Driven      4340 non-null   object
 4   Fuel           4340 non-null   object
 5   Seller_Type    4340 non-null   object
 6   Transmission   4338 non-null   object
 7   Owner          4340 non-null   object
 8   Car name       4340 non-null   object
dtypes: int64(1), object(8)
memory usage: 305.3+ KB


#### To replace "," from 80,000
#### We use data[x].str.replace(","," ")