# Data Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw car listings from the local dataSet folder and preview the
# first 100 rows.
cars_csv_path = './dataSet/cars_dataset.csv'
dataset = pd.read_csv(cars_csv_path)
dataset.head(n=100)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Make
0,A1,2017,12500,Manual,15735,Petrol,150.0,55.4,1.4,audi
1,A6,2016,16500,Automatic,36203,Diesel,20.0,64.2,2.0,audi
2,A1,2016,11000,Manual,29946,Petrol,30.0,55.4,1.4,audi
3,A4,2017,16800,Automatic,25952,Diesel,145.0,67.3,2.0,audi
4,A3,2019,17300,Manual,1998,Petrol,145.0,49.6,1.0,audi
...,...,...,...,...,...,...,...,...,...,...
95,A6,2016,17300,Automatic,24421,Diesel,30.0,61.4,2.0,audi
96,A1,2016,10600,Manual,15567,Diesel,0.0,76.3,1.6,audi
97,A4,2018,17500,Manual,15207,Petrol,145.0,51.4,1.4,audi
98,A4,2017,15000,Manual,14864,Petrol,145.0,52.3,1.4,audi


## Create a new data set

### Keep only the Toyota cars (`Make == "toyota"`)

In [3]:
# Boolean mask marking the rows whose 'Make' is "toyota".
toyota_rows = dataset['Make'] == "toyota"
# reset_index() renumbers the kept rows from 0 and stores the old row labels
# in a new 'index' column.
new_dataset = dataset.loc[toyota_rows].reset_index()
new_dataset

Unnamed: 0,index,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Make
0,54570,GT86,2016,16000,Manual,24089,Petrol,265.0,36.2,2.0,toyota
1,54571,GT86,2017,15995,Manual,18615,Petrol,145.0,36.2,2.0,toyota
2,54572,GT86,2015,13998,Manual,27469,Petrol,265.0,36.2,2.0,toyota
3,54573,GT86,2017,18998,Manual,14736,Petrol,150.0,36.2,2.0,toyota
4,54574,GT86,2017,17498,Manual,36284,Petrol,145.0,36.2,2.0,toyota
...,...,...,...,...,...,...,...,...,...,...,...
6733,61303,IQ,2011,5500,Automatic,30000,Petrol,20.0,58.9,1.0,toyota
6734,61304,Urban Cruiser,2011,4985,Manual,36154,Petrol,125.0,50.4,1.3,toyota
6735,61305,Urban Cruiser,2012,4995,Manual,46000,Diesel,125.0,57.6,1.4,toyota
6736,61306,Urban Cruiser,2011,3995,Manual,60700,Petrol,125.0,50.4,1.3,toyota


In [4]:
# Drop the 'index' column that reset_index() added and the 'Make' column,
# which holds the single value "toyota" for every remaining row.
# errors='ignore' keeps this cell idempotent: running it again after the
# columns are already gone is a no-op instead of raising KeyError.
new_dataset = new_dataset.drop(columns=['index', 'Make'], errors='ignore')
new_dataset

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,GT86,2016,16000,Manual,24089,Petrol,265.0,36.2,2.0
1,GT86,2017,15995,Manual,18615,Petrol,145.0,36.2,2.0
2,GT86,2015,13998,Manual,27469,Petrol,265.0,36.2,2.0
3,GT86,2017,18998,Manual,14736,Petrol,150.0,36.2,2.0
4,GT86,2017,17498,Manual,36284,Petrol,145.0,36.2,2.0
...,...,...,...,...,...,...,...,...,...
6733,IQ,2011,5500,Automatic,30000,Petrol,20.0,58.9,1.0
6734,Urban Cruiser,2011,4985,Manual,36154,Petrol,125.0,50.4,1.3
6735,Urban Cruiser,2012,4995,Manual,46000,Diesel,125.0,57.6,1.4
6736,Urban Cruiser,2011,3995,Manual,60700,Petrol,125.0,50.4,1.3


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the output shows engineSize with a minimum of 0.0 and mpg ranging
# from 2.8 to 235.0, which look like bad records; no later cell filters them,
# so confirm whether exporting them unchanged is intended.
new_dataset.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
count,6738.0,6738.0,6738.0,6738.0,6738.0,6738.0
mean,2016.748145,12522.391066,22857.413921,94.69724,63.042223,1.471297
std,2.204062,6345.017587,19125.464147,73.880776,15.83671,0.436159
min,1998.0,850.0,2.0,0.0,2.8,0.0
25%,2016.0,8290.0,9446.0,0.0,55.4,1.0
50%,2017.0,10795.0,18513.0,135.0,62.8,1.5
75%,2018.0,14995.0,31063.75,145.0,69.0,1.8
max,2020.0,59995.0,174419.0,565.0,235.0,4.5


In [6]:
# Show each column's dtype and non-null count for the filtered frame.
new_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6738 entries, 0 to 6737
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         6738 non-null   object 
 1   year          6738 non-null   int64  
 2   price         6738 non-null   int64  
 3   transmission  6738 non-null   object 
 4   mileage       6738 non-null   int64  
 5   fuelType      6738 non-null   object 
 6   tax           6738 non-null   float64
 7   mpg           6738 non-null   float64
 8   engineSize    6738 non-null   float64
dtypes: float64(3), int64(3), object(3)
memory usage: 473.9+ KB


In [7]:
# Count the missing values in every column (isna is the same check as isnull).
new_dataset.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

## Check the distinct values of the categorical and discrete columns

In [8]:
# List the distinct model names.
# NOTE(review): the values shown carry a leading space (e.g. ' GT86') and are
# exported unchanged; consider new_dataset['model'].str.strip() before export
# once it is confirmed that no downstream step relies on the padded names.
new_dataset['model'].unique()

array([' GT86', ' Corolla', ' RAV4', ' Yaris', ' Auris', ' Aygo', ' C-HR',
       ' Prius', ' Avensis', ' Verso', ' Hilux', ' PROACE VERSO',
       ' Land Cruiser', ' Supra', ' Camry', ' Verso-S', ' IQ',
       ' Urban Cruiser'], dtype=object)

In [9]:
# List the distinct transmission labels; the output includes a catch-all 'Other'.
new_dataset['transmission'].unique()

array(['Manual', 'Automatic', 'Semi-Auto', 'Other'], dtype=object)

In [10]:
# List the distinct fuel types; the output includes a catch-all 'Other'.
new_dataset['fuelType'].unique()

array(['Petrol', 'Other', 'Hybrid', 'Diesel'], dtype=object)

In [14]:
# List the distinct values of the integer 'year' column, inspected here
# alongside the categorical columns.
new_dataset['year'].unique()

array([2016, 2017, 2015, 2020, 2013, 2019, 2018, 2014, 2012, 2005, 2003,
       2004, 2001, 2008, 2007, 2010, 2011, 2006, 2009, 2002, 1999, 2000,
       1998])

In [20]:
# List the distinct engine sizes.
# NOTE(review): 0.0 appears among the values; presumably a placeholder for a
# missing engine size. Confirm, and decide whether those rows should be kept.
new_dataset['engineSize'].unique()

array([2. , 1.8, 1.2, 1.6, 1.4, 2.5, 2.2, 1.5, 1. , 1.3, 0. , 2.4, 3. ,
       2.8, 4.2, 4.5])

## Reorder columns (move `price` to the end)

In [11]:
# Put 'price' last, after every other column, by selecting the columns in an
# explicit order.
feature_columns = [
    'model',
    'year',
    'transmission',
    'mileage',
    'fuelType',
    'tax',
    'mpg',
    'engineSize',
]
target_column = 'price'
new_dataset = new_dataset[feature_columns + [target_column]]
new_dataset

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,GT86,2016,Manual,24089,Petrol,265.0,36.2,2.0,16000
1,GT86,2017,Manual,18615,Petrol,145.0,36.2,2.0,15995
2,GT86,2015,Manual,27469,Petrol,265.0,36.2,2.0,13998
3,GT86,2017,Manual,14736,Petrol,150.0,36.2,2.0,18998
4,GT86,2017,Manual,36284,Petrol,145.0,36.2,2.0,17498
...,...,...,...,...,...,...,...,...,...
6733,IQ,2011,Automatic,30000,Petrol,20.0,58.9,1.0,5500
6734,Urban Cruiser,2011,Manual,36154,Petrol,125.0,50.4,1.3,4985
6735,Urban Cruiser,2012,Manual,46000,Diesel,125.0,57.6,1.4,4995
6736,Urban Cruiser,2011,Manual,60700,Petrol,125.0,50.4,1.3,3995


In [12]:
# Re-check the schema after the reorder; 'price' should now be the last column.
new_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6738 entries, 0 to 6737
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         6738 non-null   object 
 1   year          6738 non-null   int64  
 2   transmission  6738 non-null   object 
 3   mileage       6738 non-null   int64  
 4   fuelType      6738 non-null   object 
 5   tax           6738 non-null   float64
 6   mpg           6738 non-null   float64
 7   engineSize    6738 non-null   float64
 8   price         6738 non-null   int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 473.9+ KB


## Export new data set

In [13]:
# Write the prepared frame to a CSV file for the next step of the workflow.
# index=False keeps the row labels out of the file.
export_path = 'main_data.csv'
new_dataset.to_csv(export_path, index=False)