# Ford used car Machine Learning price prediction


The goal of this notebook is to use machine learning techniques to predict the prices of used Ford cars from the Ford used-car dataset.
    

## Importing of required libraries and packages

In [1]:
# import the data analysis libraries
import pandas as pd
import numpy as np

# Data Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder


# Plot appearance, configured once for the whole notebook.

# 'talk' plotting context, chosen to generate clear images
sns.set_context(context='talk')

# desired colour palette, stored in plot_color for use when plotting
plot_color = sns.color_palette(palette="colorblind")

# dark grid as the background style of the images
sns.set_style(style='darkgrid')

### Reading CSV file

In [2]:
# Load the Ford used-car data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="ford.csv")

# Preview the first five rows
display(df.head(n=5))

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,48.7,1.0


### Given Feature variables:
**model**: categories of the ford car
<br>**year**: the year car was made
<br>**transmission**: the type of transmission the car has
<br>**mileage**: the number of miles the vehicle has driven
<br>**fuelType**: energy source of the vehicle
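<br>**tax**: road tax (note: this column is not present in the `ford.csv` data loaded above, so it is not used in this notebook)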
<br>**mpg**: miles per gallon the vehicle can travel
<br>**engineSize**: Engine size is the volume of fuel and air that can be pushed through a car's cylinders

### Target Variable:
**price**: selling price of the car

### Check for missing values and data type

In [3]:
# Check for missing values and the data type of every column.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would render a stray "None" under the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17964 non-null  object 
 1   year          17964 non-null  int64  
 2   price         17964 non-null  int64  
 3   transmission  17964 non-null  object 
 4   mileage       17964 non-null  int64  
 5   fuelType      17964 non-null  object 
 6   mpg           17964 non-null  float64
 7   engineSize    17964 non-null  float64
dtypes: float64(2), int64(3), object(3)
memory usage: 1.1+ MB


None

Number of observations (rows) = 17964
<br>Number of columns = 8 (7 feature variables and 1 target variable)
<br>Number of categorical variables = 3: model, transmission, fuelType
<br>Number of numerical variables = 4: year, mileage, mpg, engineSize
#### The dataset has no missing values

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
numeric_summary = df.describe()
display(numeric_summary)

Unnamed: 0,year,price,mileage,mpg,engineSize
count,17964.0,17964.0,17964.0,17964.0,17964.0
mean,2016.864173,12280.078435,23361.880149,57.907832,1.350824
std,2.024987,4741.318119,19471.243292,10.125632,0.432383
min,1996.0,495.0,1.0,20.8,0.0
25%,2016.0,8999.0,9987.0,52.3,1.0
50%,2017.0,11291.0,18242.5,58.9,1.2
75%,2018.0,15299.0,31052.0,65.7,1.5
max,2020.0,54995.0,177644.0,201.8,5.0


## Splitting Dataset

### Separating the features and target variable

In [8]:
# Names of the numerical predictor columns and of the column to predict
features = ['mileage', 'year', 'mpg', 'engineSize']
target = ['price']

# Slice predictors and target out of the full frame.
# `target` is a list, so y is a one-column DataFrame rather than a Series.
X = df.loc[:, features]
y = df.loc[:, target]

# Disabled alternatives, kept for reference.
# NOTE(review): df_one and df_le are not defined in the cells visible here —
# confirm they are built elsewhere before re-enabling either variant.
#
# One hot encoded inputs:
# X = df_one.drop('price', axis=1)
# y = df_one[target]
#
# Label encoded inputs:
# X = df_le.drop('price', axis=1)
# y = df_le[target]

# Report the (rows, columns) shape of both frames
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

Shape of X:  (17964, 4)
Shape of y:  (17964, 1)


### Splitting the Data into Train Set and Test Set

In [9]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps
# the split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each piece of the split
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)


X_train shape:  (14371, 4)
y_train shape:  (14371, 1)
X_test shape:  (3593, 4)
y_test shape:  (3593, 1)


## Training Model

### Perform Linear Regression


In [10]:
# Instantiate the linear regression model and fit it to the training split
# in one step (fit returns the fitted estimator itself)
model = LinearRegression().fit(X_train, y_train)

## Prediction

In [13]:
# Run the fitted model on the test set features to get predicted prices
y_pred = model.predict(X=X_test)

In [17]:
# Side-by-side table of actual and predicted prices.
# The frame starts from the actual test prices (keeping their original row index),
# then gains the predictions and a running 0..n-1 observation counter.
df_predict = pd.DataFrame({'original_price': y_test['price']}).assign(
    predicted_price=y_pred,
    observation=np.arange(len(y_test)),
)

display(df_predict.head())

Unnamed: 0,original_price,predicted_price,observation
1087,16700,12511.620219,0
9367,9690,11728.136925,1
4705,10999,12146.831471,2
10336,29350,23221.441209,3
8509,11250,12057.247285,4


### Prediction Error


In [18]:
# Prediction error

# Root mean squared error (RMSE) between the actual and predicted test prices.
# The square root of the MSE is taken explicitly instead of passing
# squared=False: that keyword was deprecated and later removed from
# scikit-learn's mean_squared_error, so it fails on current releases.
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
print("root mean squared error = ", RMSE)


root mean squared error =  2471.5647447160904
