In [1]:
import numpy as np
import pandas as pd

In [83]:
# Load the car-price dataset from a CSV file in the working directory.
# NOTE(review): the commented-out line below reads a differently named file
# ("cars24-car-price.csv") — presumably an earlier copy of the data; confirm
# which file is authoritative and drop the stale line.
# train_df = pd.read_csv("cars24-car-price.csv")

train_df = pd.read_csv("car-price.csv")

In [84]:
# Preview the first five rows to sanity-check the load.
train_df.head()

Unnamed: 0,full_name,selling_price,year,seller_type,km_driven,fuel_type,transmission_type,mileage,engine,max_power
0,Maruti Alto Std,1.2,2012.0,Individual,120000,Petrol,Manual,19.7,796.0,46.3
1,Hyundai Grand i10 Asta,5.5,2016.0,Individual,20000,Petrol,Manual,18.9,1197.0,82.0
2,Hyundai i20 Asta,2.15,2010.0,Individual,60000,Petrol,Manual,17.0,1197.0,80.0
3,Maruti Alto K10 2010-2014 VXI,2.26,2012.0,Individual,37000,Petrol,Manual,20.92,998.0,67.1
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,5.7,2015.0,Dealer,30000,Diesel,Manual,22.77,1498.0,98.59


In [85]:
# Summarise each column's dtype and non-null count.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19980 entries, 0 to 19979
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   full_name          19980 non-null  object 
 1   selling_price      19980 non-null  float64
 2   year               19980 non-null  float64
 3   seller_type        19980 non-null  object 
 4   km_driven          19980 non-null  int64  
 5   fuel_type          19980 non-null  object 
 6   transmission_type  19980 non-null  object 
 7   mileage            19980 non-null  float64
 8   engine             19980 non-null  float64
 9   max_power          19980 non-null  float64
dtypes: float64(5), int64(1), object(4)
memory usage: 1.5+ MB


## Encoding of categorical values

In [86]:
# List the distinct seller_type labels that the encoding dictionary must cover.
train_df['seller_type'].unique()

array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object)

In [87]:
# List the distinct transmission_type labels that the encoding dictionary must cover.
train_df['transmission_type'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [88]:
# List the distinct fuel_type labels that the encoding dictionary must cover.
train_df['fuel_type'].unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'], dtype=object)

In [89]:
# Lookup tables that turn each categorical feature's labels into integer codes.
# Every label's code is its 1-based position in the tuple listed for that feature,
# e.g. "Diesel" -> 1, "Petrol" -> 2, ... for fuel_type.
encode_dict = {
    feature: {label: code for code, label in enumerate(labels, start=1)}
    for feature, labels in (
        ("fuel_type", ("Diesel", "Petrol", "CNG", "LPG", "Electric")),
        ("seller_type", ("Dealer", "Individual", "Trustmark Dealer")),
        ("transmission_type", ("Manual", "Automatic")),
    )
}

In [90]:
# Replace the labels in every categorical column with the integer codes from
# encode_dict. One loop covers all encoded columns instead of one copy-pasted
# cell per column.
for column, codes in encode_dict.items():
    # Skip columns that are already numeric: Series.map() on already-encoded
    # integers finds no matching keys and would overwrite the column with NaN,
    # so without this guard the cell could not be re-run safely.
    if pd.api.types.is_numeric_dtype(train_df[column]):
        continue

    encoded = train_df[column].map(codes)

    # Series.map() yields NaN for any label missing from the dictionary; fail
    # loudly here rather than passing NaN features on to model training.
    unknown = train_df.loc[encoded.isna(), column].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no code in encode_dict: {list(unknown)}"
        )

    train_df[column] = encoded

In [93]:
# Re-inspect the first rows to confirm the categorical columns now hold integer codes.
train_df.head()

Unnamed: 0,full_name,selling_price,year,seller_type,km_driven,fuel_type,transmission_type,mileage,engine,max_power
0,Maruti Alto Std,1.2,2012.0,2,120000,2,1,19.7,796.0,46.3
1,Hyundai Grand i10 Asta,5.5,2016.0,2,20000,2,1,18.9,1197.0,82.0
2,Hyundai i20 Asta,2.15,2010.0,2,60000,2,1,17.0,1197.0,80.0
3,Maruti Alto K10 2010-2014 VXI,2.26,2012.0,2,37000,2,1,20.92,998.0,67.1
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,5.7,2015.0,1,30000,1,1,22.77,1498.0,98.59


## Checking for missing values

In [94]:
# Count the missing values in each column.
train_df.isnull().sum()

full_name            0
selling_price        0
year                 0
seller_type          0
km_driven            0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
dtype: int64

### Splitting the data

In [95]:
# Select features and target by column name rather than by position, so a change
# in the CSV's column order cannot silently feed the wrong columns to the model.
# X: every column except the free-text car name and the target.
# y: the selling price to be predicted.
X = train_df.drop(columns=['full_name', 'selling_price'])
y = train_df['selling_price']

In [96]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.2, random_state=5)

### Model training

In [97]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split.
# NOTE(review): seller_type, fuel_type and transmission_type are passed in as
# integer codes, so the model treats them as ordered numeric quantities —
# confirm this is intended (one-hot encoding is the usual alternative).
model = LinearRegression()
model.fit(X_train, Y_train)

LinearRegression()

### Testing

In [98]:
from sklearn.metrics import r2_score

# Evaluate on the held-out validation rows: predict first, then compare the
# predictions with the true prices using the R² score.
val_predictions = model.predict(X_val)
score = r2_score(Y_val, val_predictions)
score

0.5161051198190391

## Saving the model

In [99]:
import pickle

# Serialise the fitted model to car_pred.pkl. The context manager closes the
# file even if pickle.dump raises, so a failed save cannot leak an open handle.
with open('car_pred.pkl', mode='wb') as pickle_out:
    pickle.dump(model, pickle_out)

## Test prediction for a sample data point

In [100]:
# Show the feature column names and their order; a sample passed to the model
# needs the same columns.
X_val.keys()

Index(['year', 'seller_type', 'km_driven', 'fuel_type', 'transmission_type',
       'mileage', 'engine', 'max_power'],
      dtype='object')

In [45]:
# NOTE(review): stale, commented-out sample kept from an earlier run. It lists
# 9 values while the feature set has 8 columns, so it could not be passed to
# the current model as-is — presumably safe to delete; confirm.
# car_features = [[2018, 1, 40000, 1, 1, 19.7, 500, 86.3, 2]]

In [101]:
# One sample car, keyed by feature name; each value is a one-element list so the
# dict converts to a single-row DataFrame. Categorical fields use the integer
# codes defined in encode_dict.
car_features = {
    "year" : [2018],
    "seller_type" : [1],  # 1 = "Dealer" in encode_dict
    "km_driven" : [40000],
    "fuel_type" : [1],  # 1 = "Diesel" in encode_dict
    "transmission_type" : [1],  # 1 = "Manual" in encode_dict
    "mileage" : [19.7],
    "engine" : [500],
    "max_power" : [86.3]
} 

In [102]:
# Reload the model saved earlier in this notebook. The context manager makes
# sure the file handle is closed once loading finishes (or fails).
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles produced by a trusted source such as this notebook.
with open('car_pred.pkl', mode='rb') as file:
    model = pickle.load(file)

In [103]:
# Convert the sample dict into a single-row DataFrame whose column names are the
# feature names. Note that car_features is rebound from a dict to a DataFrame here.
car_features = pd.DataFrame(data = car_features)
car_features

Unnamed: 0,year,seller_type,km_driven,fuel_type,transmission_type,mileage,engine,max_power
0,2018,1,40000,1,1,19.7,500,86.3


In [104]:
# Predict the selling price for the sample car; the result is on the same scale
# as the selling_price column the model was trained on.
model.predict(car_features)

array([5.00564837])