**Importing the Dependencies**

In [398]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

**Data Collection**

In [399]:
# Read the labelled training set and the test set from the Colab workspace.
train_df = pd.read_csv(filepath_or_buffer="/content/Training_Data_Set.csv")
test_df = pd.read_csv(filepath_or_buffer="/content/Test_Data_Set.csv")

In [400]:
# Training-set dimensions as (rows, columns).
train_df.shape

(53515, 17)

In [401]:
# Test-set dimensions as (rows, columns).
test_df.shape

(5615, 16)

# Data Pre-Processing

In [402]:
# Count missing (NaN) entries per column: training set first, then the test set.
print(
    train_df.isna().sum(),
    "----------------------",
    test_df.isna().sum(),
    sep="\n",
)

Id                         0
Maker                      0
model                      0
Location                   0
Distance                1211
Owner Type                 0
manufacture_year           0
Age of car                 0
engine_displacement        0
engine_power            1439
body_type              49379
Vroom Audit Rating         0
transmission               0
door_count                 0
seat_count                 0
fuel_type                  0
Price                      0
dtype: int64
----------------------
Id                        0
Maker                     0
model                     0
Location                  0
Distance                128
Owner Type                0
manufacture_year          0
Age of car                0
engine_displacement       0
engine_power            168
body_type              5193
Vroom Audit Rating        0
transmission              0
door_count                0
seat_count                0
fuel_type                 0
dtype: int64


### Dropping the `body_type` column: 49,379 of the 53,515 training rows (about 92%) are NaN


In [403]:
# body_type is almost entirely missing, so remove it from both frames in place.
for frame in (train_df, test_df):
    frame.drop(columns="body_type", inplace=True)


In [404]:
# Discard every training row that still has at least one NaN.
# The original index labels are kept (the index is not reset).
train_df.dropna(axis="index", how="any", inplace=True)


In [405]:
# Test rows cannot be dropped (every Id needs a prediction), so fill the
# remaining NaNs with the column mean instead.
# numeric_only=True restricts the mean to numeric columns: the frame still has
# string columns at this point, and calling mean() on them emits a
# FutureWarning on older pandas and raises TypeError on pandas >= 2.0.
test_df.fillna(test_df.mean(numeric_only=True), inplace=True)

  test_df.fillna(test_df.mean(), inplace = True)


In [406]:
# Confirm that no test rows were lost and that body_type is gone.
test_df.shape

(5615, 15)

In [407]:
# Re-count NaNs per column after cleaning; every count should now be zero.
print(
    train_df.isna().sum(),
    "----------------------",
    test_df.isna().sum(),
    sep="\n",
)

Id                     0
Maker                  0
model                  0
Location               0
Distance               0
Owner Type             0
manufacture_year       0
Age of car             0
engine_displacement    0
engine_power           0
Vroom Audit Rating     0
transmission           0
door_count             0
seat_count             0
fuel_type              0
Price                  0
dtype: int64
----------------------
Id                     0
Maker                  0
model                  0
Location               0
Distance               0
Owner Type             0
manufacture_year       0
Age of car             0
engine_displacement    0
engine_power           0
Vroom Audit Rating     0
transmission           0
door_count             0
seat_count             0
fuel_type              0
dtype: int64


In [408]:
# Count, per column, the cells holding the literal string 'None'.
# These are text placeholders, not NaN, so isnull() does not detect them.
print(test_df.isin(['None']).sum())

Id                       0
Maker                    0
model                    0
Location                 0
Distance                 0
Owner Type               0
manufacture_year         0
Age of car               0
engine_displacement      0
engine_power             0
Vroom Audit Rating       0
transmission             0
door_count             806
seat_count             891
fuel_type                0
dtype: int64


# Replacing 'None' placeholders


In [409]:
# Replace the 'None' string placeholder in the seat/door count columns with 0
# so that both columns can be cast to int in the next cell.
#
# The result is assigned back to the column rather than calling
# frame[col].replace(..., inplace=True): that chained in-place form mutates a
# temporary Series, which pandas warns about and which has no effect once
# Copy-on-Write is enabled.
#
# NOTE(review): 0 is not a plausible seat or door count. Imputing the most
# frequent value of each column may be a better choice -- confirm.
for frame in (train_df, test_df):
    for count_col in ('seat_count', 'door_count'):
        frame[count_col] = frame[count_col].replace('None', 0)

In [410]:
# Cast the two count columns to integers in both frames, then print the
# resulting schema of each frame.
int_cols = dict.fromkeys(['seat_count', 'door_count'], 'int')
train_df, test_df = train_df.astype(int_cols), test_df.astype(int_cols)

train_df.info()
print("------------------")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51119 entries, 1 to 53514
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   51119 non-null  int64  
 1   Maker                51119 non-null  object 
 2   model                51119 non-null  object 
 3   Location             51119 non-null  object 
 4   Distance             51119 non-null  float64
 5   Owner Type           51119 non-null  object 
 6   manufacture_year     51119 non-null  int64  
 7   Age of car           51119 non-null  int64  
 8   engine_displacement  51119 non-null  int64  
 9   engine_power         51119 non-null  float64
 10  Vroom Audit Rating   51119 non-null  int64  
 11  transmission         51119 non-null  object 
 12  door_count           51119 non-null  int64  
 13  seat_count           51119 non-null  int64  
 14  fuel_type            51119 non-null  object 
 15  Price                51119 non-null 

In [411]:
# Verify that no 'None' string placeholders remain: test set first, then training set.
print(
    test_df.isin(['None']).sum(),
    "------------------------------",
    train_df.isin(['None']).sum(),
    sep="\n",
)

Id                     0
Maker                  0
model                  0
Location               0
Distance               0
Owner Type             0
manufacture_year       0
Age of car             0
engine_displacement    0
engine_power           0
Vroom Audit Rating     0
transmission           0
door_count             0
seat_count             0
fuel_type              0
dtype: int64
------------------------------
Id                     0
Maker                  0
model                  0
Location               0
Distance               0
Owner Type             0
manufacture_year       0
Age of car             0
engine_displacement    0
engine_power           0
Vroom Audit Rating     0
transmission           0
door_count             0
seat_count             0
fuel_type              0
Price                  0
dtype: int64


-------------------------------------------------

# **Checking for distribution of categorical data**

In [412]:
# Frequency of each transmission category in the test set.
print(test_df['transmission'].value_counts())

man     3893
auto    1722
Name: transmission, dtype: int64


In [413]:
# Frequency of each fuel type category in the test set.
print(test_df['fuel_type'].value_counts())

diesel    2883
petrol    2732
Name: fuel_type, dtype: int64


**Encoding categorical data**

In [414]:
# body_type encoding is disabled: the column was dropped earlier because it is almost entirely NaN.
# test_df.replace({'body_type':{'compact':0, 'van':1}},inplace=True)
# train_df.replace({'body_type':{'compact':0, 'van':1}},inplace=True)


In [415]:
# Encode transmission as an integer: manual -> 0, automatic -> 1.
transmission_codes = {'man': 0, 'auto': 1}
for frame in (test_df, train_df):
    frame.replace({'transmission': transmission_codes}, inplace=True)



In [416]:
# Encode fuel_type as an integer: diesel -> 0, petrol -> 1.
fuel_type_codes = {'diesel': 0, 'petrol': 1}
for frame in (test_df, train_df):
    frame.replace({'fuel_type': fuel_type_codes}, inplace=True)

In [417]:
# List the distinct makers in the test set; the encoding below covers these values.
test_df['Maker'].unique()

array(['skoda', 'audi', 'bmw', 'toyota', 'nissan', 'fiat', 'hyundai',
       'maserati'], dtype=object)

In [418]:
# Encode Maker with arbitrary integer labels, using the same mapping for both frames.
maker_codes = {
    'skoda': 0,
    'fiat': 1,
    'bmw': 2,
    'nissan': 3,
    'audi': 4,
    'toyota': 5,
    'hyundai': 6,
    'maserati': 7,
}
for frame in (test_df, train_df):
    frame.replace({'Maker': maker_codes}, inplace=True)


In [419]:
# Rename "Owner Type" to "Owner_Type" (no space) in both frames.
owner_rename = {"Owner Type": "Owner_Type"}
for frame in (test_df, train_df):
    frame.rename(columns=owner_rename, inplace=True)


In [420]:
# Encode Owner_Type as the ownership ordinal (First -> 1 ... Fourth & Above -> 4).
owner_type_codes = {'First': 1, 'Second': 2, 'Third': 3, 'Fourth & Above': 4}
for frame in (test_df, train_df):
    frame.replace({'Owner_Type': owner_type_codes}, inplace=True)




In [421]:
# Encode Location with arbitrary integer labels, using the same mapping for both frames.
location_codes = {
    'Mumbai': 0,
    'Coimbatore': 1,
    'Delhi': 2,
    'Chennai': 3,
    'Bangalore': 4,
    'Pune': 5,
    'Jaipur': 6,
    'Hyderabad': 7,
    'Ahmedabad': 8,
    'Kolkata': 9,
    'Kochi': 10,
}
for frame in (test_df, train_df):
    frame.replace({'Location': location_codes}, inplace=True)

In [422]:
# Encode model with arbitrary integer labels, using the same mapping for both frames.
# Code 17 is not assigned to any model in this mapping.
model_codes = {
    'octavia': 0,
    'roomster': 1,
    'yaris': 2,
    'i30': 3,
    'superb': 4,
    'avensis': 5,
    'yeti': 6,
    'x1': 7,
    'x3': 8,
    'coupe': 9,
    'panda': 10,
    'qashqai': 11,
    'aygo': 12,
    'micra': 13,
    'q7': 14,
    'rapid': 15,
    'x5': 16,
    'tt': 18,
    'q5': 19,
    'citigo': 20,
    'q3': 21,
    'auris': 22,
    'juke': 23,
}
for frame in (test_df, train_df):
    frame.replace({'model': model_codes}, inplace=True)

In [423]:
# Preview the first ten test rows to check that the encoded columns are numeric.
test_df.head(10)

Unnamed: 0,Id,Maker,model,Location,Distance,Owner_Type,manufacture_year,Age of car,engine_displacement,engine_power,Vroom Audit Rating,transmission,door_count,seat_count,fuel_type
0,11001,0,0,0,150000.0,1,2007,12,1595,75.0,5,0,4,5,1
1,11002,0,15,1,29376.0,3,2014,5,1598,77.0,4,0,4,5,0
2,11003,0,0,1,30563.0,2,2014,5,1968,110.0,5,0,5,5,0
3,11004,4,21,2,8650.0,4,2015,4,1968,110.0,8,1,4,5,0
4,11005,4,18,3,6400.0,3,2015,4,1984,169.0,4,1,2,0,1
5,11006,4,18,2,3000.0,3,2015,4,1968,135.0,8,0,2,2,0
6,11007,2,8,3,10.0,4,2015,4,2979,20.0,5,1,0,5,1
7,11008,4,21,4,18000.0,3,2014,5,1968,130.0,6,1,4,5,0
8,11009,0,0,4,270.0,4,2006,13,2000,103.0,5,0,0,0,1
9,11010,5,22,2,3000.0,3,2015,4,1197,85.0,5,0,4,5,1


# Train Test Split


In [424]:
from sklearn.model_selection import train_test_split

In [425]:

# Separate the target (Price) from the features, then hold out 20% of the
# labelled rows for validation. random_state=0 makes the split reproducible.
# NOTE(review): the `Id` column stays in the feature matrix; it looks like a
# plain row identifier -- confirm whether it should be used as a predictor.
y = train_df['Price']
x = train_df.drop(columns=['Price'])

X_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [426]:
# Preview the feature matrix (all training columns except Price).
x.head(10)

Unnamed: 0,Id,Maker,model,Location,Distance,Owner_Type,manufacture_year,Age of car,engine_displacement,engine_power,Vroom Audit Rating,transmission,door_count,seat_count,fuel_type
1,25002,1,10,8,27750.0,3,2012,7,1242,51.0,6,0,4,4,1
2,25003,2,7,7,46000.0,3,2014,5,1995,105.0,7,1,4,5,0
3,25004,3,23,0,43949.0,3,2011,8,1618,140.0,7,0,4,5,1
4,25005,2,16,6,59524.0,4,2012,7,2993,180.0,7,1,4,5,0
5,25006,0,0,3,12015.0,1,2015,4,1968,110.0,4,0,4,5,0
6,25007,0,4,1,181000.0,4,2009,10,1968,125.0,6,1,5,5,0
7,25008,1,10,3,33100.0,2,2010,9,1108,40.0,5,0,5,5,1
8,25009,3,11,1,17375.0,1,2015,4,1600,96.0,5,0,4,5,0
9,25010,0,4,0,97640.0,4,2010,9,2000,103.0,6,0,5,5,0
10,25011,0,4,10,208000.0,1,2010,9,1800,118.0,4,0,0,0,1


In [427]:
# Number of target values; this equals the number of rows in train_df.
y.shape

(51119,)

# Model Training




# Linear Regression

In [428]:
# Baseline model: ordinary least-squares regression fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [429]:
# Linear-model predictions for the held-out split.
# Note: y_pred is rebound later in the notebook to the random forest's predictions.
y_pred = lr_model.predict(x_test)

In [430]:
# Score of the linear model on the held-out split
# (for scikit-learn regressors, score() is the R^2 coefficient).
lr_model.score(x_test, y_test)

0.6642534635658804

# Decision Tree

In [431]:
from sklearn.tree import DecisionTreeRegressor

In [432]:
# Fit a single regression tree on the training split.
# random_state is fixed (matching the seed used for the train/test split) so
# that the reported score is the same on every Restart & Run All.
decision = DecisionTreeRegressor(random_state=0)
decision.fit(X_train , y_train)


In [433]:
# Score of the decision tree on the held-out split
# (for scikit-learn regressors, score() is the R^2 coefficient).
decision.score(x_test , y_test)

0.8459080812366899

# Random Forest

In [434]:
from sklearn.ensemble import RandomForestRegressor

In [435]:
# Fit a random forest on the training split.
# random_state is fixed (matching the seed used for the train/test split) so
# that the validation metrics and the final predictions written to CSV are
# reproducible across runs.
random_forest = RandomForestRegressor(random_state=0)
random_forest.fit(X_train , y_train)

In [436]:
# Score of the random forest on the held-out split
# (for scikit-learn regressors, score() is the R^2 coefficient).
random_forest.score(x_test , y_test)

0.9054431871837518

In [437]:
# Column dtypes of the training features, for comparison with test_df below.
X_train.dtypes

Id                       int64
Maker                    int64
model                    int64
Location                 int64
Distance               float64
Owner_Type               int64
manufacture_year         int64
Age of car               int64
engine_displacement      int64
engine_power           float64
Vroom Audit Rating       int64
transmission             int64
door_count               int64
seat_count               int64
fuel_type                int64
dtype: object

In [438]:
# Column dtypes of the test frame; these should mirror X_train's columns and dtypes.
test_df.dtypes

Id                       int64
Maker                    int64
model                    int64
Location                 int64
Distance               float64
Owner_Type               int64
manufacture_year         int64
Age of car               int64
engine_displacement      int64
engine_power           float64
Vroom Audit Rating       int64
transmission             int64
door_count               int64
seat_count               int64
fuel_type                int64
dtype: object

In [439]:
# Evaluate the random forest on the held-out split (this rebinds y_pred)
# and report RMSE and R^2 rounded to two decimals.
y_pred = random_forest.predict(x_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean squared error: {rmse:.2f}')
print(f'R2 score: {r2:.2f}')

Root Mean squared error: 258225.70
R2 score: 0.91


# XGBoost

In [440]:
from xgboost import XGBRegressor

In [441]:
# Peek at two rows of the training split before fitting XGBoost.
X_train.head(2)

Unnamed: 0,Id,Maker,model,Location,Distance,Owner_Type,manufacture_year,Age of car,engine_displacement,engine_power,Vroom Audit Rating,transmission,door_count,seat_count,fuel_type
53071,78072,0,0,1,289300.0,4,2004,15,1900,77.0,4,1,0,0,0
41123,66124,2,16,7,164267.0,3,2012,7,2993,180.0,7,1,4,5,0


In [442]:
# Schema of the training split: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40895 entries, 53071 to 2867
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   40895 non-null  int64  
 1   Maker                40895 non-null  int64  
 2   model                40895 non-null  int64  
 3   Location             40895 non-null  int64  
 4   Distance             40895 non-null  float64
 5   Owner_Type           40895 non-null  int64  
 6   manufacture_year     40895 non-null  int64  
 7   Age of car           40895 non-null  int64  
 8   engine_displacement  40895 non-null  int64  
 9   engine_power         40895 non-null  float64
 10  Vroom Audit Rating   40895 non-null  int64  
 11  transmission         40895 non-null  int64  
 12  door_count           40895 non-null  int64  
 13  seat_count           40895 non-null  int64  
 14  fuel_type            40895 non-null  int64  
dtypes: float64(2), int64(13)
memory u

In [443]:
# Define the XGBoost regressor with the library's default hyperparameters.
xgb_model = XGBRegressor()

In [444]:
# Train the XGBoost model on the training split.
xgb_model.fit(X_train, y_train)

In [445]:
# Score the XGBoost model on the held-out split. This returns a single metric
# rather than predictions, and is directly comparable with the scores above.
xgb_model.score(x_test, y_test)

0.9022173872215008

In [446]:
# Re-check the training feature dtypes before predicting on the real test set.
X_train.dtypes

Id                       int64
Maker                    int64
model                    int64
Location                 int64
Distance               float64
Owner_Type               int64
manufacture_year         int64
Age of car               int64
engine_displacement      int64
engine_power           float64
Vroom Audit Rating       int64
transmission             int64
door_count               int64
seat_count               int64
fuel_type                int64
dtype: object

In [447]:
# Re-check the test frame dtypes; they must match X_train for prediction to work.
test_df.dtypes

Id                       int64
Maker                    int64
model                    int64
Location                 int64
Distance               float64
Owner_Type               int64
manufacture_year         int64
Age of car               int64
engine_displacement      int64
engine_power           float64
Vroom Audit Rating       int64
transmission             int64
door_count               int64
seat_count               int64
fuel_type                int64
dtype: object

In [448]:
# Predict prices for the real test set with the random forest.
# The columns are selected explicitly in X_train's order so that the features
# passed to predict() always match the schema the model was fitted on, even if
# test_df gains extra columns or its column order changes.
predict_randomforest = random_forest.predict(test_df[X_train.columns])

In [449]:
# Pair every test-set Id with its predicted price, preserving row order.
id_testdf = test_df['Id'].tolist()
data = [(car_id, price) for car_id, price in zip(id_testdf, predict_randomforest)]

In [450]:
# Build the submission table: one row per test Id with its predicted Price.
predicted_df = pd.DataFrame(data,columns = ['Id','Price'])

In [451]:
# Preview the first ten predictions.
predicted_df.head(10)

Unnamed: 0,Id,Price
0,11001,366353.4
1,11002,964901.4
2,11003,1566627.0
3,11004,2522130.0
4,11005,3173740.0
5,11006,2816934.0
6,11007,3259811.0
7,11008,2660134.0
8,11009,417908.0
9,11010,1466211.0


In [453]:
# Write the predictions to a comma-separated file in the working directory,
# without the DataFrame index column.
predicted_df.to_csv('Predicted_price.csv', index=False)