<a href="https://colab.research.google.com/github/yashveersinghsohi/Car_Price_Prediction/blob/master/Modeling/CarPrice_06_Prediction_on_Test_set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Packages

In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import pickle

# Data

In [2]:
# Raw (pre-feature-engineering) test split, read straight from the project's GitHub repo.
root_dir = "https://raw.githubusercontent.com/yashveersinghsohi/Car_Price_Prediction/master/Data/EDA_Data/"

test_features_path = f"{root_dir}test_features.csv"
test_targets_path = f"{root_dir}test_targets.csv"

test_features, test_targets = (
    pd.read_csv(test_features_path),
    pd.read_csv(test_targets_path),
)

# Report (rows, columns) of each frame as a quick load check.
for label, frame in (("Test Features", test_features), ("Test Targets", test_targets)):
    print(f"{label}: {frame.shape}")

Test Features: (1924, 16)
Test Targets: (1924, 1)


In [3]:
# Raw (pre-feature-engineering) train and validation splits from EDA_Data.
# The cells further down read the raw training categoricals from `train_features`.
root_dir = "https://raw.githubusercontent.com/yashveersinghsohi/Car_Price_Prediction/master/Data/EDA_Data/"

train_features_path = f"{root_dir}train_features.csv"
train_targets_path = f"{root_dir}train_targets.csv"
val_features_path = f"{root_dir}val_features.csv"
val_targets_path = f"{root_dir}val_targets.csv"

train_features = pd.read_csv(train_features_path)
train_targets = pd.read_csv(train_targets_path)
val_features = pd.read_csv(val_features_path)
val_targets = pd.read_csv(val_targets_path)

# Shapes of both splits, separated by one blank line.
print(f"Train Features: {train_features.shape}")
print(f"Train Targets: {train_targets.shape}")
print()
print(f"Validation Features: {val_features.shape}")
print(f"Validation Targets: {val_targets.shape}")

Train Features: (13850, 16)
Train Targets: (13850, 1)

Validation Features: (3463, 16)
Validation Targets: (3463, 1)


In [4]:
# Feature-engineered train and validation splits.
# NOTE(review): this cell rebinds `root_dir`, the four `*_path` variables,
# `train_targets` and `val_targets`, all of which the previous cell set from
# EDA_Data. After it runs, `train_targets` has a different row count than the
# raw `train_features` frame (see the printed shapes) — keep that in mind
# wherever the two are combined.
root_dir = "https://raw.githubusercontent.com/yashveersinghsohi/Car_Price_Prediction/master/Data/Feature_Engineering_Data/"

train_features_path = f"{root_dir}train_features_df.csv"
train_targets_path = f"{root_dir}train_targets.csv"
val_features_path = f"{root_dir}val_features_df.csv"
val_targets_path = f"{root_dir}val_targets.csv"

train_features_df = pd.read_csv(train_features_path)
train_targets = pd.read_csv(train_targets_path)
val_features_df = pd.read_csv(val_features_path)
val_targets = pd.read_csv(val_targets_path)

# Shapes of both splits, separated by one blank line.
print(f"Train Features: {train_features_df.shape}")
print(f"Train Targets: {train_targets.shape}")
print()
print(f"Validation Features: {val_features_df.shape}")
print(f"Validation Targets: {val_targets.shape}")

Train Features: (13351, 30)
Train Targets: (13351, 1)

Validation Features: (3463, 30)
Validation Targets: (3463, 1)


# Data Processing

Apply the same feature-engineering steps to the test set that were used for the train and validation sets.

In [5]:
# Display the raw test-set column names; the cells below read these columns one by one.
test_features.columns

Index(['Prod. year', 'Cylinders', 'Airbags', 'Levy', 'Manufacturer', 'Model',
       'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',
       'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color'],
      dtype='object')

In [6]:
# Start from an empty frame; the cells below add one engineered column (or group of columns) at a time.
test_features_df = pd.DataFrame()

Prod. year

In [7]:
# Production-year features: the raw year, its absolute distance from 2020,
# and the square and cube of that distance.
test_features_df["prod_year"] = test_features["Prod. year"]
year_gap = (test_features_df["prod_year"] - 2020).abs()
test_features_df["prod_year_delta"] = year_gap

gap = year_gap.to_numpy()
test_features_df["prod_year_delta_sq"] = np.power(gap, 2)
test_features_df["prod_year_delta_cu"] = np.power(gap, 3)

# Bucket the distance into ordinal codes 1..6 using half-open intervals:
#   [0, 1) -> 1, [1, 4) -> 2, [4, 6) -> 3, [6, 8) -> 4, [8, 10) -> 5, [10, inf) -> 6
# Rows matching no interval (e.g. a missing year) keep the default code 0.
age_conditions = [
    gap < 1,
    (gap >= 1) & (gap < 4),
    (gap >= 4) & (gap < 6),
    (gap >= 6) & (gap < 8),
    (gap >= 8) & (gap < 10),
    gap >= 10,
]
age_codes = [1, 2, 3, 4, 5, 6]
test_features_df["prod_year_new"] = np.select(age_conditions, age_codes, default=0)

Cylinders, Airbags

In [8]:
# Cast the two count columns to integers.
# The builtin `int` replaces `np.int`, which was only an alias for it and was
# removed in NumPy 1.24 (using it now raises AttributeError).
test_features_df["cylinders"] = test_features["Cylinders"].astype(int)
test_features_df["airbags"] = test_features["Airbags"].astype(int)

Levy

In [9]:
# Levy uses "-" as its missing-value marker: turn it into NaN and parse the rest as floats.
# The builtin `float` replaces `np.float`, which was removed in NumPy 1.24.
test_features_df["levy"] = test_features["Levy"].replace("-", np.nan).astype(float)
# Impute with the median levy of the feature-engineered training frame, so no
# statistic is estimated from the test set itself.
test_features_df["levy"] = test_features_df["levy"].fillna(train_features_df["levy"].median())

Manufacturer

In [10]:
# Hand-assigned manufacturer groups; the group number becomes `manufacturer_class`.
class_1 = [
    'ROLLS-ROYCE', 'MOSKVICH', 'ROVER', 'GMC', 'ACURA', 'VAZ', 'ZAZ', 'BUICK',
    'UAZ', 'SEAT', 'DAIHATSU', 'OPEL', 'CHRYSLER', 'GAZ', 'DAEWOO', 'LEXUS',
    'PONTIAC', 'MAZDA', 'AUDI', 'VOLVO', 'ISUZU', 'PEUGEOT', 'NISSAN', 'SUBARU',
    'ALFA ROMEO', 'SUZUKI', 'MITSUBISHI', 'DODGE', 'HONDA', 'GREATWALL',
]
class_2 = [
    'RENAULT', 'MERCEDES-BENZ', 'VOLKSWAGEN', 'SKODA', 'FIAT', 'TOYOTA',
    'CADILLAC', 'FORD', 'CHEVROLET', 'BMW', 'JAGUAR', 'LINCOLN', 'MERCURY',
    'CITROEN', 'SCION', 'KIA', 'სხვა', 'HYUNDAI', 'MINI', 'INFINITI',
]
class_3 = ['PORSCHE', 'MASERATI', 'JEEP', 'LAND ROVER', 'SSANGYONG', 'FERRARI']
class_4 = ['HUMMER', 'TESLA', 'ASTON MARTIN']

manufacturer_col = test_features["Manufacturer"]

# np.select takes the first matching condition, so the groups are listed from
# 4 down to 1: a label present in several lists would get the highest group.
# Manufacturers in none of the lists get class 0.
class_conditions = [
    manufacturer_col.isin(class_4).to_numpy(),
    manufacturer_col.isin(class_3).to_numpy(),
    manufacturer_col.isin(class_2).to_numpy(),
    manufacturer_col.isin(class_1).to_numpy(),
]
test_features_df["manufacturer_class"] = np.select(class_conditions, [4, 3, 2, 1], default=0)

In [11]:
# Ordinal codes for Manufacturer, built from the training split only.
# NOTE(review): groupby sorts its result by group key by default, so `.index`
# lists the labels in sorted-label order, not in order of mean/median price;
# the two maps below are therefore identical. Left unchanged because the saved
# model was presumably trained on the same encoding — TODO confirm against the
# feature-engineering notebook before changing it.
temp_df = train_features[["Manufacturer"]].assign(Price=train_targets["Price"])
price_by_manufacturer = temp_df.groupby(by="Manufacturer")["Price"]

mean_order = price_by_manufacturer.mean().index
median_order = price_by_manufacturer.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0,
# which is also the code of the first training label.
test_manufacturer = test_features["Manufacturer"]
test_features_df["manufacturer_mean_le"] = test_manufacturer.map(mean_map).fillna(0)
test_features_df["manufacturer_median_le"] = test_manufacturer.map(median_map).fillna(0)

Category

In [12]:
# Ordinal codes for Category, built from the training split only.
# NOTE(review): groupby sorts by group key by default, so both orders below are
# the sorted labels rather than a price ranking, and the two maps coincide.
# Left unchanged; presumably it mirrors the training-time encoding — TODO confirm.
temp_df = train_features[["Category"]].assign(Price=train_targets["Price"])
price_by_category = temp_df.groupby(by="Category")["Price"]

mean_order = price_by_category.mean().index
median_order = price_by_category.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0.
test_category = test_features["Category"]
test_features_df["category_mean_le"] = test_category.map(mean_map).fillna(0)
test_features_df["category_median_le"] = test_category.map(median_map).fillna(0)

Leather interior

In [13]:
# Binary flag: 1 when the raw value is exactly "Yes", 0 for anything else.
has_leather = test_features["Leather interior"].eq("Yes")
test_features_df["leather_interior"] = has_leather.astype(int)

Fuel type

In [14]:
# Ordinal codes for Fuel type, built from the training split only.
# NOTE(review): groupby sorts by group key by default, so both orders below are
# the sorted labels rather than a price ranking, and the two maps coincide.
# Left unchanged; presumably it mirrors the training-time encoding — TODO confirm.
temp_df = train_features[["Fuel type"]].assign(Price=train_targets["Price"])
price_by_fuel_type = temp_df.groupby(by="Fuel type")["Price"]

mean_order = price_by_fuel_type.mean().index
median_order = price_by_fuel_type.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0.
test_fuel_type = test_features["Fuel type"]
test_features_df["fuel_type_mean_le"] = test_fuel_type.map(mean_map).fillna(0)
test_features_df["fuel_type_median_le"] = test_fuel_type.map(median_map).fillna(0)

Engine volume

In [15]:
# Flag rows whose engine-volume text mentions "Turbo" (1) versus the rest (0).
mentions_turbo = test_features["Engine volume"].str.contains("Turbo")
test_features_df["turbo"] = np.where(mentions_turbo, 1, 0)

In [16]:
# Drop the " Turbo" suffix and parse the remaining text as a float.
# - `float` replaces `np.float`, which was removed in NumPy 1.24.
# - regex=False makes the literal replacement explicit, independent of the
#   pandas version's default for `regex`.
test_features_df["engine_volume"] = (
    test_features["Engine volume"].str.replace(" Turbo", "", regex=False).astype(float)
)

Mileage

In [17]:
# Strip the " km" unit suffix and parse the mileage as a float.
# - `float` replaces `np.float`, which was removed in NumPy 1.24.
# - regex=False makes the literal replacement explicit, independent of the
#   pandas version's default for `regex`.
test_features_df["mileage"] = (
    test_features["Mileage"].str.replace(" km", "", regex=False).astype(float)
)
# Flag cars whose recorded mileage is exactly 0.
test_features_df["mileage_new"] = np.where(test_features_df["mileage"] == 0, 1, 0)

Gear box type

In [18]:
# Ordinal codes for Gear box type, built from the training split only.
# NOTE(review): groupby sorts by group key by default, so both orders below are
# the sorted labels rather than a price ranking, and the two maps coincide.
# Left unchanged; presumably it mirrors the training-time encoding — TODO confirm.
temp_df = train_features[["Gear box type"]].assign(Price=train_targets["Price"])
price_by_gear_box = temp_df.groupby(by="Gear box type")["Price"]

mean_order = price_by_gear_box.mean().index
median_order = price_by_gear_box.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0.
test_gear_box = test_features["Gear box type"]
test_features_df["gear_box_type_mean_le"] = test_gear_box.map(mean_map).fillna(0)
test_features_df["gear_box_type_median_le"] = test_gear_box.map(median_map).fillna(0)

Drive wheels

In [19]:
# Ordinal codes for Drive wheels, built from the training split only.
# NOTE(review): groupby sorts by group key by default, so both orders below are
# the sorted labels rather than a price ranking, and the two maps coincide.
# Left unchanged; presumably it mirrors the training-time encoding — TODO confirm.
temp_df = train_features[["Drive wheels"]].assign(Price=train_targets["Price"])
price_by_drive_wheels = temp_df.groupby(by="Drive wheels")["Price"]

mean_order = price_by_drive_wheels.mean().index
median_order = price_by_drive_wheels.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0.
test_drive_wheels = test_features["Drive wheels"]
test_features_df["drive_wheels_mean_le"] = test_drive_wheels.map(mean_map).fillna(0)
test_features_df["drive_wheels_median_le"] = test_drive_wheels.map(median_map).fillna(0)

Doors

In [20]:
# Ordinal codes for Doors, built from the training split only.
# NOTE(review): groupby sorts by group key by default, so both orders below are
# the sorted labels rather than a price ranking, and the two maps coincide.
# Left unchanged; presumably it mirrors the training-time encoding — TODO confirm.
temp_df = train_features[["Doors"]].assign(Price=train_targets["Price"])
price_by_doors = temp_df.groupby(by="Doors")["Price"]

mean_order = price_by_doors.mean().index
median_order = price_by_doors.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0.
test_doors = test_features["Doors"]
test_features_df["doors_mean_le"] = test_doors.map(mean_map).fillna(0)
test_features_df["doors_median_le"] = test_doors.map(median_map).fillna(0)

Wheel

In [21]:
# Ordinal codes for Wheel, built from the training split only.
# NOTE(review): groupby sorts by group key by default, so both orders below are
# the sorted labels rather than a price ranking, and the two maps coincide.
# Left unchanged; presumably it mirrors the training-time encoding — TODO confirm.
temp_df = train_features[["Wheel"]].assign(Price=train_targets["Price"])
price_by_wheel = temp_df.groupby(by="Wheel")["Price"]

mean_order = price_by_wheel.mean().index
median_order = price_by_wheel.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0.
test_wheel = test_features["Wheel"]
test_features_df["wheel_mean_le"] = test_wheel.map(mean_map).fillna(0)
test_features_df["wheel_median_le"] = test_wheel.map(median_map).fillna(0)

Color

In [22]:
# Ordinal codes for Color, built from the training split only.
# NOTE(review): groupby sorts by group key by default, so both orders below are
# the sorted labels rather than a price ranking, and the two maps coincide.
# Left unchanged; presumably it mirrors the training-time encoding — TODO confirm.
temp_df = train_features[["Color"]].assign(Price=train_targets["Price"])
price_by_color = temp_df.groupby(by="Color")["Price"]

mean_order = price_by_color.mean().index
median_order = price_by_color.median().index

mean_map = dict(zip(mean_order, range(len(mean_order))))
median_map = dict(zip(median_order, range(len(median_order))))

# Labels missing from the training split map to NaN and are filled with 0.
test_color = test_features["Color"]
test_features_df["color_mean_le"] = test_color.map(mean_map).fillna(0)
test_features_df["color_median_le"] = test_color.map(median_map).fillna(0)

# Sanity Checks

In [27]:
# Preview the first rows of the engineered test frame.
# NOTE(review): the saved output below carries execution count 27 and shows only
# the 18 pruned columns, i.e. it was produced after the "Feature Pruning" cells
# had run. On a fresh top-to-bottom run this cell would show every engineered column.
test_features_df.head()

Unnamed: 0,prod_year,prod_year_delta,prod_year_delta_sq,prod_year_delta_cu,cylinders,airbags,levy,manufacturer_class,manufacturer_mean_le,category_mean_le,leather_interior,fuel_type_mean_le,turbo,engine_volume,mileage,mileage_new,gear_box_type_mean_le,wheel_mean_le
0,2014,6,36,216,4,10,259.0,2,8.0,3,0,5,0,1.4,65000.0,0,0,0
1,1997,23,529,12167,6,2,781.0,2,33.0,6,1,1,1,2.9,3333.0,0,1,0
2,1996,24,576,13824,8,2,781.0,2,33.0,9,0,4,0,1.8,212485.0,0,1,0
3,2014,6,36,216,4,4,639.0,2,22.0,4,1,1,0,2.0,132756.0,0,0,0
4,2017,3,9,27,4,4,831.0,3,49.0,4,1,4,0,1.6,50750.0,0,0,0


In [24]:
# Count missing values per engineered column; every count should be 0 before predicting.
test_features_df.isna().sum()

prod_year                  0
prod_year_delta            0
prod_year_delta_sq         0
prod_year_delta_cu         0
prod_year_new              0
cylinders                  0
airbags                    0
levy                       0
manufacturer_class         0
manufacturer_mean_le       0
manufacturer_median_le     0
category_mean_le           0
category_median_le         0
leather_interior           0
fuel_type_mean_le          0
fuel_type_median_le        0
turbo                      0
engine_volume              0
mileage                    0
mileage_new                0
gear_box_type_mean_le      0
gear_box_type_median_le    0
drive_wheels_mean_le       0
drive_wheels_median_le     0
doors_mean_le              0
doors_median_le            0
wheel_mean_le              0
wheel_median_le            0
color_mean_le              0
color_median_le            0
dtype: int64

# Feature Pruning

In [25]:
# Columns passed to the model, in this order (the array built from them keeps this order).
features_to_keep = [
    # production year
    "prod_year",
    "prod_year_delta",
    "prod_year_delta_sq",
    "prod_year_delta_cu",
    # numeric specs
    "cylinders",
    "airbags",
    "levy",
    # categorical encodings and flags
    "manufacturer_class",
    "manufacturer_mean_le",
    "category_mean_le",
    "leather_interior",
    "fuel_type_mean_le",
    "turbo",
    "engine_volume",
    "mileage",
    "mileage_new",
    "gear_box_type_mean_le",
    "wheel_mean_le",
]
len(features_to_keep)

18

In [26]:
# Keep a copy of the current frame under `test_features_df_og`, then narrow the
# working frame to the selected columns in `features_to_keep` order.
test_features_df_og = test_features_df.copy()
test_features_df = test_features_df_og[features_to_keep]
test_features_df.shape

(1924, 18)

# Predictions

Data

In [37]:
# Convert the engineered test features and the targets to NumPy arrays.
# `test_targets` was loaded with shape (1924, 1), so `y_test` is a 2-D column
# array rather than a flat vector.
X_test = test_features_df.to_numpy()
y_test = test_targets.to_numpy()

Model

In [30]:
!unzip model.sav.zip

Archive:  model.sav.zip
  inflating: model.sav               
  inflating: __MACOSX/._model.sav    


In [33]:
model_file = 'model.sav'

# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load a model archive that comes from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open(model_file, 'rb') as model_fh:
    loaded_model = pickle.load(model_fh)



Predictions

In [42]:
# Run the model once and reuse the result (previously predict() was called twice).
raw_test_preds = loaded_model.predict(X_test)

# Replace every prediction that is not strictly positive with 0. The RMSLE
# computed later in the notebook cannot handle negative predictions.
test_preds = np.where(raw_test_preds > 0, raw_test_preds, 0)

Evaluation

In [43]:
# Root mean squared error between the true targets and the floored predictions.
test_mse = mean_squared_error(y_true=y_test, y_pred=test_preds)
test_rmse = np.sqrt(test_mse)
print(f"Test RMSE: {test_rmse}")

Test RMSE: 10543.925662683565


In [45]:
# Root mean squared logarithmic error between true targets and predictions.
# Stored under its own name so it does not overwrite the RMSE held in `test_rmse`.
test_rmsle = np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=test_preds))
print(f"Test RMSLE: {test_rmsle}")

Test RMSLE: 0.9950522186493146
