In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

sys.path.insert(0, os.path.abspath("../"))
from color import get_custom_color_palette_hash
from train_model import trainmodel, trainmodelGSCV
from z_score import zscore
from sklearn_manager import Preprocessor
from plots import plot_validation, featuresrepartition, plot_validation, countplot, barplot

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
PAL = "ch:start=.2,rot=-.3"
COLORS = ['#c7dfdf','#a3c3cb','#86a5b8','#687c9c','#494c6e', '#373451', '#29243b']

In [3]:
np.random.seed(42)

## **Lecture des données 📁**

In [4]:
data_train_global = pd.read_csv("../../../data/curated/train.csv", usecols=["Name", "Year", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type", "Transmission", "Mileage", "Engine", "Power", "Price"])
data_train_global.head()

Unnamed: 0,Name,Year,Owner_Type,Seats,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Price
0,Hyundai Creta 1.6 CRDi SX Option,2015,First,5,41000.0,Diesel,Manual,19.67,1582.0,126.2,12.5
1,Honda Jazz V,2011,First,5,46000.0,Petrol,Manual,18.2,1199.0,88.7,4.5
2,Suzuki Ertiga VDI,2012,First,7,87000.0,Diesel,Manual,20.77,1248.0,88.76,6.0
3,Audi A4 New 2.0 TDI Multitronic,2013,Second,5,40670.0,Diesel,Automatic,15.2,1968.0,140.8,17.74
4,Nissan Micra Diesel XV,2013,First,5,86999.0,Diesel,Manual,23.08,1461.0,63.1,3.5


## **Amélioration des modeles ↗️**

In [5]:
data2 = data_train_global.copy()

In [6]:
data2

Unnamed: 0,Name,Year,Owner_Type,Seats,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Price
0,Hyundai Creta 1.6 CRDi SX Option,2015,First,5,41000.0,Diesel,Manual,19.67,1582.0,126.20,12.50
1,Honda Jazz V,2011,First,5,46000.0,Petrol,Manual,18.20,1199.0,88.70,4.50
2,Suzuki Ertiga VDI,2012,First,7,87000.0,Diesel,Manual,20.77,1248.0,88.76,6.00
3,Audi A4 New 2.0 TDI Multitronic,2013,Second,5,40670.0,Diesel,Automatic,15.20,1968.0,140.80,17.74
4,Nissan Micra Diesel XV,2013,First,5,86999.0,Diesel,Manual,23.08,1461.0,63.10,3.50
...,...,...,...,...,...,...,...,...,...,...,...
5802,Suzuki Swift VDI,2014,First,5,27365.0,Diesel,Manual,28.40,1248.0,74.00,4.75
5803,Hyundai Xcent 1.1 CRDi S,2015,First,5,100000.0,Diesel,Manual,24.40,1120.0,71.00,4.00
5804,Mahindra Xylo D4 BSIV,2012,Second,8,55000.0,Diesel,Manual,14.00,2498.0,112.00,2.90
5805,Suzuki Wagon R VXI,2013,First,5,46000.0,Petrol,Manual,18.90,998.0,67.10,2.65


In [7]:
data2 = data2[data2["Kilometers_Driven"].between(left=0, right=200000)]

In [8]:
print(data2.shape)

data2 = zscore(data2, "Power")
data2 = zscore(data2, "Kilometers_Driven")
data2 = zscore(data2, "Engine")
data2 = zscore(data2, "Mileage")

print(data2.shape)

(5779, 11)
(5603, 11)


In [9]:
data2["Year"] = [2021 - x for x in data2["Year"]]
data2 = data2.rename({"Year": "Seniority"}, axis=1)

In [10]:
data2["Name"].value_counts()

Mahindra XUV500 W8 2WD                    49
Suzuki Swift VDI                          44
Honda City 1.5 S MT                       33
Suzuki Swift Dzire VDI                    33
Hyundai i10 Sportz                        30
                                          ..
Hyundai Grand i10 Magna AT                 1
Hyundai i20 2015-2017 Sportz 1.2           1
Hyundai i20 Active SX Dual Tone Petrol     1
Suzuki Swift RS VDI                        1
Audi Q7 45 TDI Quattro Premium Plus        1
Name: Name, Length: 1712, dtype: int64

In [11]:
data2["Brands"] = [x.split()[0] for x in data2["Name"]]
data2["Name"] = [" ".join(x.split()[1:]) for x in data2["Name"]]
data2["Brands"] = [str(x).replace("ISUZU","Isuzu") for x in data2["Brands"]]

data2 = data2.reindex(columns=["Brands", "Name", "Seniority", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type","Transmission", "Mileage", "Engine", "Power", "Price"])
data2.drop("Name",axis=1, inplace=True)

In [12]:
dict_country = { "Suzuki":"Japan", "Hyundai":"Korea", "Honda":"Japan", "Toyota":"Japan", "Volkswagen":"Germany",
                "Ford":"USA", "Mercedes-Benz":"Germany", "Mahindra":"India", "BMW":"Germany", "Audi":"Germany", "Tata":"India", 
                "Skoda":"Czech", "Renault":"France", "Chevrolet":"USA", "Nissan":"Japan", "Land":"England","Jaguar":"England",
                "Mini":"England","Mitsubishi":"Japan", "Fiat":"Italy", "Volvo":"Sweden", "Jeep":"USA", "Datsun":"Japan",
                "Porsche":"Germany", "Isuzu":"Japan", "Force":"India", "Ambassador":"India" 
                }

dict_continent = { "Japan":"Asia", "Korea":"Asia", "Germany":"Europe","USA":"America", "India":"Asia",  
                "Czech":"Europe", "France":"Europe", "England":"Europe", "Italy":"Europe","Sweden":"Europe"}

data2["Country"] = data2["Brands"].map(dict_country)
data2["Continent"] = data2["Country"].map(dict_continent)

data2.head()


Unnamed: 0,Brands,Seniority,Owner_Type,Seats,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Price,Country,Continent
0,Hyundai,6,First,5,41000.0,Diesel,Manual,19.67,1582.0,126.2,12.5,Korea,Asia
1,Honda,10,First,5,46000.0,Petrol,Manual,18.2,1199.0,88.7,4.5,Japan,Asia
2,Suzuki,9,First,7,87000.0,Diesel,Manual,20.77,1248.0,88.76,6.0,Japan,Asia
3,Audi,8,Second,5,40670.0,Diesel,Automatic,15.2,1968.0,140.8,17.74,Germany,Europe
4,Nissan,8,First,5,86999.0,Diesel,Manual,23.08,1461.0,63.1,3.5,Japan,Asia


In [13]:
data2["Continent"] = [str(x).replace("Asia", "1") for x in data2["Continent"]]
data2["Continent"] = [str(x).replace("Europe", "2") for x in data2["Continent"]]
data2["Continent"] = [str(x).replace("America", "3") for x in data2["Continent"]]

data2["Continent"]= data2["Continent"].astype(np.int64)

In [14]:
data2["Owner_Type"] = [str(x).replace("First", "1") for x in data2["Owner_Type"]]
data2["Owner_Type"] = [str(x).replace("Second", "2") for x in data2["Owner_Type"]]
data2["Owner_Type"] = [str(x).replace("Third", "3") for x in data2["Owner_Type"]]
data2["Owner_Type"] = [str(x).replace("Fourth & Above", "4") for x in data2["Owner_Type"]]

data2["Owner_Type"]= data2["Owner_Type"].astype(np.int64)

In [15]:
### Ne garder que les voitures à 5 ou 7 places

# print(data2.shape)
# data2 = data2.loc[data2["Seats"].isin([5,7])]
# print(data2.shape)

In [16]:
TARGET2 = data2["Price"]

In [17]:
### Afin d'entrainer mon modele je supprime ma TARGET
data2.drop("Price", axis=1, inplace=True)

In [19]:
data2_preprocess = data2[["Brands", "Seniority", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type","Transmission", "Mileage", "Engine", "Power", "Continent"]]

preprocessor_test = Preprocessor()
preprocessor_test.fit(data2_preprocess)
preprocessor_test.save("../../../data/preprocessor/preprocessor_country_continent")

In [20]:
NUM_COL_LIST = data2_preprocess.select_dtypes(exclude='object').columns.tolist()

CAT_COL_LIST = []
for col in data2_preprocess.select_dtypes(include='object').columns:
    CAT_COL_LIST += [col + '*' + elem for elem in data2[col].unique().tolist()]

data2_preprocessed = pd.DataFrame(preprocessor_test.transform(data2_preprocess), columns = NUM_COL_LIST + CAT_COL_LIST)

print(f"DATA shape: {str(data2.shape)}\nDATA shape:: {str(data2_preprocessed.shape)}")
data2_preprocessed.head()

DATA shape: (5603, 12)
DATA shape:: (5603, 39)


Unnamed: 0,Seniority,Owner_Type,Seats,Kilometers_Driven,Mileage,Engine,Power,Continent,Brands*Hyundai,Brands*Honda,...,Brands*Fiat,Brands*Jeep,Brands*Porsche,Brands*Ambassador,Brands*Isuzu,Brands*Force,Fuel_Type*Diesel,Fuel_Type*Petrol,Transmission*Manual,Transmission*Automatic
0,0.190476,0.0,0.375,0.270697,0.567822,0.372183,0.388137,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.380952,0.0,0.375,0.303847,0.49505,0.223388,0.229929,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.333333,0.0,0.625,0.575678,0.622277,0.242424,0.230182,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.285714,0.333333,0.375,0.268509,0.346535,0.522145,0.449732,0.5,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.285714,0.0,0.375,0.575672,0.736634,0.325175,0.121925,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(data2_preprocessed, TARGET2, test_size=0.2 , shuffle=True)
print(f"X_train shape: {str(X_train.shape)} \nX_test shape: {str(X_test.shape)}")

X_train shape: (4482, 39) 
X_test shape: (1121, 39)


In [22]:
model_ll2 = trainmodel(LinearRegression, X_train, y_train, X_test, y_test)

Score du jeu TRAIN
MAE:  2.7121828246053092
RMSE:  4.801201982486494
Median abs err:  1.591800809722454
R2:  0.7764493210361274

Score du jeu TEST
MAE:  2.8257265229555313
RMSE:  4.734181468188325
Median abs err:  1.738023349574445
R2:  0.7832842006706361


In [23]:
model_rfr2 = trainmodel(RandomForestRegressor, X_train, y_train, X_test, y_test)

param_grid = [
                {
                    "max_depth" : [80, 90, 100, None],
                    "n_estimators" : [20, 50, 100],
                    "max_features": ["auto", "sqrt"],
                    "criterion" : ["squared_error", "absolute_error", "poisson"]
                }
            ]

grid_rfr2 = trainmodelGSCV(model_rfr2, X_train, y_train, param_grid)

Score du jeu TRAIN
MAE:  0.5238140854583959
RMSE:  1.1991515391528822
Median abs err:  0.2555500000000013
R2:  0.9860548182726212

Score du jeu TEST
MAE:  1.387021287907751
RMSE:  2.82449851490327
Median abs err:  0.6957999999999913
R2:  0.9228593181381651

Score après GridSearchCV
CV Mean:  0.9028246524621906
STD:  0.041720706440941134
Le meilleur score est de: 0.8844208369372121 avec {'criterion': 'poisson', 'max_depth': 100, 'max_features': 'sqrt', 'n_estimators': 100}
