In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

sys.path.insert(0, os.path.abspath("../"))
from color import get_custom_color_palette_hash
from train_model import trainmodel, trainmodelGSCV
from z_score import zscore
from sklearn_manager import Preprocessor
from plots import plot_validation, featuresrepartition, plot_validation, countplot, barplot

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
PAL = "ch:start=.2,rot=-.3"
COLORS = ['#c7dfdf','#a3c3cb','#86a5b8','#687c9c','#494c6e', '#373451', '#29243b']

In [3]:
np.random.seed(42)

In [4]:
data_train_global = pd.read_csv("../../../data/curated/train.csv", usecols=["Name", "Year", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type", "Transmission", "Mileage", "Engine", "Power", "Price"])
print(data_train_global.shape)
data_train_global.head()

(5807, 11)


Unnamed: 0,Name,Year,Owner_Type,Seats,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Price
0,Hyundai Creta 1.6 CRDi SX Option,2015,First,5,41000.0,Diesel,Manual,19.67,1582.0,126.2,12.5
1,Honda Jazz V,2011,First,5,46000.0,Petrol,Manual,18.2,1199.0,88.7,4.5
2,Suzuki Ertiga VDI,2012,First,7,87000.0,Diesel,Manual,20.77,1248.0,88.76,6.0
3,Audi A4 New 2.0 TDI Multitronic,2013,Second,5,40670.0,Diesel,Automatic,15.2,1968.0,140.8,17.74
4,Nissan Micra Diesel XV,2013,First,5,86999.0,Diesel,Manual,23.08,1461.0,63.1,3.5


In [5]:
data = data_train_global.copy()
data5 = data[data["Seats"]==5]

In [6]:
print(data5.shape)

data5 = zscore(data5, "Power")
data5 = zscore(data5, "Kilometers_Driven")
data5 = zscore(data5, "Engine")
data5 = zscore(data5, "Mileage")

print(data5.shape)

(4864, 11)
(4638, 11)


In [7]:
data5["Year"] = [2021 - x for x in data5["Year"]]
data5 = data5.rename({"Year": "Seniority"}, axis=1)

In [8]:
data5["Brands"] = [x.split()[0] for x in data5["Name"]]
data5["Name"] = [" ".join(x.split()[1:]) for x in data5["Name"]]
data5["Brands"] = [str(x).replace("ISUZU","Isuzu") for x in data5["Brands"]]

data5 = data5.reindex(columns=["Brands", "Name", "Seniority", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type","Transmission", "Mileage", "Engine", "Power", "Price"])
data5.drop("Name",axis=1, inplace=True)

In [9]:
dict_country = { "Suzuki":"Japan", "Hyundai":"Korea", "Honda":"Japan", "Toyota":"Japan", "Volkswagen":"Germany",
                "Ford":"USA", "Mercedes-Benz":"Germany", "Mahindra":"India", "BMW":"Germany", "Audi":"Germany", "Tata":"India", 
                "Skoda":"Czech", "Renault":"France", "Chevrolet":"USA", "Nissan":"Japan", "Land":"England","Jaguar":"England",
                "Mini":"England","Mitsubishi":"Japan", "Fiat":"Italy", "Volvo":"Sweden", "Jeep":"USA", "Datsun":"Japan",
                "Porsche":"Germany", "Isuzu":"Japan", "Force":"India", "Ambassador":"India" 
                }

dict_continent = { "Japan":"Asia", "Korea":"Asia", "Germany":"Europe","USA":"America", "India":"Asia",  
                "Czech":"Europe", "France":"Europe", "England":"Europe", "Italy":"Europe","Sweden":"Europe"}

data5["Country"] = data5["Brands"].map(dict_country)
data5["Continent"] = data5["Country"].map(dict_continent)

data5.head()


Unnamed: 0,Brands,Seniority,Owner_Type,Seats,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Price,Country,Continent
0,Hyundai,6,First,5,41000.0,Diesel,Manual,19.67,1582.0,126.2,12.5,Korea,Asia
1,Honda,10,First,5,46000.0,Petrol,Manual,18.2,1199.0,88.7,4.5,Japan,Asia
2,Audi,8,Second,5,40670.0,Diesel,Automatic,15.2,1968.0,140.8,17.74,Germany,Europe
3,Nissan,8,First,5,86999.0,Diesel,Manual,23.08,1461.0,63.1,3.5,Japan,Asia
4,Volkswagen,8,First,5,64430.0,Diesel,Manual,20.54,1598.0,103.6,5.2,Germany,Europe


In [11]:
data5["Continent"] = [str(x).replace("Asia", "1") for x in data5["Continent"]]
data5["Continent"] = [str(x).replace("Europe", "2") for x in data5["Continent"]]
data5["Continent"] = [str(x).replace("America", "3") for x in data5["Continent"]]

data5["Continent"]= data5["Continent"].astype(np.int64)

In [12]:
data5["Owner_Type"] = [str(x).replace("First", "1") for x in data5["Owner_Type"]]
data5["Owner_Type"] = [str(x).replace("Second", "2") for x in data5["Owner_Type"]]
data5["Owner_Type"] = [str(x).replace("Third", "3") for x in data5["Owner_Type"]]
data5["Owner_Type"] = [str(x).replace("Fourth & Above", "4") for x in data5["Owner_Type"]]

data5["Owner_Type"]= data5["Owner_Type"].astype(np.int64)

In [13]:
TARGET2 = data5["Price"]

In [14]:
### Afin d'entrainer mon modele je supprime ma TARGET
data5.drop("Price", axis=1, inplace=True)

In [21]:
data5_preprocess = data5[["Brands", "Seniority", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type","Transmission", "Mileage", "Engine", "Power"]]

preprocessor_test = Preprocessor()
preprocessor_test.fit(data5_preprocess)
preprocessor_test.save("../../../data/preprocessor/preprocessor5seats")

In [22]:
NUM_COL_LIST = data5_preprocess.select_dtypes(exclude='object').columns.tolist()

CAT_COL_LIST = []
for col in data5_preprocess.select_dtypes(include='object').columns:
    CAT_COL_LIST += [col + '*' + elem for elem in data5[col].unique().tolist()]

data5_preprocessed = pd.DataFrame(preprocessor_test.transform(data5_preprocess), columns = NUM_COL_LIST + CAT_COL_LIST)

print(f"DATA shape: {str(data5.shape)}\nDATA shape:: {str(data5_preprocessed.shape)}")
data5_preprocessed.head()

DATA shape: (4638, 12)
DATA shape:: (4638, 36)


Unnamed: 0,Seniority,Owner_Type,Seats,Kilometers_Driven,Mileage,Engine,Power,Brands*Hyundai,Brands*Honda,Brands*Audi,...,Brands*Fiat,Brands*Jaguar,Brands*Mitsubishi,Brands*Jeep,Brands*Ambassador,Brands*Isuzu,Fuel_Type*Diesel,Fuel_Type*Petrol,Transmission*Manual,Transmission*Automatic
0,0.190476,0.0,0.0,0.254597,0.55,0.414173,0.43286,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.380952,0.0,0.0,0.285776,0.474227,0.213123,0.256422,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.285714,0.333333,0.0,0.252539,0.319588,0.616798,0.501553,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.285714,0.0,0.0,0.541433,0.725773,0.350656,0.135974,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.285714,0.0,0.0,0.4007,0.594845,0.422572,0.326527,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [23]:
X_train, X_test, y_train, y_test = train_test_split(data5_preprocessed, TARGET2, test_size=0.2 , shuffle=True)
print(f"X_train shape: {str(X_train.shape)} \nX_test shape: {str(X_test.shape)}")

X_train shape: (3710, 36) 
X_test shape: (928, 36)


In [24]:
model_ll = trainmodel(LinearRegression, X_train, y_train, X_test, y_test)

Score du jeu TRAIN
MAE:  2.0038810436320755
RMSE:  3.410392400135571
Median abs err:  1.1696093749999992
R2:  0.8083819442115114

Score du jeu TEST
MAE:  2.0980156418372844
RMSE:  3.7424301412722603
Median abs err:  1.1884375
R2:  0.8143431058239918


In [25]:
model_rfr = trainmodel(RandomForestRegressor, X_train, y_train, X_test, y_test)

param_grid = [
                {
                    "max_depth" : [80, 90, 100, None],
                    "n_estimators" : [20, 50, 100],
                    "max_features": ["auto", "sqrt"],
                    "criterion" : ["squared_error", "absolute_error", "poisson"]
                }
            ]

grid_rfr = trainmodelGSCV(model_rfr, X_train, y_train, param_grid)

Score du jeu TRAIN
MAE:  0.43256468052881536
RMSE:  0.8060977534075893
Median abs err:  0.23474999999999735
R2:  0.9892945938818761

Score du jeu TEST
MAE:  1.2460704926108377
RMSE:  2.6066494335744723
Median abs err:  0.6476999999999988
R2:  0.9099323294259332

Score après GridSearchCV
CV Mean:  0.9176699229800473
STD:  0.006406666007479956
Le meilleur score est de: 0.9100038244742403 avec {'criterion': 'poisson', 'max_depth': 90, 'max_features': 'sqrt', 'n_estimators': 100}
