In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

sys.path.insert(0, os.path.abspath("../"))
from color import get_custom_color_palette_hash
from train_model import trainmodel, trainmodelGSCV
from z_score import zscore
from sklearn_manager import Preprocessor
from plots import plot_validation, featuresrepartition, plot_validation, countplot, barplot

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
PAL = "ch:start=.2,rot=-.3"
COLORS = ['#c7dfdf','#a3c3cb','#86a5b8','#687c9c','#494c6e', '#373451', '#29243b']

In [3]:
np.random.seed(42)

In [4]:
data_train_global = pd.read_csv("../../../data/curated/train.csv", usecols=["Name", "Year", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type", "Transmission", "Mileage", "Engine", "Power", "Price"])
data_train_global.head()

Unnamed: 0,Name,Year,Owner_Type,Seats,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Price
0,Hyundai Creta 1.6 CRDi SX Option,2015,First,5,41000.0,Diesel,Manual,19.67,1582.0,126.2,12.5
1,Honda Jazz V,2011,First,5,46000.0,Petrol,Manual,18.2,1199.0,88.7,4.5
2,Suzuki Ertiga VDI,2012,First,7,87000.0,Diesel,Manual,20.77,1248.0,88.76,6.0
3,Audi A4 New 2.0 TDI Multitronic,2013,Second,5,40670.0,Diesel,Automatic,15.2,1968.0,140.8,17.74
4,Nissan Micra Diesel XV,2013,First,5,86999.0,Diesel,Manual,23.08,1461.0,63.1,3.5


In [5]:
data2 = data_train_global.copy()

In [6]:
data2

Unnamed: 0,Name,Year,Owner_Type,Seats,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Price
0,Hyundai Creta 1.6 CRDi SX Option,2015,First,5,41000.0,Diesel,Manual,19.67,1582.0,126.20,12.50
1,Honda Jazz V,2011,First,5,46000.0,Petrol,Manual,18.20,1199.0,88.70,4.50
2,Suzuki Ertiga VDI,2012,First,7,87000.0,Diesel,Manual,20.77,1248.0,88.76,6.00
3,Audi A4 New 2.0 TDI Multitronic,2013,Second,5,40670.0,Diesel,Automatic,15.20,1968.0,140.80,17.74
4,Nissan Micra Diesel XV,2013,First,5,86999.0,Diesel,Manual,23.08,1461.0,63.10,3.50
...,...,...,...,...,...,...,...,...,...,...,...
5802,Suzuki Swift VDI,2014,First,5,27365.0,Diesel,Manual,28.40,1248.0,74.00,4.75
5803,Hyundai Xcent 1.1 CRDi S,2015,First,5,100000.0,Diesel,Manual,24.40,1120.0,71.00,4.00
5804,Mahindra Xylo D4 BSIV,2012,Second,8,55000.0,Diesel,Manual,14.00,2498.0,112.00,2.90
5805,Suzuki Wagon R VXI,2013,First,5,46000.0,Petrol,Manual,18.90,998.0,67.10,2.65


In [7]:
data2 = data2[data2["Kilometers_Driven"].between(left=0, right=300000)]

In [8]:
print(data2.shape)

data2 = zscore(data2, "Power")
data2 = zscore(data2, "Kilometers_Driven")
data2 = zscore(data2, "Engine")
data2 = zscore(data2, "Mileage")

print(data2.shape)

(5800, 11)
(5617, 11)


In [9]:
data2["Year"] = [2021 - x for x in data2["Year"]]
data2 = data2.rename({"Year": "Seniority"}, axis=1)

In [10]:
data2["Name"].value_counts()

Mahindra XUV500 W8 2WD                       49
Suzuki Swift VDI                             44
Honda City 1.5 S MT                          33
Suzuki Swift Dzire VDI                       33
Suzuki Ritz VDi                              30
                                             ..
Hyundai i20 Active SX Dual Tone Petrol        1
Mercedes-Benz E-Class 250 D W 210             1
Datsun redi-GO S                              1
Volkswagen Vento Magnific 1.6 Comfortline     1
Hyundai i20 Active 1.2 S                      1
Name: Name, Length: 1715, dtype: int64

In [11]:
data2["Brands"] = [x.split()[0] for x in data2["Name"]]
data2["Name"] = [" ".join(x.split()[1:]) for x in data2["Name"]]
data2["Brands"] = [str(x).replace("ISUZU","Isuzu") for x in data2["Brands"]]

data2 = data2.reindex(columns=["Brands", "Name", "Seniority", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type","Transmission", "Mileage", "Engine", "Power", "Price"])
data2.drop("Name",axis=1, inplace=True)

In [12]:
# dict_country = { "Suzuki":"Japan", "Hyundai":"Korea", "Honda":"Japan", "Toyota":"Japan", "Volkswagen":"Germany",
#                 "Ford":"USA", "Mercedes-Benz":"Germany", "Mahindra":"India", "BMW":"Germany", "Audi":"Germany", "Tata":"India", 
#                 "Skoda":"Czech", "Renault":"France", "Chevrolet":"USA", "Nissan":"Japan", "Land":"England","Jaguar":"England",
#                 "Mini":"England","Mitsubishi":"Japan", "Fiat":"Italy", "Volvo":"Sweden", "Jeep":"USA", "Datsun":"Japan",
#                 "Porsche":"Germany", "Isuzu":"Japan", "Force":"India", "Ambassador":"India" 
#                 }

# dict_continent = { "Japan":"Asia", "Korea":"Asia", "Germany":"Europe","USA":"America", "India":"Asia",  
#                 "Czech":"Europe", "France":"Europe", "England":"Europe", "Italy":"Europe","Sweden":"Europe"}

# data2["Country"] = data2["Brands"].map(dict_country)
# data2["Continent"] = data2["Country"].map(dict_continent)

# data2.head()


In [13]:
# data2["Continent"] = [str(x).replace("Asia", "1") for x in data2["Continent"]]
# data2["Continent"] = [str(x).replace("Europe", "2") for x in data2["Continent"]]
# data2["Continent"] = [str(x).replace("America", "3") for x in data2["Continent"]]

# data2["Continent"]= data2["Continent"].astype(np.int64)

In [14]:
data2["Owner_Type"] = [str(x).replace("First", "1") for x in data2["Owner_Type"]]
data2["Owner_Type"] = [str(x).replace("Second", "2") for x in data2["Owner_Type"]]
data2["Owner_Type"] = [str(x).replace("Third", "3") for x in data2["Owner_Type"]]
data2["Owner_Type"] = [str(x).replace("Fourth & Above", "4") for x in data2["Owner_Type"]]

data2["Owner_Type"]= data2["Owner_Type"].astype(np.int64)

In [15]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5617 entries, 0 to 5616
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brands             5617 non-null   object 
 1   Seniority          5617 non-null   int64  
 2   Owner_Type         5617 non-null   int64  
 3   Seats              5617 non-null   int64  
 4   Kilometers_Driven  5617 non-null   float64
 5   Fuel_Type          5617 non-null   object 
 6   Transmission       5617 non-null   object 
 7   Mileage            5617 non-null   float64
 8   Engine             5617 non-null   float64
 9   Power              5617 non-null   float64
 10  Price              5617 non-null   float64
dtypes: float64(5), int64(3), object(3)
memory usage: 482.8+ KB


In [16]:
### Ne garder que les voitures à 5 ou 7 places

print(data2.shape)
data2 = data2.loc[data2["Seats"].isin([5])]
print(data2.shape)

(5617, 11)
(4765, 11)


In [17]:
TARGET2 = data2["Price"]

In [18]:
### Afin d'entrainer mon modele je supprime ma TARGET
data2.drop("Price", axis=1, inplace=True)

In [19]:
data2_preprocess = data2[["Brands", "Seniority", "Owner_Type", "Seats", "Kilometers_Driven", "Fuel_Type","Transmission", "Mileage", "Engine", "Power"]]

preprocessor_test = Preprocessor()
preprocessor_test.fit(data2_preprocess)

ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000002134BEFC9D0>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000002134BEFC370>)])

In [20]:
NUM_COL_LIST = data2_preprocess.select_dtypes(exclude='object').columns.tolist()

CAT_COL_LIST = []
for col in data2_preprocess.select_dtypes(include='object').columns:
    CAT_COL_LIST += [col + '*' + elem for elem in data2[col].unique().tolist()]

data2_preprocessed = pd.DataFrame(preprocessor_test.transform(data2_preprocess), columns = NUM_COL_LIST + CAT_COL_LIST)

print(f"DATA shape: {str(data2.shape)}\nDATA shape:: {str(data2_preprocessed.shape)}")
data2_preprocessed.head()

DATA shape: (4765, 10)
DATA shape:: (4765, 37)


Unnamed: 0,Seniority,Owner_Type,Seats,Kilometers_Driven,Mileage,Engine,Power,Brands*Hyundai,Brands*Honda,Brands*Audi,...,Brands*Fiat,Brands*Mitsubishi,Brands*Jeep,Brands*Ambassador,Brands*Porsche,Brands*Isuzu,Fuel_Type*Diesel,Fuel_Type*Petrol,Transmission*Manual,Transmission*Automatic
0,0.190476,0.0,0.0,0.261659,0.556853,0.357985,0.388678,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.380952,0.0,0.0,0.293702,0.482234,0.184211,0.230249,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.285714,0.333333,0.0,0.259544,0.329949,0.533122,0.450359,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.285714,0.0,0.0,0.556451,0.729949,0.303085,0.122095,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.285714,0.0,0.0,0.411814,0.601015,0.365245,0.293198,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(data2_preprocessed, TARGET2, test_size=0.2 , shuffle=True)
print(f"X_train shape: {str(X_train.shape)} \nX_test shape: {str(X_test.shape)}")

X_train shape: (3812, 37) 
X_test shape: (953, 37)


In [22]:
model_ll2 = trainmodel(LinearRegression, X_train, y_train, X_test, y_test)

# plot_ll2 = plot_validation(model_ll2, X_train, y_train, X_test, y_test, TARGET2, "RandomForestRegressor", COLORS[2], COLORS[6])

Score du jeu TRAIN
MAE:  2.5471065302859484
RMSE:  4.70694883171886
Median abs err:  1.4583957381820554
R2:  0.7651387743229726

Score du jeu TEST
MAE:  2.5179173498217167
RMSE:  4.342090918937519
Median abs err:  1.3960193975158672
R2:  0.7716212478844171


In [23]:
model_rfr2 = trainmodel(RandomForestRegressor, X_train, y_train, X_test, y_test)

param_grid = [
                {
                    "max_depth" : [80, 90, 100, None],
                    "n_estimators" : [20, 50, 100],
                    "max_features": ["auto", "sqrt"],
                    "criterion" : ["squared_error", "absolute_error", "poisson"]
                }
            ]

grid_rfr2 = trainmodelGSCV(model_rfr2, X_train, y_train, param_grid)

# plot_rfr2 = plot_validation(grid_rfr2, X_train, y_train, X_test, y_test, TARGET2, "RandomForestRegressor", COLORS[2], COLORS[6])

Score du jeu TRAIN
MAE:  0.4974453156489947
RMSE:  1.216232562827412
Median abs err:  0.23399999999999777
R2:  0.9843192936383711

Score du jeu TEST
MAE:  1.3935930398574263
RMSE:  2.7390091731233968
Median abs err:  0.6780999999999979
R2:  0.90912491223856

Score après GridSearchCV
CV Mean:  0.8915990272710401
STD:  0.052025893717158385
Le meilleur score est de: 0.8836948043618496 avec {'criterion': 'poisson', 'max_depth': 90, 'max_features': 'sqrt', 'n_estimators': 100}
