# GradientBoostingRegressor

This notebook trains a scikit-learn `GradientBoostingRegressor` model to predict second-hand car prices.

In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Reading Data
Read the CSV file into a pandas DataFrame.

In [2]:
# Keep the raw table as df_original: later cells derive df from it and look up its first row.
df_original = pd.read_csv("datasets/cars.csv")

# Preprocessing

Removing the Car_ID column.

In [3]:
# Work on a copy without the Car_ID column; df_original itself is left unchanged.
df = df_original.drop(columns="Car_ID")

In [4]:
# Preview the first five rows of df (Car_ID already removed).
df.head()

Unnamed: 0,Brand,Model,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Toyota,Corolla,2018,50000,Petrol,Manual,First,15,1498,108,5,800000
1,Honda,Civic,2019,40000,Petrol,Automatic,Second,17,1597,140,5,1000000
2,Ford,Mustang,2017,20000,Petrol,Automatic,First,10,4951,395,4,2500000
3,Maruti,Swift,2020,30000,Diesel,Manual,Third,23,1248,74,5,600000
4,Hyundai,Sonata,2016,60000,Diesel,Automatic,Second,18,1999,194,5,850000


Using LabelEncoder to transform the non-numeric (categorical) columns into integer codes.

In [5]:
# Fit each encoder on the untouched text column in df_original and write the
# resulting integer codes into df. Reading from df_original (rather than df)
# keeps this cell safe to re-run: df is overwritten here, so fitting on df a
# second time would train the encoders on integers and break the string
# lookups that api_predict performs.
# The encoder variable names are kept because api_predict uses them directly.
le_brand = LabelEncoder()
df["Brand"] = le_brand.fit_transform(df_original["Brand"])

le_model = LabelEncoder()
df["Model"] = le_model.fit_transform(df_original["Model"])

le_fuel_type = LabelEncoder()
df["Fuel_Type"] = le_fuel_type.fit_transform(df_original["Fuel_Type"])

le_transmission = LabelEncoder()
df["Transmission"] = le_transmission.fit_transform(df_original["Transmission"])

le_owner_type = LabelEncoder()
df["Owner_Type"] = le_owner_type.fit_transform(df_original["Owner_Type"])

In [6]:
# Same preview as before, now showing the text columns replaced by integer codes.
df.head()

Unnamed: 0,Brand,Model,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,9,15,2018,50000,1,1,0,15,1498,108,5,800000
1,3,14,2019,40000,1,0,1,17,1597,140,5,1000000
2,2,30,2017,20000,1,0,0,10,4951,395,4,2500000
3,6,42,2020,30000,0,1,2,23,1248,74,5,600000
4,4,41,2016,60000,0,0,1,18,1999,194,5,850000


# Input and Output
X holds the input (feature) columns, and y holds the output (target) column, Price.

In [7]:
# Features: every column except Price.
X = df.drop(columns="Price")
# Target: Price, kept as a single-column DataFrame.
y = df[["Price"]]

## Splitting Data
67% of the rows are used to train the model, and the remaining 33% are used to test the model's performance.

In [8]:
# Hold out 33% of the rows for testing; the fixed random_state makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

# Training Model

In [9]:
# Pin random_state so that retraining after a kernel restart produces the same model
# (and therefore the same scores and predictions).
gbr = GradientBoostingRegressor(random_state=42)
# fit expects a 1-D target, so flatten the single-column y_train DataFrame.
gbr.fit(X_train, y_train.values.ravel())

## Saving Model
The model is saved to a file so that it can be loaded and reused whenever it is needed.

In [10]:
# Persist the fitted regressor; api_predict loads it back from this same path.
joblib.dump(
    value=gbr,
    filename="trained_models/cars_gradient_boosting_regressor.pkl",
)

['trained_models/cars_gradient_boosting_regressor.pkl']

# Measuring Model Performance

Model performance on the training data: the R² score (coefficient of determination), expressed as a percentage.

In [11]:
# score() returns R^2 for regressors; scale it to a percentage.
100 * gbr.score(X_train, y_train)

99.98279387392036

Model performance on the test data: the R² score (coefficient of determination), expressed as a percentage.

In [12]:
# score() returns R^2 for regressors; scale it to a percentage.
100 * gbr.score(X_test, y_test)

89.42671028371507

# Using Model
Use the model to predict prices for the test rows and store the result in the `y_pred` variable.

In [13]:
# One predicted price per row of X_test.
y_pred = gbr.predict(X_test)

# Example Code for API

Please note that this example uses LabelEncoder to transform string data into integer codes. In most scenarios, such columns are already stored in label-encoded form in the database.

In [14]:
def api_predict(
    brand, model, year, km, fuel_type, transmission,
    owner_type, mileage, engine, power, seats,
):
    """Predict the price of a single car from its raw attribute values.

    The five text attributes (brand, model, fuel_type, transmission and
    owner_type) are converted to integer codes with the notebook-level
    LabelEncoder objects; the remaining attributes are passed through
    unchanged. The regressor is loaded from the saved model file on every
    call.

    Returns:
        A one-element NumPy array holding the predicted price, rounded to
        the nearest whole number.
    """
    # Key order matters: it defines the column order of the one-row frame.
    encoded_row = {
        "Brand": le_brand.transform([brand]),
        "Model": le_model.transform([model]),
        "Year": [year],
        "Kilometers_Driven": [km],
        "Fuel_Type": le_fuel_type.transform([fuel_type]),
        "Transmission": le_transmission.transform([transmission]),
        "Owner_Type": le_owner_type.transform([owner_type]),
        "Mileage": [mileage],
        "Engine": [engine],
        "Power": [power],
        "Seats": [seats],
    }
    features = pd.DataFrame(encoded_row)

    # A separate local name is used so the `model` argument (the car model) is not rebound.
    regressor = joblib.load("trained_models/cars_gradient_boosting_regressor.pkl")
    prediction = regressor.predict(features)
    return prediction.round()

# A Representative API Call

In [15]:
# These inputs repeat the attribute values of row 0 of df_original (displayed in the next cell),
# so the prediction can be compared with that row's actual Price.
api_predict(
    brand="Toyota",
    model="Corolla",
    year=2018,
    km=50000,
    fuel_type="Petrol",
    transmission="Manual",
    owner_type="First",
    mileage=15,
    engine=1498,
    power=108,
    seats=5
)

array([791563.])

In [16]:
# Show the first raw row, including its actual Price, for comparison with the prediction above.
df_original.iloc[0]

Car_ID                     1
Brand                 Toyota
Model                Corolla
Year                    2018
Kilometers_Driven      50000
Fuel_Type             Petrol
Transmission          Manual
Owner_Type             First
Mileage                   15
Engine                  1498
Power                    108
Seats                      5
Price                 800000
Name: 0, dtype: object