In [17]:
# Import all the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the prepared used-car dataset from a CSV in the working directory
used_cars_data = pd.read_csv("final_master_data.csv")

In [3]:
# Preview the first rows to sanity-check the loaded columns
used_cars_data.head()

Unnamed: 0,State_ AK,State_ AL,State_ AR,State_ AZ,State_ CA,State_ CO,State_ CT,State_ DC,State_ DE,State_ FL,...,City_imporatnce_Medium,State_imporatnce_High,State_imporatnce_Low,State_imporatnce_Medium,Price,Year,Mileage,City_mean_price,Brand_popularity,Model_level
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,16472,2015,18681,18785.571605,2,2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,15749,2015,27592,18233.158996,2,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,16998,2015,13650,22329.498,2,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,15777,2015,25195,19024.911538,2,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,16784,2015,22800,18866.277499,2,2



The data has already been cleaned and feature-engineered.

In [4]:
# Check the dtype of every column before modelling
used_cars_data.dtypes

State_ AK           float64
State_ AL           float64
State_ AR           float64
State_ AZ           float64
State_ CA           float64
                     ...   
Year                  int64
Mileage               int64
City_mean_price     float64
Brand_popularity      int64
Model_level           int64
Length: 66, dtype: object

In [1]:
 #Create evaluation function
from sklearn.metrics import r2_score

# Create a function to evaluate models on both the training and the test split
def show_score(model, splits=None):
    """Return the training and test R^2 of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
        The model (or pipeline) to evaluate.
    splits : tuple, optional
        ``(X_train, X_test, y_train, y_test)``, in the order returned by
        ``train_test_split``. When omitted, the notebook-level variables of
        those names are used, which is the original behaviour. Passing the
        split explicitly avoids scoring a model against whichever split
        happened to be created last.

    Returns
    -------
    dict
        ``{"Training R^2": <float>, "Test R^2": <float>}``.
    """
    if splits is None:
        # Backward-compatible default: fall back to the split currently bound
        # in the notebook namespace.
        splits = (X_train, X_test, y_train, y_test)
    features_train, features_test, target_train, target_test = splits

    return {
        "Training R^2": r2_score(target_train, model.predict(features_train)),
        "Test R^2": r2_score(target_test, model.predict(features_test)),
    }

A pipeline consists of steps, given as a list of tuples.

* Steps we want to do (all in one cell):
    1. Build a model on the data

In [2]:
# Self-contained cell: load the data, split it, fit a baseline LightGBM pipeline and score it.
# Getting data ready
import pandas as pd
from sklearn.pipeline import Pipeline

# Modelling
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Setup random seed
import numpy as np
np.random.seed(42)

# Import data
used_cars_data = pd.read_csv("final_master_data.csv")

# Creating a modelling pipeline (a single LightGBM regressor step with default parameters)
model = Pipeline(steps = [("model", lgb.LGBMRegressor())])

# Split data: "Price" is the target, every other column is a feature
X = used_cars_data.drop("Price", axis =1)
y = used_cars_data["Price"]
# Pass the seed explicitly so the 80/20 split does not depend on how much of the
# global NumPy random stream other cells consumed before this line runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Fit and score the model (the displayed value is R^2 on the held-out test split)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.6779565148155783

In [None]:
# # Now let's hyperparameter tune our model
# # NOTE(review): this whole search is disabled (commented out). The next line mentions
# # RandomizedSearchCV, but the code below constructs GridSearchCV -- confirm which
# # search actually produced the tuned parameters before re-enabling.
# # NOTE(review): 'model__subsample' lists 1.2, 1.8 and 1.9; LightGBM documents
# # subsample (bagging_fraction) as constrained to 0 < value <= 1 -- fix before re-enabling.
# # Use RandomizedSearchCV with our regression Pipeline
# from sklearn.model_selection import RandomizedSearchCV
# pipe_grid = {
#             'model__n_estimators': [100,200,500,750,1000,2000],
#              'model__num_leaves': [40,45,50,55,60,65,70,75,80], 
#              'model__max_depth':[2,3,4,5,6,7,8,10] ,
# #              'model__min_child_samples': [100, 90,200,300,60], 
# #              'model__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#              'model__subsample': [1.0,1,1,1.2,1.8,1.9], 
# #              'model__colsample_bytree': [0.2,0.4, 0.6, 0.9, 1],
# #              'model__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
# #              'model__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
# #              "model__learning_rate": [0.1, 0.01, 0.5,1,0.05],
#              "model__objective": ['regression'],
#              'model__boosting': ['gbdt','dart']
# }
# gs_model = GridSearchCV(model,
#                           param_grid= pipe_grid,
#                               cv=5,
#                               verbose=True)
# gs_model.fit(X_train, y_train)

In [3]:
# After hyperparameter tuning: refit LightGBM on the full-data split with the chosen parameters

ideal_model_full = lgb.LGBMRegressor(
    # Use the estimator's documented `boosting_type` argument. Passing the alias
    # `boosting` through **kwargs leaves `boosting_type` at its default 'gbdt',
    # so the estimator carries two conflicting settings for the same option.
    boosting_type = 'dart',
    objective = 'regression',
    n_estimators = 100,
    learning_rate = 1,
    num_leaves = 45,
    max_depth = 7,
    min_child_samples = 300,
    min_child_weight = 0.001,
    subsample = 1,
    colsample_bytree = 0.4,
    reg_alpha = 10,
    reg_lambda = 50,
)
# Train on the current training split, then display R^2 on the current test split
ideal_model_full.fit(X_train, y_train)
ideal_model_full.score(X_test, y_test)



0.679004156824824

In [4]:
# Compare train vs. test R^2 for the tuned model. show_score reads the current
# X_train / X_test / y_train / y_test globals, so run this right after the full-data split.
show_score(ideal_model_full)

{'Training R^2': 0.6814927207509416, 'Test R^2': 0.679004156824824}

In [5]:
# Saving our model using pickle

import pickle

# Save the tuned full-data model to file. The context manager closes the file
# handle even if pickling raises, instead of leaving it open.
with open("lightGBM_full_data.pkl", "wb") as model_file:
    pickle.dump(ideal_model_full, model_file)



In [6]:
# Load a saved model
#loaded_pickle_model = pickle.load(open("lightGBM_full_data.pkl", "rb"))

## Split the data into new data (2006 and later) and old data (before 2006)

In [4]:
# Old data: cars with a model year strictly before 2006
old_data = used_cars_data.loc[lambda cars: cars["Year"] < 2006]

# New data: cars with a model year of 2006 or later (2006 itself is included)
new_data = used_cars_data.loc[lambda cars: cars["Year"] >= 2006]

In [5]:
# Show (rows, columns) for each subset to see how the rows divide between them
old_data.shape, new_data.shape

((31892, 66), (1141956, 66))

## Predictions on new data

In [14]:
# Let's create a pipeline
# Fits a baseline LightGBM pipeline on `new_data` (created in the split cell) and scores it.
# Getting data ready
import pandas as pd
from sklearn.pipeline import Pipeline

# Modelling
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Setup random seed
import numpy as np
np.random.seed(42)

# Creating a modelling pipeline (a single LightGBM regressor step with default parameters)
model = Pipeline(steps = [("model", lgb.LGBMRegressor())])

# Split data: "Price" is the target, every other column is a feature
X = new_data.drop("Price", axis =1)
y = new_data["Price"]
# Pass the seed explicitly so the 80/20 split does not depend on how much of the
# global NumPy random stream other cells consumed before this line runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Fit and score the model (the displayed value is R^2 on the held-out test split)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.668713262736466

In [16]:
# Train vs. test R^2 for the baseline pipeline. show_score reads the current
# X_train / X_test / y_train / y_test globals, so run this right after the new-data split.
show_score(model)

{'Training R^2': 0.669140255450589, 'Test R^2': 0.668713262736466}

In [18]:
# Refit LightGBM on the new-data split using the tuned hyperparameters
ideal_model_new = lgb.LGBMRegressor(
    # Use the estimator's documented `boosting_type` argument. Passing the alias
    # `boosting` through **kwargs leaves `boosting_type` at its default 'gbdt',
    # so the estimator carries two conflicting settings for the same option.
    boosting_type = 'dart',
    objective = 'regression',
    n_estimators = 100,
    learning_rate = 1,
    num_leaves = 45,
    max_depth = 7,
    min_child_samples = 300,
    min_child_weight = 0.001,
    subsample = 1,
    colsample_bytree = 0.4,
    reg_alpha = 10,
    reg_lambda = 50,
)
# Train on the current training split, then display R^2 on the current test split
ideal_model_new.fit(X_train, y_train)
ideal_model_new.score(X_test, y_test)



0.6696256701138095

In [19]:
# Saving our model using pickle

import pickle

# Save the tuned new-data model to file. The context manager closes the file
# handle even if pickling raises, instead of leaving it open.
with open("lightGBM_new_data.pkl", "wb") as model_file:
    pickle.dump(ideal_model_new, model_file)




## Predictions on old data

In [19]:
# Let's create a pipeline for old data
# Fits a baseline LightGBM pipeline on `old_data` (created in the split cell) and scores it.
# Getting data ready
import pandas as pd
from sklearn.pipeline import Pipeline

# Modelling
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV

# Setup random seed
import numpy as np
np.random.seed(42)

# Creating a modelling pipeline (a single LightGBM regressor step with default parameters)
model = Pipeline(steps = [("model", lgb.LGBMRegressor())])

# Split the data: "Price" is the target, every other column is a feature
X = old_data.drop("Price", axis = 1)
y = old_data["Price"]
# Pass the seed explicitly so the 80/20 split does not depend on how much of the
# global NumPy random stream other cells consumed before this line runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Fit the model and score the model (the displayed value is R^2 on the held-out test split)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.6095045414773557

The score on the old data is lower than the score on the new data.

In [7]:
# Train vs. test R^2 for the baseline pipeline. show_score reads the current
# X_train / X_test / y_train / y_test globals, so run this right after the old-data split.
show_score(model)

{'Training R^2': 0.6345118119563075, 'Test R^2': 0.6095045414773557}

In [47]:
# Refit LightGBM on the old-data split using hyperparameters chosen for the old data
ideal_model_old = lgb.LGBMRegressor(
    # Use the estimator's documented `boosting_type` argument. Passing the alias
    # `boosting` through **kwargs leaves `boosting_type` at its default 'gbdt',
    # so the estimator carries two conflicting settings for the same option.
    boosting_type = 'dart',
    objective = 'regression',
    n_estimators = 100,
    learning_rate = 1,
    num_leaves = 32,
    max_depth = 7,
    min_child_samples = 10,
    min_child_weight = 0.01,
    subsample = 1,
    colsample_bytree = 0.4,
    reg_alpha = 0,
    reg_lambda = 70,
)
# Train on the current training split, then display R^2 on the current test split
ideal_model_old.fit(X_train, y_train)
ideal_model_old.score(X_test, y_test)



0.5953156777803308

In [48]:
# Saving our model using pickle

import pickle

# Save the hyperparameter-tuned old-data model. The previous code pickled `model`
# (the default-parameter pipeline) into this "with_hyperparameter" file, so the
# saved artefact did not match its name or the other two save cells.
# NOTE(review): the tuned model scored slightly lower than the default pipeline
# on the old-data test split -- confirm which model should actually be shipped.
with open("lightGBM_old_data_with_hyperparameter.pkl", "wb") as model_file:
    pickle.dump(ideal_model_old, model_file)


