In [33]:
!pip install pycaret



In [34]:
import pandas as pd

In [35]:
# import our data

data = pd.read_csv('/content/preprocessed_data.csv')
data.head()

Unnamed: 0,sale_year,sale_month,sale_day,Holiday,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,2006,11,16,False,1139246,66000,999089,3157,121,3.0,...,1,1,1,1,1,1,1,1,1,1
1,2004,3,26,False,1139248,57000,117657,77,121,3.0,...,1,1,1,1,1,1,1,1,1,1
2,2004,2,26,False,1139249,10000,434808,7009,121,3.0,...,0,0,0,0,0,0,0,0,0,0
3,2011,5,19,False,1139251,38500,1026470,332,121,3.0,...,0,0,0,0,0,0,0,0,0,0
4,2009,7,23,False,1139253,11000,1057373,17311,121,3.0,...,2,2,2,2,2,2,2,2,2,2


In [36]:
data['SalePrice'].describe()

count    401125.000000
mean      31099.712848
std       23036.898502
min        4750.000000
25%       14500.000000
50%       24000.000000
75%       40000.000000
max      142000.000000
Name: SalePrice, dtype: float64

# sampling the data

In [37]:
# make groups of my SalePrice columns for stratified my data on it

bins = [4750 ,14500, 24000, 40000, 142000]

# make new columns
data['groups']  = pd.cut(data['SalePrice'], bins = bins)

# see what happen
data['groups'].head()

0    (40000, 142000]
1    (40000, 142000]
2      (4750, 14500]
3     (24000, 40000]
4      (4750, 14500]
Name: groups, dtype: category
Categories (4, interval[int64, right]): [(4750, 14500] < (14500, 24000] < (24000, 40000] <
                                         (40000, 142000]]

In [38]:
# let's take a sample from our main data
# get weights from the groups
weights = data['groups'].value_counts()
# sample our data
sample_data = data.sample(frac = .15, weights=weights)
# see the distepution
sample_data['groups'].value_counts()
# drop groups columns from main data
data.drop('groups', axis = 1, inplace = True)


In [39]:
sample_data.drop('groups', axis = 1, inplace = True)

In [40]:
sample_data =  sample_data.dropna()
sample_data.isnull().sum()

sale_year                   0
sale_month                  0
sale_day                    0
Holiday                     0
SalesID                     0
SalePrice                   0
MachineID                   0
ModelID                     0
datasource                  0
auctioneerID                0
YearMade                    0
MachineHoursCurrentMeter    0
UsageBand                   0
fiModelDesc                 0
fiBaseModel                 0
fiSecondaryDesc             0
fiModelSeries               0
fiModelDescriptor           0
ProductSize                 0
fiProductClassDesc          0
state                       0
ProductGroup                0
ProductGroupDesc            0
Drive_System                0
Enclosure                   0
Forks                       0
Pad_Type                    0
Ride_Control                0
Stick                       0
Transmission                0
Turbocharged                0
Blade_Extension             0
Blade_Width                 0
Enclosure_

# split data into target and features and then split it into train and test data

In [41]:
# split data into target and features
target = sample_data['SalePrice']
features = sample_data.drop('SalePrice', axis = 1)

In [42]:
# import function for split
from sklearn.model_selection import train_test_split
# split the data by ratio .2
x_trian, x_valid, y_trian, y_valid = train_test_split(features,target, test_size=.2, shuffle=True, random_state = 100)


# try the most of ML regression models and take the best from them

In [31]:
#init setup
from pycaret.regression import *
clf1 = setup(data = sample_data, target = 'SalePrice')

# compare models
best = compare_models(sort = 'RMSLE')

Unnamed: 0,Description,Value
0,Session id,1106
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(60153, 56)"
4,Transformed data shape,"(60153, 56)"
5,Transformed train set shape,"(42107, 56)"
6,Transformed test set shape,"(18046, 56)"
7,Numeric features,54
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,5990.0456,88228466.0325,9389.7652,0.8377,0.2736,0.2148,27.257
xgboost,Extreme Gradient Boosting,6071.387,82663465.4963,9090.1542,0.8479,0.2793,0.2186,9.757
et,Extra Trees Regressor,6264.9743,96457821.313,9818.4553,0.8226,0.2866,0.2287,21.275
lightgbm,Light Gradient Boosting Machine,6917.1366,106223389.148,10304.1465,0.8046,0.3097,0.2579,1.93
dt,Decision Tree Regressor,8122.767,171453534.2593,13090.7681,0.6845,0.3692,0.2781,0.696
gbr,Gradient Boosting Regressor,10069.1802,203726786.8474,14270.9393,0.6252,0.4186,0.3896,7.546
knn,K Neighbors Regressor,14306.1102,414903187.7325,20368.1576,0.2367,0.5647,0.5424,1.073
huber,Huber Regressor,16088.3258,554774737.2738,23552.9015,-0.0206,0.6273,0.5379,0.503
lasso,Lasso Regression,16820.3974,493381705.94,22211.8292,0.0923,0.6455,0.7028,0.463
ridge,Ridge Regression,16820.3547,493381167.443,22211.8173,0.0923,0.6455,0.7028,0.189


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

we take the **`RandomForest` Regressor and `XGboost`** and fine tune it.

# fine tune the models we select with gridsearch

In [44]:
# import the function we need it
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import 	XGBRegressor


fine tune random forest regressor

In [47]:
# init the regressor

rf = RandomForestRegressor()

# define the parameters

parameters = {'n_estimators': [100, 150, 200, 250, 300],
              'max_depth': [1,2,3,4]}

# define grid search
clf = GridSearchCV(rf, parameters)

# fit grid search
clf.fit(x_trian, y_trian)

In [49]:
import numpy as np
from sklearn.metrics import mean_squared_log_error
def RMSLE(y_true, y_pred):
      return np.sqrt(mean_squared_log_error(y_true, y_pred))
# make prediction and see the evaluation metric
y_pred = clf.predict(x_trian)
# see evaluation metrics
print(RMSLE(y_trian, y_pred))

0.5447998440362296
