In [1]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
import math

In [2]:
# Directory prefix for the dataset files. It is concatenated directly with
# file names, so it must keep its trailing slash.
PATH = 'bluebook/'

In [3]:
# Load the training data. 'saledate' is parsed into datetime64 so date parts
# can be extracted from it; low_memory=False turns off pandas' chunked type
# inference for the file.
df = pd.read_csv(f'{PATH}Train.csv',low_memory=False,parse_dates=['saledate'])
df

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,2006-11-16,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,2004-03-26,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2004-02-26,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,2011-05-19,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,2009-07-23,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401120,6333336,10500,1840702,21439,149,1.0,2005,,,2011-11-02,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,
401121,6333337,11000,1830472,21439,149,1.0,2005,,,2011-11-02,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,
401122,6333338,11500,1887659,21439,149,1.0,2005,,,2011-11-02,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,
401123,6333341,9000,1903570,21435,149,2.0,2005,,,2011-10-25,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,


In [4]:
def apply_cats(df,train):
    """Re-encode columns of ``df`` using the categories already fixed on ``train``.

    For every column of ``df`` whose counterpart in ``train`` is categorical,
    the column is replaced in place by an ordered ``pd.Categorical`` built with
    ``train``'s category list, so both frames map a label to the same code.
    Labels absent from ``train`` become missing (code -1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; modified in place.
    train : pandas.DataFrame
        Frame whose categorical columns define the categories.

    Returns
    -------
    None
    """
    for n,c in df.items():
        # Columns that exist only in `df` have no reference categories: skip them.
        if n not in train.columns:
            continue
        # A Series exposes its type through `.dtype`; there is no `.datatype`.
        if train[n].dtype.name == 'category':
            df[n] = pd.Categorical(c,categories=train[n].cat.categories,ordered=True)
    

In [5]:
def train_cats(df):
    """Convert every text column of ``df`` to an ordered categorical, in place.

    Categories are whatever ``astype('category')`` infers from the column's
    values; missing values stay missing (code -1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; modified in place.

    Returns
    -------
    None
    """
    for n, c in df.items():
        # On pandas >= 2.0 `is_string_dtype` inspects the values of object
        # columns and rejects those holding NaN, which would silently skip
        # most text columns; accept plain object dtype as well.
        if is_string_dtype(c) or c.dtype == object:
            df[n]=c.astype('category').cat.as_ordered()

In [6]:
def add_datepart(df,dt_name,drop=True,time=False):
    """Expand a datetime column of ``df`` into numeric date-part columns, in place.

    Adds ``DateYear``, ``DateMonth``, ``DateWeek`` (ISO week number),
    ``DateDay`` and ``DateElapsed`` (whole seconds since 1970-01-01 UTC).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    dt_name : str
        Name of a datetime64 column in ``df``.
    drop : bool, default True
        Remove the source column from ``df`` once the parts are added.
    time : bool, default False
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    None
    """
    dt_column=df[dt_name]
    accessor=dt_column.dt
    for a in ('year','month','week','day'):
        if a == 'week' and hasattr(accessor,'isocalendar'):
            # `Series.dt.week` is deprecated and absent from pandas >= 2.0;
            # `isocalendar()` is its replacement.
            week=accessor.isocalendar()['week']
            # isocalendar() returns a nullable UInt32 column; convert back to a
            # plain numpy dtype (float with NaN when the column contains NaT).
            values=week.astype('float64' if week.isna().any() else 'int64')
        else:
            values=getattr(accessor,a)
        df['Date' + a.capitalize()] = values
    # Subtracting the epoch and dividing by one second gives the right answer
    # whatever the column's datetime resolution, unlike casting to int64 and
    # assuming nanoseconds.
    epoch=pd.Timestamp(0) if accessor.tz is None else pd.Timestamp(0,tz='UTC')
    df['Date' + 'Elapsed']=(dt_column-epoch)//pd.Timedelta(seconds=1)
    if drop:
        df.drop(dt_name,axis=1,inplace=True)

In [7]:
def numericalize(df,col,name):
    """Store the integer codes of a categorical column in ``df[name]``.

    Numeric columns are left alone. Any other column is read through its
    ``.cat`` accessor, and its codes shifted up by one are written to
    ``df[name]``, so a missing value (code -1) ends up as 0.
    """
    if is_numeric_dtype(col):
        return
    shifted_codes = col.cat.codes + 1
    df[name] = shifted_codes

In [8]:
def fix_missing(df,col,name,nan_dict,is_train):
    """Fill missing values of one numeric column of ``df``, in place.

    Non-numeric columns are ignored.

    Training mode (``is_train`` true): if the column has missing values, add a
    boolean ``<name>_na`` indicator, record the column median in
    ``nan_dict[name]`` and fill the gaps with it.

    Otherwise: if ``name`` is in ``nan_dict``, add the ``<name>_na`` indicator
    and fill with the recorded value; if not, fill with the median of
    ``df[name]`` itself and add no indicator.
    """
    if not is_numeric_dtype(col):
        return

    na_column = name + '_na'

    if is_train:
        if not pd.isnull(col).any():
            return
        df[na_column] = pd.isnull(col)
        nan_dict[name] = col.median()
        df[name] = col.fillna(nan_dict[name])
        return

    if name in nan_dict:
        df[na_column] = pd.isnull(col)
        filler = nan_dict[name]
    else:
        filler = df[name].median()
    df[name] = col.fillna(filler)


In [9]:
def proc_df(df,y_fld,nan_dict=None,is_train=True):
    """Split a frame into model inputs and target, filling and encoding columns.

    Works on a copy of ``df``. The ``y_fld`` column is extracted as an array
    and removed; every remaining column is then passed through
    ``fix_missing`` and ``numericalize``.

    Returns ``(features, y, nan_dict)`` when ``is_train`` is true and
    ``(features, y)`` otherwise. A ``nan_dict`` supplied by the caller is the
    object handed to ``fix_missing``; a new dict is created when it is None.
    """
    df = df.copy()
    y = df[y_fld].values
    df.drop(y_fld, axis=1, inplace=True)

    fill_values = {} if nan_dict is None else nan_dict

    for col_name, column in df.items():
        fix_missing(df, column, col_name, fill_values, is_train)
        numericalize(df, column, col_name)

    return (df, y, fill_values) if is_train else (df, y)


In [10]:
def split_train_val(df,n):
    """Cut ``df`` at position ``n`` and return copies of both pieces.

    The first element holds the first ``n`` items, the second the rest.
    """
    leading = df[:n]
    trailing = df[n:]
    return leading.copy(), trailing.copy()

In [29]:
def rmse(x,y):
    """Return the root of the mean squared difference between ``x`` and ``y``."""
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

In [30]:
def print_score(rfr):
    """Print RMSE and ``score`` of a fitted model on the train and validation sets.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_valid`` and
    ``y_valid`` variables, so the report reflects whatever they hold when the
    function is called.
    """
    train_rmse = rmse(rfr.predict(x_train), y_train)
    print(f'RMSE of train set {train_rmse}')

    valid_rmse = rmse(rfr.predict(x_valid), y_valid)
    print(f'RMSE of validation set {valid_rmse}')

    train_r2 = rfr.score(x_train, y_train)
    print(f'R^2 of train set {train_r2}')

    valid_r2 = rfr.score(x_valid, y_valid)
    print(f'R^2 of validtaion set {valid_r2}')
    

In [35]:
def get_samples(df,n):
    """Pick up to ``n`` rows of ``df`` at random, without repetition.

    Returns ``(idxs, sample)``: the chosen positional indices and a copy of
    the corresponding rows. Uses numpy's global random state.
    """
    shuffled_positions = np.random.permutation(len(df))
    idxs = shuffled_positions[:n]
    sample = df.iloc[idxs].copy()
    return idxs, sample


# RMSLE

In [11]:
# Replace SalePrice with its natural log, so errors computed on this column
# are errors on log prices.
# NOTE: this overwrites the column; running the cell a second time takes the
# log of the log.
df['SalePrice'] = np.log(df.SalePrice)


# Transforming Categorical Values

In [12]:
# Inspect UsageBand as loaded from the CSV, before any categorical conversion.
df['UsageBand']

0            Low
1            Low
2           High
3           High
4         Medium
           ...  
401120       NaN
401121       NaN
401122       NaN
401123       NaN
401124       NaN
Name: UsageBand, Length: 401125, dtype: object

In [13]:
# Convert the text columns of df to ordered categoricals (modifies df in place).
train_cats(df)

In [14]:
# Inspect UsageBand again: same values, now stored with a category dtype.
df['UsageBand']

0            Low
1            Low
2           High
3           High
4         Medium
           ...  
401120       NaN
401121       NaN
401122       NaN
401123       NaN
401124       NaN
Name: UsageBand, Length: 401125, dtype: category
Categories (3, object): ['High' < 'Low' < 'Medium']

In [15]:
# Replace the default alphabetical category order with an explicit one.
# The result is assigned back to the column: the `inplace` argument of
# `cat.set_categories` is deprecated and no longer exists in pandas >= 2.0,
# and it relied on df['UsageBand'] returning the frame's own Series.
df['UsageBand'] = df['UsageBand'].cat.set_categories(['High','Medium','Low'],ordered=True)
df['UsageBand'].cat.categories

Index(['High', 'Medium', 'Low'], dtype='object')

In [16]:
# Overwrite UsageBand with its category codes shifted by one (missing -> 0).
numericalize(df,df['UsageBand'],'UsageBand')

In [17]:
# Inspect UsageBand after numericalize: integer codes instead of labels.
df['UsageBand']

0         3
1         3
2         1
3         1
4         2
         ..
401120    0
401121    0
401122    0
401123    0
401124    0
Name: UsageBand, Length: 401125, dtype: int8

# Datetime column 

In [18]:
# Inspect the sale dates, parsed to datetime64 when the CSV was read.
df['saledate']

0        2006-11-16
1        2004-03-26
2        2004-02-26
3        2011-05-19
4        2009-07-23
            ...    
401120   2011-11-02
401121   2011-11-02
401122   2011-11-02
401123   2011-10-25
401124   2011-10-25
Name: saledate, Length: 401125, dtype: datetime64[ns]

In [19]:
# The .dt accessor exposes individual date components, here the year.
df['saledate'].dt.year

0         2006
1         2004
2         2004
3         2011
4         2009
          ... 
401120    2011
401121    2011
401122    2011
401123    2011
401124    2011
Name: saledate, Length: 401125, dtype: int64

In [20]:
# Expand saledate into Date* feature columns. With the default drop=True the
# original saledate column is removed from df, so this cell cannot be re-run
# on the same frame.
add_datepart(df,'saledate')
df

  df['Date' + a.capitalize()] = getattr(dt_column.dt,a)


Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,DateYear,DateMonth,DateWeek,DateDay,DateElapsed
0,1139246,11.097410,999089,3157,121,3.0,2004,68.0,3,521D,...,,,,Standard,Conventional,2006,11,46,16,1163635200
1,1139248,10.950807,117657,77,121,3.0,1996,4640.0,3,950FII,...,,,,Standard,Conventional,2004,3,13,26,1080259200
2,1139249,9.210340,434808,7009,121,3.0,2001,2838.0,1,226,...,,,,,,2004,2,9,26,1077753600
3,1139251,10.558414,1026470,332,121,3.0,2001,3486.0,1,PC120-6E,...,,,,,,2011,5,20,19,1305763200
4,1139253,9.305651,1057373,17311,121,3.0,2007,722.0,2,S175,...,,,,,,2009,7,30,23,1248307200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401120,6333336,9.259131,1840702,21439,149,1.0,2005,,0,35NX2,...,,,,,,2011,11,44,2,1320192000
401121,6333337,9.305651,1830472,21439,149,1.0,2005,,0,35NX2,...,,,,,,2011,11,44,2,1320192000
401122,6333338,9.350102,1887659,21439,149,1.0,2005,,0,35NX2,...,,,,,,2011,11,44,2,1320192000
401123,6333341,9.104980,1903570,21435,149,2.0,2005,,0,30NX,...,,,,,,2011,10,43,25,1319500800


In [21]:
# Print the name of every numeric column that still has missing values.
for col_name, values in df.items():
    if is_numeric_dtype(values) and values.isnull().sum():
        print(col_name)

auctioneerID
MachineHoursCurrentMeter


In [22]:
# Count the missing MachineHoursCurrentMeter readings.
df['MachineHoursCurrentMeter'].isnull().sum()

258360

In [23]:
# Hold out the last n_valid rows of df as the validation set; everything
# before them is the training set.
n_valid = 12000
n_train = len(df)-n_valid
raw_train,raw_valid = split_train_val(df,n_train)

In [39]:
# Build the training inputs and target; `nas` keeps the fill values recorded
# while processing the training rows.
x_train, y_train, nas = proc_df(raw_train,'SalePrice')

In [40]:
# Process the validation rows with the fill values recorded on the training
# rows (is_train=False, so nothing new is recorded).
x_valid, y_valid = proc_df(raw_valid,'SalePrice',nan_dict=nas,is_train=False)


# Feather Format

In [26]:
# Save the current state of df to tmp/bulldozers_1 in feather format,
# creating the tmp directory first if it does not exist.
import os 
os.makedirs('tmp',exist_ok=True)
df.to_feather('tmp/bulldozers_1')


# First Model

In [42]:
# Baseline: a single tree fitted without bootstrap resampling.
rfr = RandomForestRegressor(
    n_estimators=1,
    bootstrap=False,
    n_jobs=-1,
).fit(x_train, y_train)
print_score(rfr)

RMSE of train set 9.547041298505971e-17
RMSE of validation set 0.35318726989128113
R^2 of train set 1.0
R^2 of validtaion set 0.7772293062296964



# Second model
### With bootstrap and more estimators

In [43]:
# Forest of 50 trees with scikit-learn's default bootstrap setting.
rfr = RandomForestRegressor(
    n_estimators=50,
    n_jobs=-1,
).fit(x_train, y_train)
print_score(rfr)

RMSE of train set 0.077580200310398
RMSE of validation set 0.23885466399194233
R^2 of train set 0.987421275094115
R^2 of validtaion set 0.8981137856874353



# Working with subsamples

In [44]:
# Keep a random subsample of 3000 training rows and the matching targets.
# NOTE: this overwrites x_train and y_train, so every later cell (and
# print_score) works on the subsample; the full training set must be rebuilt
# with proc_df to get it back.
idxs, x_train = get_samples(x_train,3000)
y_train = y_train[idxs]

In [45]:
# Same 50-tree forest, fitted on whatever x_train / y_train currently hold.
rfr = RandomForestRegressor(
    n_estimators=50,
    n_jobs=-1,
).fit(x_train, y_train)
print_score(rfr)

RMSE of train set 0.1200915147116896
RMSE of validation set 0.3256740382698494
R^2 of train set 0.9699628202576899
R^2 of validtaion set 0.8105850504552681


# Random Feature selection model

In [41]:
# 50 trees, at least 5 samples per leaf, half of the features considered at
# each split, out-of-bag scoring enabled.
# NOTE(review): this cell's execution count (41) is lower than that of the
# subsampling cell (44), so the recorded scores presumably come from the full
# training set; a top-to-bottom run would fit on the 3000-row subsample.
# Confirm with Restart & Run All.
rfr = RandomForestRegressor(
    n_estimators=50,
    n_jobs=-1,
    min_samples_leaf=5,
    max_features=0.5,
    oob_score=True,
).fit(x_train, y_train)
print_score(rfr)

RMSE of train set 0.14550207143341146
RMSE of validation set 0.22917504594259974
R^2 of train set 0.9557540852283523
R^2 of validtaion set 0.9062043649523164


Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,fiBaseModel,...,Travel_Controls,Differential_Type,Steering_Controls,DateYear,DateMonth,DateWeek,DateDay,DateElapsed,auctioneerID_na,MachineHoursCurrentMeter_na
283928,1896527,1550891,6054,132,2.0,1994,0.0,0,1827,582,...,0,0,0,2001,11,46,17,1005955200,False,True
336793,2350252,1715086,4605,136,1.0,2004,1488.0,3,493,175,...,0,0,0,2009,3,12,19,1237420800,False,False
74769,1362538,1390343,7257,132,6.0,1980,0.0,0,1677,508,...,0,4,2,1996,6,25,18,835056000,False,True
196297,1634250,1199278,4806,132,2.0,1999,0.0,0,1370,403,...,6,0,0,2005,1,4,30,1107043200,False,True
87578,1391895,1094734,7008,132,1.0,1980,0.0,0,317,108,...,0,0,0,1994,3,12,22,764294400,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223182,1698366,1161755,15608,132,1.0,1998,0.0,0,4111,1462,...,0,0,0,2002,5,21,22,1022025600,False,True
142219,1508052,1085804,7464,132,4.0,1975,0.0,0,2163,696,...,6,0,0,2000,2,6,9,950054400,False,True
13187,1182487,1027653,10468,121,3.0,2005,1457.0,2,2496,775,...,0,0,0,2008,9,36,4,1220486400,False,False
170829,1598820,1082492,8202,132,11.0,1980,0.0,0,965,305,...,0,4,2,1993,11,45,11,752976000,False,True
