Note: in the `imports.core.py` file, uncomment lines 24 and 29-31, and install `fastprogress`.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *



In [3]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

sns.set_style('darkgrid')

In [4]:
file = '../data/bluebook-for-bulldozers/Train.csv'

In [5]:
df_raw = pd.read_csv(file, parse_dates=['saledate'], low_memory=False)

## A glance at the properties of the data set

In [6]:
print('Feature values of the first five samples in the data set :')
df_raw.head()

Feature values of the first five samples in the data set :


Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,2006-11-16,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,2004-03-26,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2004-02-26,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,2011-05-19,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,2009-07-23,...,,,,,,,,,,


In [7]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   SalesID                   401125 non-null  int64         
 1   SalePrice                 401125 non-null  int64         
 2   MachineID                 401125 non-null  int64         
 3   ModelID                   401125 non-null  int64         
 4   datasource                401125 non-null  int64         
 5   auctioneerID              380989 non-null  float64       
 6   YearMade                  401125 non-null  int64         
 7   MachineHoursCurrentMeter  142765 non-null  float64       
 8   UsageBand                 69639 non-null   object        
 9   saledate                  401125 non-null  datetime64[ns]
 10  fiModelDesc               401125 non-null  object        
 11  fiBaseModel               401125 non-null  object        
 12  fi

In [8]:
print(f'The data set contain {df_raw.shape[0]} samples and {df_raw.shape[1]} features')

The data set contain 401125 samples and 53 features


In [9]:
print(f'Here is a list of the features present in the data set {df_raw.columns}')

Here is a list of the features present in the data set Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'saledate', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
       'fiModelSeries', 'fiModelDescriptor', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroup', 'ProductGroupDesc',
       'Drive_System', 'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
       'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
       'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
       'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control', 'Tire_Size',
       'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow',
       'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb',
       'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type',
       'Travel_Controls', 'Differential_Type', 'Steering_Controls'],
      dtype='object')

In [10]:
print(f'Description of the data set')
df_raw.describe()

Description of the data set


Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter
count,401125.0,401125.0,401125.0,401125.0,401125.0,380989.0,401125.0,142765.0
mean,1919713.0,31099.712848,1217903.0,6889.70298,134.66581,6.55604,1899.156901,3457.955
std,909021.5,23036.898502,440992.0,6221.777842,8.962237,16.976779,291.797469,27590.26
min,1139246.0,4750.0,0.0,28.0,121.0,0.0,1000.0,0.0
25%,1418371.0,14500.0,1088697.0,3259.0,132.0,1.0,1985.0,0.0
50%,1639422.0,24000.0,1279490.0,4604.0,132.0,2.0,1995.0,0.0
75%,2242707.0,40000.0,1468067.0,8724.0,136.0,4.0,2000.0,3025.0
max,6333342.0,142000.0,2486330.0,37198.0,172.0,99.0,2013.0,2483300.0


## Preparing the data for modelling

In [11]:
# Work with the natural log of the sale price from here on.
# NOTE: this overwrites the column in place, so re-running this cell applies the log a second time.
df_raw['SalePrice'] = np.log(df_raw['SalePrice'])

The following code splits the data set into features and target: `X` holds the predictor values and `y` holds the target value (price).

In [12]:
X, y = df_raw.drop('SalePrice', axis=1), df_raw['SalePrice'].copy()
X.shape, y.shape

((401125, 52), (401125,))

Due to computational constraints, the model is only run on 20000 samples (`sample_size`) of the whole data set.

In [13]:
sample_size = 20000  # using only 20000 sample points for this exercise, to limit computational expense

### First model without preprocessing the data

In [14]:
# Demonstration: fitting on the raw frame fails because string-valued columns
# (e.g. a UsageBand value of 'Low') cannot be converted to floats.
# The error is caught and printed so the notebook can still run top to bottom.
try:
    rf = RandomForestRegressor(max_samples=sample_size).fit(X, y)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: could not convert string to float: 'Low'

# Practice code (scratch work)

In [None]:
# # extracting features from the date attribute
# def add_date(df, name):
#     fld = df[name]
#     target = re.sub('[Dd]ate$', '', name)
#     for n in ['Year', 'Month', 'Day']:
#         df[target+n] = getattr(fld.dt, n.lower())
#     df.drop(name, axis=1, inplace=True)
# add_date(df_raw, 'saledate')

In [None]:
# df_raw

In [None]:
# df_raw['UsageBand'] = pd.Categorical(df_raw['UsageBand']).codes+1
# df_raw['UsageBand']

In [None]:
# df_raw.isnull().sum()

In [None]:
# # Changing object to categorical data types
# for column, content in df_raw.items():
#     if not pd.api.types.is_numeric_dtype(content):
#         df_raw[column] = content.astype('category').cat.as_ordered()

In [None]:
# # dealing with null values: fill missing numeric values with the column median
# for column, content in df_raw.items():
#     if pd.api.types.is_numeric_dtype(content):
#         if pd.isnull(content).sum():
#             df_raw[column+'_na'] = pd.isnull(content)
#             df_raw[column] = content.fillna(content.median())

In [None]:
# df_raw

In [None]:
# # Dealing with categorical values
# for column, content in df_raw.items():
#     if not pd.api.types.is_numeric_dtype(content):
#         if pd.isnull(content).sum():                                     
#             df_raw[column+'_na'] = pd.isnull(content)
#             df_raw[column] = pd.Categorical(content).codes+1

In [None]:
# df_raw

In [None]:
# display(df_raw.iloc[4:, 9::2].tail(20).T)

In [None]:
# df_raw.info()

In [None]:
# X, y = df_raw.drop('SalePrice', axis=1), df_raw['SalePrice'].copy()

In [None]:
# rf = RandomForestRegressor(max_samples=10000, n_jobs=-1).fit(X, y)
# rf.score(X, y)

In [None]:
#  Splitting the data into train and validation sets

In [None]:
# test_size = 12000
# train_size = len(X) - test_size

# def split_data(df, size):
#     #splits data in train and validation sets
#     return df[:size].copy(), df[size:].copy()

In [None]:
# X_train, X_valid = split_data(X, train_size)
# y_train, y_test = split_data(y, train_size)

In [None]:
# X_train.shape, X_valid.shape, y_train.shape

In [None]:
# rf= RandomForestRegressor(max_samples=20000, n_jobs=-1).fit(X_train, y_train)
# rf.score(X_train, y_train)

In [None]:
# X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.30)

In [None]:
# rf= RandomForestRegressor(max_samples=10000, n_jobs=-1).fit(X_trn, y_trn)
# rf.score(X_trn, y_trn)

# Practice code ends here

In [None]:
df_raw['saledate']

In [None]:
# Derive date-based feature columns from saledate using fastai's add_datepart.
# The return value is not used, so this relies on the helper modifying df_raw in place
# (presumably replacing 'saledate' with its date parts — verify against the next cell's output).
add_datepart(df_raw, 'saledate')

In [None]:
df_raw

In [None]:
# Dealing with categorical variables via fastai's train_cats.
# The return value is not used, so this relies on the helper modifying df_raw in place;
# later cells use the `.cat` accessor on UsageBand, so it presumably converts the string
# columns to pandas categoricals — TODO confirm.
train_cats(df_raw)
df_raw

In [None]:
# Fix the category order of UsageBand to High, Medium, Low so its integer codes follow that order.
# The `inplace` argument of `set_categories` was deprecated in pandas 1.3 and removed in 2.0,
# so assign the returned Series back to the column instead. `ordered` is left unset, which
# keeps the column's existing ordered flag unchanged.
df_raw['UsageBand'] = df_raw['UsageBand'].cat.set_categories(['High', 'Medium', 'Low'])

In [None]:
df_raw['UsageBand'].cat.codes

In [None]:
print(f'After preprocessing out data set, it now contains {df_raw.shape[0]} samples and {df_raw.shape[1]} features')

In [None]:
# saving the file to feather-format
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/df_raw1')

In [None]:
df_raw = pd.read_feather('tmp/df_raw1')

In [None]:
df_raw

In [None]:
# Run fastai's proc_df with 'SalePrice' as the target field; the first two returned values are
# used as the feature frame and the target, the third is discarded.
# NOTE(review): subset=sample_size presumably draws a random sample of that many rows, and no
# random seed is set in this notebook, so results may differ between runs — confirm.
df, y, _ = proc_df(df_raw, 'SalePrice', subset=sample_size)
df.shape

## Building a second model

In [None]:
# Fit a forest with default hyper-parameters and report its R^2 on the same data it was trained on
rf = RandomForestRegressor()
rf.fit(df, y)
train_r2 = round(rf.score(df, y), 3)
print(f'Train score(R^2) : {train_r2}')

### splitting into train and validation sets

In [None]:
# splitting the data: the last n_size rows are held out for validation,
# everything before them is used for training
n_size = 2000  # number of validation rows
trn_size = len(df) - n_size  # number of training rows

def split(df, n):
    """Cut `df` at position `n`.

    Returns a pair: the first `n` items (`df[:n]`) and everything after them (`df[n:]`).
    Works on anything that supports slicing.
    """
    head = df[:n]
    tail = df[n:]
    return head, tail

# Features and targets are cut at the same position (trn_size) so their rows stay aligned.
X_train, X_valid = split(df, trn_size)
y_train, y_valid = split(y, trn_size)

# Sanity check of the resulting sizes
X_train.shape, X_valid.shape, y_train.shape


In [None]:
def rmse(predictions, actuals):
    """Root mean squared error between predicted and true target values.

    Parameters
    ----------
    predictions : predicted values, i.e. model.predict(X)
    actuals : true values of the target variable (y)

    Returns
    -------
    The square root of `mean_squared_error` computed on the two inputs.
    """
    mse = mean_squared_error(predictions, actuals)
    return np.sqrt(mse)

def print_score(model):
    """Print RMSE and R^2 of `model` on the train and validation sets.

    Uses the notebook-level X_train, y_train, X_valid and y_valid.
    If the model has an `oob_score_` attribute, that score is printed first.
    Nothing is returned.
    """
    train_rmse = rmse(model.predict(X_train), y_train)
    valid_rmse = rmse(model.predict(X_valid), y_valid)
    train_r2 = model.score(X_train, y_train)
    valid_r2 = model.score(X_valid, y_valid)

    if hasattr(model, 'oob_score_'):
        print(f'OOB score of the model : {model.oob_score_}')

    print(f'RMSE of the train set : {train_rmse}')
    print(f'RMSE of the validation set : {valid_rmse}')
    print(f'R^2 score of the train set : {train_r2}')
    print(f'R^2 score of the validation set : {valid_r2}')

In [None]:
rf = RandomForestRegressor(n_jobs=-1).fit(X_train, y_train)
print_score(rf)

From this model we get an RMSE on the validation set that would place us in the top 29% of the Kaggle public leaderboard. However, the R^2 score on the validation set is much lower than on the training set, which suggests that the model is overfitting.

#  Building a single tree

In [None]:
?? draw_tree

In [None]:
# A "forest" of a single tree (n_estimators=1), capped at depth 3 and trained without
# bootstrap resampling (bootstrap=False), kept this small so the tree can be drawn below.
rf = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1).fit(X_train, y_train)
print_score(rf)

In [None]:
draw_tree(rf.estimators_[0], df, precision=3, ratio=0.9)