In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

In [3]:
df_raw = pd.read_csv("data/bulldozer/train.csv", low_memory=False, parse_dates=["saledate"])

In [4]:
df_raw.head(1)

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,2006-11-16,...,,,,,,,,,Standard,Conventional


In [5]:
from utils.explore import *

### Display All

In [29]:
display_all(df_raw.head(5).T)

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,11.0974,10.9508,9.21034,10.5584,9.30565
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3,3,3,3,3
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68,4640,2838,3486,722
UsageBand,Low,Low,High,High,Medium
fiModelDesc,521D,950FII,226,PC120-6E,S175


### Take log of target variable

In [7]:
df_raw["SalePrice"] = np.log(df_raw["SalePrice"])

### Convert date to dateparts

In [8]:
from fastailite.structured import *

View columns containing the substring "date"

In [9]:
df_raw.columns[df_raw.columns.str.contains(r".*[Dd]ate.*")]

Index([u'saledate'], dtype='object')

In [10]:
add_datepart(df_raw, "saledate")

View new columns

In [11]:
df_raw[df_raw.columns[df_raw.columns.str.contains(r".*sale.*")]].head(1).T

Unnamed: 0,0
saleYear,2006
saleMonth,11
saleWeek,46
saleDay,16
saleDayofweek,3
saleDayofyear,320
saleIs_month_end,False
saleIs_month_start,False
saleIs_quarter_end,False
saleIs_quarter_start,False


### Convert string objects to categorical variables

View columns consisting of string objects

In [21]:
cat_cols = df_raw.columns[df_raw.dtypes == "object"]

In [30]:
df_raw[cat_cols].tail(5).T

Unnamed: 0,401120,401121,401122,401123,401124
UsageBand,,,,,
fiModelDesc,35NX2,35NX2,35NX2,30NX,30NX
fiBaseModel,35,35,35,30,30
fiSecondaryDesc,NX,NX,NX,NX,NX
fiModelSeries,2,2,2,,
fiModelDescriptor,,,,,
ProductSize,Mini,Mini,Mini,Mini,Mini
fiProductClassDesc,"Hydraulic Excavator, Track - 3.0 to 4.0 Metric...","Hydraulic Excavator, Track - 3.0 to 4.0 Metric...","Hydraulic Excavator, Track - 3.0 to 4.0 Metric...","Hydraulic Excavator, Track - 2.0 to 3.0 Metric...","Hydraulic Excavator, Track - 2.0 to 3.0 Metric..."
state,Maryland,Maryland,Maryland,Florida,Florida
ProductGroup,TEX,TEX,TEX,TEX,TEX


In [35]:
df_raw.UsageBand.value_counts()

Medium    33985
Low       23620
High      12034
Name: UsageBand, dtype: int64

In [40]:
df_raw.UsageBand[:5]

0       Low
1       Low
2      High
3      High
4    Medium
Name: UsageBand, dtype: object

In [48]:
df_raw.UsageBand.astype("category")[:5]

0       Low
1       Low
2      High
3      High
4    Medium
Name: UsageBand, dtype: category
Categories (3, object): [High < Low < Medium]

In [46]:
df_raw.UsageBand.astype("category").cat.as_ordered()[:5]

0       Low
1       Low
2      High
3      High
4    Medium
dtype: category
Categories (3, object): [High < Low < Medium]

In [44]:
train_cats(df_raw)

In [50]:
df_raw.UsageBand.cat.codes[:5]

0    1
1    1
2    0
3    0
4    2
dtype: int8

In [51]:
df_raw.UsageBand.cat.categories

Index([u'High', u'Low', u'Medium'], dtype='object')

In [52]:
# Re-order the categories so the integer codes follow High < Medium < Low.
# The result is assigned back instead of using `inplace=True`, which newer
# pandas releases no longer accept on `cat.set_categories`.
df_raw["UsageBand"] = df_raw["UsageBand"].cat.set_categories(["High", "Medium", "Low"], ordered=True)

In [53]:
df_raw.UsageBand.cat.codes[:5]

0    2
1    2
2    0
3    0
4    1
dtype: int8

### Save work done

In [62]:
%mkdir -p tmp
df_raw.to_feather("tmp/bulldozer-raw")

In [64]:
df_raw = pd.read_feather('tmp/bulldozer-raw')

### Redefine proc_df

In [91]:
import re
import sklearn
import warnings

from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler

In [92]:
def get_sample(df, n):
    """ Draws n rows from df at random, without replacement.

    The selected rows keep their relative order from df, because the
    randomly chosen positions are sorted before they are looked up.

    Parameters:
    -----------
    df: The pandas data frame to draw rows from.
    n: How many rows to draw.

    Returns:
    --------
    return value: A copy of the n selected rows of df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    >>> get_sample(df, 2)   # which rows are picked varies between calls
       col1 col2
    1     2    b
    2     3    a
    """
    shuffled = np.random.permutation(len(df))
    positions = list(shuffled[:n])
    positions.sort()
    picked = df.iloc[positions]
    return picked.copy()


def fix_missing(df, col, name, na_dict):
    """ Fills the gaps of a numeric column and records where they were.

    Non-numeric columns are ignored. For a numeric column that has missing
    values, or whose name is already a key of na_dict, a boolean column
    {name}_na is added to df and the missing entries of df[name] are
    replaced by a fill value.

    Parameters:
    -----------
    df: The data frame that will be changed in place.

    col: The column of data whose missing values should be filled.

    name: The name under which the filled column is stored in df; the
        indicator column is stored as {name}_na.

    na_dict: A dictionary mapping column names to fill values. If name is
        a key, its value is used as the fill value and the {name}_na
        column is always created. If name is not a key, the median of
        col is used, and nothing happens at all when col has no missing
        data. The fill value that was used is written back to
        na_dict[name].

    Returns:
    --------
    return value: The same na_dict object that was passed in.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1     2    2    True
    2     3    2   False

    A column without missing data that is not listed in na_dict is left
    alone:

    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col2'], 'col2', {})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    A fill value supplied through na_dict takes precedence over the median:

    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1   500    2    True
    2     3    2   False
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing = pd.isnull(col)
    known = name in na_dict
    if not (known or missing.sum()):
        return na_dict

    df[name + '_na'] = missing
    fill_value = na_dict[name] if known else col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict


def scale_vars(df, mapper):
    """ Standardizes the numeric columns of df in place.

    Parameters:
    -----------
    df: The data frame whose columns are overwritten with the mapper's
        output.

    mapper: A fitted DataFrameMapper to reuse, or None. When None, a new
        mapper with one StandardScaler per numeric column of df is built
        and fitted on df.

    Returns:
    --------
    return value: The mapper that was applied (the one passed in, or the
        newly fitted one).

    Note: calling this function installs a warnings filter that ignores
    sklearn's DataConversionWarning; the filter is not removed afterwards.
    """
    warnings.filterwarnings('ignore',
                            category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        numeric_cols = [name for name in df.columns
                        if is_numeric_dtype(df[name])]
        steps = [([name], StandardScaler()) for name in numeric_cols]
        mapper = DataFrameMapper(steps).fit(df)
    # transform first: the assignment target below is read after it has run
    scaled = mapper.transform(df)
    df[mapper.transformed_names_] = scaled
    return mapper


def numericalize(df, col, name, max_n_cat):
    """ Replaces a categorical column by its integer codes, shifted by one.

    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes
        of col.

    col: The categorical column to encode. Numeric columns are left
        untouched.

    name: The column name under which the integer codes are stored in df.

    max_n_cat: Only columns with MORE than max_n_cat distinct values are
        converted to integer codes; columns with max_n_cat or fewer
        distinct values are left as they are. If max_n_cat is None, every
        non-numeric column is converted.

    The stored value is `col.cat.codes + 1`, so the first category becomes
    1. pandas uses the code -1 for missing values, which therefore end up
    as 0.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> numericalize(df, df['col2'], 'col3', None)

       col1 col2 col3
    0     1    a    1
    1     2    b    2
    2     3    a    1
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and col.nunique() <= max_n_cat:
        return
    df[name] = col.cat.codes + 1


def proc_df(df, y_fld, skip_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):

    """ proc_df takes a data frame df, splits off the response variable, and
    changes the rest into an entirely numeric dataframe. All work is done on
    a copy, so the df passed in is not modified.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable.

    skip_flds: A list of fields that are dropped from df.

    do_scale: Boolean. If True, the numeric columns are standardized with
        scale_vars and the mapper is appended to the returned list.

    na_dict: A dictionary mapping column names to the value used to fill
        their missing data. Columns listed here always get a {name}_na
        column; other numeric columns get one (together with a median
        fill) only if they contain missing values. If a dictionary is
        passed in, it is updated in place with the fill values used.

    preproc_fn: A function that is called with the copy of df before the
        response variable is split off. Its return value is ignored, so it
        has to modify the data frame in place.

    max_n_cat: Non-numeric columns with more than max_n_cat distinct values
        are replaced by integer codes; the remaining non-numeric columns
        are broken into dummy columns. If None, every non-numeric column
        is replaced by integer codes.

    subset: If given, a random sample of this many rows is taken from df
        before processing.

    mapper: If do_scale is set as True, the mapper holds the values used
        for scaling the variables during training time (mean and standard
        deviation). Pass the mapper obtained on the training set to apply
        the same scaling to other data; with None a new one is fitted.

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the
            response variable and is entirely numeric.

        y: y is the response variable.

        nas: a dictionary of the columns whose missing values were filled,
            and the associated fill value (the median unless supplied).

        mapper: Only present when do_scale is True. A DataFrameMapper which
            stores the mean and standard deviation of the corresponding
            continuous variables, to be used for scaling at test time.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> x, y, nas = proc_df(df, 'col1')
    >>> x

       col2
    0     1
    1     2
    2     1

    A stand-alone illustration of what a DataFrameMapper does
    (LabelBinarizer comes from sklearn.preprocessing):

    >>> data = pd.DataFrame({
    ...     'pet': ["cat", "dog", "dog", "fish", "cat", "dog", "cat", "fish"],
    ...     'children': [4., 6, 3, 3, 2, 3, 5, 4],
    ...     'salary': [90, 24, 44, 27, 32, 59, 36, 27]})

    >>> mapper = DataFrameMapper([('pet', LabelBinarizer()),
    ...                           (['children'], StandardScaler())])

    >>> np.round(mapper.fit_transform(data.copy()), 2)

    8x4 array (three one-hot pet columns, then the scaled children column):
    1.0  0.0  0.0   0.21
    0.0  1.0  0.0   1.88
    0.0  1.0  0.0  -0.63
    0.0  0.0  1.0  -0.63
    1.0  0.0  0.0  -1.46
    0.0  1.0  0.0  -0.63
    1.0  0.0  0.0   1.04
    0.0  0.0  1.0   0.21
    """
    if not skip_flds: skip_flds = []
    if subset: df = get_sample(df, subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds + [y_fld], axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    # Iterate over a snapshot of the column names rather than
    # df.iteritems(): that method is gone from current pandas, and
    # fix_missing adds {name}_na columns to df while this loop is running.
    for n in list(df.columns): na_dict = fix_missing(df, df[n], n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper)
    # Second pass also sees the freshly added _na columns; they are boolean
    # and therefore skipped by numericalize.
    for n in list(df.columns): numericalize(df, df[n], n, max_n_cat)
    # Whatever is still non-numeric at this point is one-hot encoded.
    res = [pd.get_dummies(df, dummy_na=True), y, na_dict]
    if do_scale: res = res + [mapper]
    return res

### Do all the work in one step

In [66]:
# NOTE(review): this star import already appears earlier in the notebook.
# Run top to bottom, it executes after the "Redefine proc_df" cell and
# presumably rebinds proc_df / get_sample / fix_missing / numericalize to the
# library versions, so the call below would not use the local redefinitions.
# The recorded execution counts (this cell: 66, definitions: 92) show it ran
# before them interactively -- confirm which version is intended.
from fastailite.structured import *

In [93]:
df, y, nas = proc_df(df_raw, 'SalePrice')

In [98]:
display_all(df.head(1).T.iloc[:10])

Unnamed: 0,0
SalesID,1139246
MachineID,999089
ModelID,3157
datasource,121
auctioneerID,3
YearMade,2004
MachineHoursCurrentMeter,68
UsageBand,3
fiModelDesc,950
fiBaseModel,296


In [100]:
from sklearn.ensemble import RandomForestRegressor

In [101]:
%%time
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)

CPU times: user 2min 3s, sys: 4.55 s, total: 2min 7s
Wall time: 1min 7s


In [102]:
m.score(df, y)

0.98309727338307651

### Take the validation set into account

In [106]:
def split_val(df, n):
    """Cut df at position n and return (first n items, the rest) as copies."""
    head = df[:n].copy()
    tail = df[n:].copy()
    return head, tail

In [107]:
n_val = 12000
n_trn = len(df_raw) - n_val

raw_trn, raw_val = split_val(df_raw, n_trn)
X_trn, X_val = split_val(df, n_trn)
y_trn, y_val = split_val(y, n_trn)

In [112]:
import math

In [113]:
def rmse(y_pred, y):
    """Root of the mean squared difference between y_pred and y.

    Both arguments must support element-wise subtraction and the result
    must offer a .mean() method (e.g. numpy arrays).
    """
    residuals = y_pred - y
    mean_squared = (residuals ** 2).mean()
    return math.sqrt(mean_squared)

In [114]:
def rf_score(m, X_trn, y_trn, X_val, y_val):
    """Print RMSE and model score for the training and validation sets.

    m must be a fitted model offering predict(X) and score(X, y); the
    score is whatever m.score returns (R^2 for scikit-learn regressors
    such as the RandomForestRegressor used in this notebook).
    Nothing is returned.
    """
    rmse_trn = rmse(m.predict(X_trn), y_trn)
    rmse_val = rmse(m.predict(X_val), y_val)
    score_trn = m.score(X_trn, y_trn)
    score_val = m.score(X_val, y_val)
    template = "RMSE trn: {}\nRMSE val: {}\nScore trn: {}\nScore val: {}"
    print(template.format(rmse_trn, rmse_val, score_trn, score_val))

In [116]:
m = RandomForestRegressor(n_jobs=1)
m.fit(X_trn, y_trn)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [117]:
rf_score(m, X_trn, y_trn, X_val, y_val)

RMSE trn: 0.0904485419667
RMSE val: 0.252644151685
Score trn: 0.9829022892
Score val: 0.886010073691
