# Experimenting with simple models with a hold-out validation set

In [1]:
% matplotlib inline

In [2]:
import pandas as pd, numpy as np

In [3]:
seed = 2018

In [4]:
np.random.seed(seed)

## Data

In [5]:
LABEL = 'price_doc'

In [6]:
train_df = pd.read_csv("../data/train.csv", index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv("../data/test.csv", index_col='id', parse_dates=['timestamp'])
macro_df = pd.read_csv("../data/macro.csv", parse_dates=['timestamp'])

## Incorporate fixes into dataset

In [7]:
df_fixup = pd.read_excel('../data/BAD_ADDRESS_FIX.xlsx', index_col='id')

fix_train_indexset = set(df_fixup[df_fixup.index < train_df.index.max()].index)
bad_train_indexset = set(train_df[train_df.kremlin_km == train_df.kremlin_km.min()].index)
unfixed_set = bad_train_indexset.difference(fix_train_indexset)

train_df.drop(unfixed_set, inplace=True)

full_df = pd.concat([train_df, test_df])

for c in df_fixup.columns:
    full_df.loc[df_fixup.index, c] = df_fixup[c]
    
# Derive training and testing sets back from the fixed full dataframe...
df_fixed = full_df
df_fixed.reset_index(inplace=True)

## Imputation

In [8]:
impute_with_mode_cols = ['material', 'product_type', 'state', 'ID_'] # The categorical columns in our dataset

In [9]:
exclude_imputation_cols = ['price_doc'] # The testing set obviously has NaN price_doc values, so ignore this intentionally
# Get the columns that have NaN values.
impute_cols = [c for c in df_fixed.columns[df_fixed.isnull().any()].tolist() if c not in exclude_imputation_cols]
df_fixed_imputed = df_fixed.copy()

for col in impute_cols:
    # Check if we should impute this with the mode.
    impute_with_mode = False
    if "ID_" in col:
        impute_with_mode = True
    else:
        impute_with_mode = any(icol == col for icol in impute_with_mode_cols)
    
    if impute_with_mode:
        # Impute this column with the mode.
        mode = df_fixed[col].dropna().mode()[0]
        df_fixed_imputed[col] = df_fixed_imputed[col].fillna(value=mode)
    else:
        # Impute this column with the median.
        median = df_fixed[col].dropna().median()
        df_fixed_imputed[col] = df_fixed_imputed[col].fillna(value=median)

In [10]:
# Get the columns that have NaN values.
impute_cols = [c for c in macro_df.columns[macro_df.isnull().any()].tolist()]

In [11]:
macro_df_imputed = macro_df

# These columns store numbers as strings (aka. pandas objects) with commas, so they need some cleaning & parsing...
problematic_cols = ['child_on_acc_pre_school', 'modern_education_share', 'old_education_build_share']

for c in problematic_cols:
    macro_df_imputed[c].replace('#!', np.nan, inplace=True)
    macro_df_imputed[c] = macro_df_imputed.apply(lambda row: str(row['child_on_acc_pre_school']).replace(",","") if row['child_on_acc_pre_school'] else row['child_on_acc_pre_school'], 1)
    macro_df_imputed[c]  = macro_df_imputed[c].astype(np.float32)
    
for col in impute_cols:
    # Check if we should impute this with the mode.
    impute_with_mode = False
    if "ID_" in col:
        impute_with_mode = True
    else:
        impute_with_mode = any(icol == col for icol in impute_with_mode_cols)
    
    if impute_with_mode:
        # Impute this column with the mode.
        print 'Imputing column %s with mode.' % col
        mode = macro_df_imputed[col].dropna().mode()[0]
        macro_df_imputed[col] = macro_df_imputed[col].fillna(value=mode)
    else:
        # Impute this column with the median.
        median = macro_df_imputed[col].dropna().median()
        macro_df_imputed[col] = macro_df_imputed[col].fillna(value=median)

In [12]:
full_df_features = df_fixed_imputed.copy()

## Clean up data

In [13]:
full_df_features.loc[full_df_features['build_year'] == 20052009, 'build_year'] = 2007
full_df_features.loc[full_df_features['build_year'].isin([1, 0, 3, 20, 71, 215, 4965]), 'build_year'] = full_df_features['build_year'].mode()[0]
full_df_features.loc[full_df_features.state == 33, 'state'] = 5
# Some floors have values greater than the max_floor, so let's fix that.
full_df_features.loc[full_df_features.floor > full_df_features.max_floor, ['floor', 'max_floor']] = full_df_features.loc[full_df_features.floor > full_df_features.max_floor, ['floor', 'max_floor']].max(1)

## Some basic temporal features

In [14]:
full_df_features['timestamp_year'] = full_df_features.apply(lambda row: row['timestamp'].year, axis=1)
# The max year in the dataset is 2015. Test set only contains 2015 days.
# This feature will probably help models better generalize, since years closer to 2016 will have higher prices, etc.
full_df_features['timestamp_yearsuntil2016'] = 2016 - full_df_features['timestamp_year']

In [15]:
full_df_features['timestamp_year_bucketized_prepost2013'] = pd.cut(full_df_features['timestamp_year'], [full_df_features['timestamp_year'].min() - 1, 
                                                                                                        2013,
                                                                                                        full_df_features['timestamp_year'].max()])
full_df_features['timestamp_year_bucketized_pre2012post2013'] = pd.cut(full_df_features['timestamp_year'], [full_df_features['timestamp_year'].min() - 1,
                                                                                                            2012,
                                                                                                            2013,
                                                                                                            full_df_features['timestamp_year'].max()])

In [16]:
full_df_features['timestamp_yearsuntil2016_bucketized_prepost2013'] = pd.cut(full_df_features['timestamp_yearsuntil2016'], [full_df_features['timestamp_yearsuntil2016'].min() - 1, 
                                                                                                        3,
                                                                                                        full_df_features['timestamp_yearsuntil2016'].max()])
full_df_features['timestamp_yearsuntil2016_bucketized_pre2012post2013'] = pd.cut(full_df_features['timestamp_yearsuntil2016'], [full_df_features['timestamp_yearsuntil2016'].min() - 1,
                                                                                                            3,
                                                                                                            4,
                                                                                                            full_df_features['timestamp_yearsuntil2016'].max()])

In [17]:
SATURDAY, SUNDAY = 5, 6 # Pandas's weekday codes for the weekend
full_df_features['timestamp_dayofyear'] = full_df_features.apply(lambda row: row['timestamp'].dayofyear, axis=1)
"""
full_df_features['timestamp_dayofweek'] = full_df_features.apply(lambda row: row['timestamp'].dayofweek, axis=1)
full_df_features['timestamp_isweekday'] = full_df_features.apply(lambda row: row['timestamp'].weekday() not in (SATURDAY, SUNDAY), axis=1)
full_df_features['timestamp_dayofmonth'] = full_df_features.apply(lambda row: row['timestamp'].day, axis=1)
full_df_features['timestamp_daysinmonth'] = full_df_features.apply(lambda row: row['timestamp'].daysinmonth, axis=1)
full_df_features['timestamp_dayofyear'] = full_df_features.apply(lambda row: row['timestamp'].dayofyear, axis=1)
"""

"\nfull_df_features['timestamp_dayofweek'] = full_df_features.apply(lambda row: row['timestamp'].dayofweek, axis=1)\nfull_df_features['timestamp_isweekday'] = full_df_features.apply(lambda row: row['timestamp'].weekday() not in (SATURDAY, SUNDAY), axis=1)\nfull_df_features['timestamp_dayofmonth'] = full_df_features.apply(lambda row: row['timestamp'].day, axis=1)\nfull_df_features['timestamp_daysinmonth'] = full_df_features.apply(lambda row: row['timestamp'].daysinmonth, axis=1)\nfull_df_features['timestamp_dayofyear'] = full_df_features.apply(lambda row: row['timestamp'].dayofyear, axis=1)\n"

In [18]:
#full_df_features['timestamp_weekofyear'] = full_df_features.apply(lambda row: row['timestamp'].week, axis=1)

In [19]:
full_df_features['timestamp_month'] = full_df_features.apply(lambda row: row['timestamp'].month, axis=1)

## Split back into training and testing sets.

In [20]:
full_df_features_with_macro = pd.merge(full_df_features, macro_df_imputed, on='timestamp')
full_df_features_with_macro.shape

(38080, 399)

In [21]:
### Drop timestamp column to prevent overfitting
full_df_features_with_macro = full_df_features_with_macro.drop('timestamp', axis=1)

In [22]:
def collect_and_cluster_categorical(df, col, groupsize):
    """
    NOTE: Can also use in Mercedes competition, to cluster certain tests together... Try it out!
    """
    global LABEL
    
    cg_groups = []
    
    cdf = df.groupby(col, as_index=False)[LABEL].median().reset_index().sort_values(LABEL, ascending=False)
        
    for g, _df in cdf.groupby(np.arange(len(cdf)) // groupsize):
        cg_groups.append(_df[col].astype(str).values.tolist())
            
    def resolve_col_group(_col, groups):
        for idx, g in enumerate(groups):
            if _col in g:
                return idx
        return -1
        
    return pd.Series(data=df[col].astype(str).apply(lambda x : resolve_col_group(x, cg_groups)))

### One-hot encoded version

In [23]:
full_df_features_with_macro_onehot = full_df_features_with_macro.copy()

In [24]:
full_df_features_with_macro_onehot['sub_area_index_8'] = collect_and_cluster_categorical(full_df_features_with_macro_onehot, 'sub_area', full_df_features_with_macro_onehot['sub_area'].unique().shape[0] // 8)
full_df_features_with_macro_onehot['sub_area_index_10'] = collect_and_cluster_categorical(full_df_features_with_macro_onehot, 'sub_area', full_df_features_with_macro_onehot['sub_area'].unique().shape[0] // 10)

In [25]:
full_df_features_with_macro_onehot = pd.get_dummies(full_df_features_with_macro_onehot)
full_df_features_with_macro_onehot.shape

(38080, 568)

In [26]:
id_cols = [c for c in full_df_features_with_macro.columns if 'ID_' in c]
special_categoricals = ['state', 'material','sub_area_index_8', 'sub_area_index_10' ]
implicit_categorical_cols = id_cols + special_categoricals

In [27]:
full_df_features_with_macro_onehot = pd.get_dummies(full_df_features_with_macro_onehot, columns=implicit_categorical_cols)

In [28]:
# Train has price_doc but no id
# Test has id, but no price doc
train_df_withmacro_onehot = full_df_features_with_macro_onehot[full_df_features_with_macro_onehot.price_doc.notnull()]
test_df_withmacro_onehot = full_df_features_with_macro_onehot[full_df_features_with_macro_onehot.price_doc.isnull()]
test_df_withmacro_onehot = test_df_withmacro_onehot.drop(['price_doc'], axis=1)
train_df_withmacro_onehot = train_df_withmacro_onehot.drop('id', axis=1)
train_df_withmacro_onehot.shape, test_df_withmacro_onehot.shape

((30418, 1209), (7662, 1209))

### Label-encoded version

In [29]:
from sklearn.preprocessing import LabelEncoder

In [30]:
full_df_features_with_macro_labelecoded = full_df_features_with_macro.copy()

In [31]:
for c in ['big_market_raion',
          'big_road1_1line',
          'culture_objects_top_25',
          'detention_facility_raion',
          'ecology',
          'incineration_raion',
          'nuclear_reactor_raion',
          'oil_chemistry_raion',
          'product_type',
          'radiation_raion',
          'railroad_1line',
          'railroad_terminal_raion',
          'thermal_power_plant_raion',
          'water_1line']:
    full_df_features_with_macro_labelecoded[c] = LabelEncoder().fit_transform(full_df_features_with_macro_labelecoded[c].values)

In [32]:
full_df_features_with_macro_labelecoded['sub_area_index_8'] = collect_and_cluster_categorical(full_df_features_with_macro_labelecoded, 'sub_area', full_df_features_with_macro_labelecoded['sub_area'].unique().shape[0] // 8)
full_df_features_with_macro_labelecoded['sub_area_index_10'] = collect_and_cluster_categorical(full_df_features_with_macro_labelecoded, 'sub_area', full_df_features_with_macro_labelecoded['sub_area'].unique().shape[0] // 10)

In [33]:
full_df_features_with_macro_labelecoded['sub_area'] = LabelEncoder().fit_transform(full_df_features_with_macro_labelecoded['sub_area'].values)

In [34]:
for c in full_df_features_with_macro_labelecoded.select_dtypes(include=[np.bool]).columns.tolist():
    full_df_features_with_macro_labelecoded[c] = full_df_features_with_macro_labelecoded[c].astype(int)

In [35]:
for c in full_df_features_with_macro_labelecoded.select_dtypes(exclude=["number","bool_","object_"]).columns.tolist():
    full_df_features_with_macro_labelecoded[c] = full_df_features_with_macro_labelecoded[c].cat.codes.astype(int)

In [36]:
train_df_with_macro_labelecoded = full_df_features_with_macro_labelecoded[full_df_features_with_macro_labelecoded.price_doc.notnull()]
test_df_with_macro_labelecoded = full_df_features_with_macro_labelecoded[full_df_features_with_macro_labelecoded.price_doc.isnull()]
test_df_with_macro_labelecoded = test_df_with_macro_labelecoded.drop('price_doc', axis=1)
train_df_with_macro_labelecoded = train_df_with_macro_labelecoded.drop('id', axis=1)
train_df_with_macro_labelecoded.shape, test_df_with_macro_labelecoded.shape

((30418, 399), (7662, 399))

## Create hold-out validation set
Last 2 months of training set in 2015.

In [37]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [38]:
def rmsle(y_true, y_preds):
    return sqrt(mean_squared_error(y_pred=np.log1p(y_preds), y_true=np.log1p(y_true)))

In [39]:
def clean_inputs(df, is_train=True, transform_target=True, exclude_cols=['timestamp_year', 'timestamp_month', 'timestamp_dayofyear']):
    if is_train:
        return df.drop(['price_doc'] + exclude_cols, axis=1), np.log1p(df['price_doc'].values)
    else:
        # Testing set
        return df.drop(['id'] + exclude_cols, axis=1)

In [40]:
def split_into_train_and_val(df, last_n_months=2, last_n_weeks=6, use_months=True):
    if use_months:
        max_month_in_2015 = df[df.timestamp_year == 2015 ].timestamp_month.max()
        val_df = df[(df.timestamp_year == 2015) & (df.timestamp_month >= max_month_in_2015 - last_n_months)]
        train_df = df[~df.index.isin(val_df.index)]
        return train_df, val_df
    else:
        max_week_in_2015 = (df[df.timestamp_year == 2015 ].timestamp_dayofyear // 7).max()
        val_df = df[(df.timestamp_year == 2015) & ( (df.timestamp_dayofyear // 7) >= max_week_in_2015 - last_n_weeks)]
        train_df = df[~df.index.isin(val_df.index)]
        return train_df, val_df

In [41]:
def evaluate_model_on_holdout_set(estimator, train, val, transform_target=True, exclude_cols=['timestamp_year', 'timestamp_month', 'timestamp_dayofyear']):
    print 'Fitting model on training set...'
    estimator.fit(*clean_inputs(train, is_train=True, transform_target=transform_target, exclude_cols = exclude_cols))
    val_input, val_output = clean_inputs(val, is_train=True, transform_target=transform_target, exclude_cols = exclude_cols)
    print 'Predicting the hold-out validation set...'
    y_preds = estimator.predict(val_input)
    y_true = val_output
    print 'Scoring model...'
    if not transform_target:
        score = rmsle(y_true, y_preds)
    else:
        score = sqrt(mean_squared_error(y_pred=y_preds, y_true=y_true))
    print 'RMSLE: ', score
    return score

# Model Type 1: Linear models

In [49]:
linear_train, linear_val = split_into_train_and_val(train_df_withmacro_onehot)

In [50]:
linear_train_after_2013, linear_val_after_2013 = split_into_train_and_val(train_df_withmacro_onehot)
linear_train_after_2013 = linear_train_after_2013[linear_train_after_2013.timestamp_year >= 2013]

In [42]:
linear_train_after_2014, linear_val_after_2014 = split_into_train_and_val(train_df_withmacro_onehot)
linear_train_after_2014 = linear_train_after_2014[linear_train_after_2014.timestamp_year >= 2014]

In [43]:
linear_train_after_2015, linear_val_after_2015 = split_into_train_and_val(train_df_withmacro_onehot)
linear_train_after_2015 = linear_train_after_2015[linear_train_after_2015.timestamp_year >= 2015]

In [44]:
linear_train_before_2013, linear_val_before_2013 = split_into_train_and_val(train_df_withmacro_onehot)
linear_train_before_2013 = linear_train_before_2013[linear_train_before_2013.timestamp_year < 2013]

## Linear

In [88]:
from sklearn.linear_model import LinearRegression

In [89]:
linear_reg = LinearRegression(normalize=True)

In [90]:
linear_reg.fit(*clean_inputs(train_df_withmacro_onehot))
kaggle_linearreg_preds = np.expm1(linear_reg.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_linearreg_preds

  


array([  4.92658224e+06,   8.21830777e+06,   4.77500720e+06, ...,
        -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00])

In [91]:
linear_reg.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year < 2013]))
kaggle_linearreg_lt_2013_preds = np.expm1(linear_reg.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_linearreg_lt_2013_preds

  


array([ inf,  inf,  inf, ...,  inf,  inf,  inf])

In [92]:
linear_reg.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year >= 2013]))
kaggle_linenarreg_gt_2013_preds = np.expm1(linear_reg.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_linenarreg_gt_2013_preds

  


array([-1., -1.,  0., ..., -1., -1., -1.])

In [94]:
kaggle_preds_df['kaggle_linearreg_preds'] = kaggle_linearreg_preds

In [51]:
evaluate_model_on_holdout_set(linear_reg, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  4664377886.03


4664377886.027782

In [52]:
evaluate_model_on_holdout_set(linear_reg, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  95362977.9057


95362977.90569763

In [54]:
evaluate_model_on_holdout_set(linear_reg, linear_train_after_2014, linear_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  181923163.592


181923163.59201717

In [134]:
evaluate_model_on_holdout_set(linear_reg, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  5.40952448196e+13


54095244819573.61

## Lasso

In [95]:
from sklearn.linear_model import Lasso

In [96]:
lasso_reg = Lasso(normalize=True, random_state=seed)

In [97]:
lasso_reg.fit(*clean_inputs(train_df_withmacro_onehot))
kaggle_lasso_preds = np.expm1(lasso_reg.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_lasso_preds

array([ 6013401.97824096,  6013401.97824096,  6013401.97824096, ...,
        6013401.97824096,  6013401.97824096,  6013401.97824096])

In [98]:
lasso_reg.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year < 2013]))
kaggle_lasso_lt_2013_preds = np.expm1(lasso_reg.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_lasso_lt_2013_preds

array([ 5196427.85649383,  5196427.85649383,  5196427.85649383, ...,
        5196427.85649383,  5196427.85649383,  5196427.85649383])

In [59]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year >= 2013]))
kaggle_ridge_gt_2013_preds = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_ridge_gt_2013_preds

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.41961161477e-19


array([ 4052131.32199135,  8042745.99926154,  4149170.80608331, ...,
        1407854.17254524,  1486680.00736567,  2643018.00315866])

In [57]:
evaluate_model_on_holdout_set(lasso_reg, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.589788905493


0.5897889054928692

In [58]:
evaluate_model_on_holdout_set(lasso_reg, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.58271082583


0.5827108258296179

In [59]:
evaluate_model_on_holdout_set(lasso_reg, linear_train_after_2014, linear_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.577253324596


0.577253324595594

In [60]:
evaluate_model_on_holdout_set(lasso_reg, linear_train_after_2015, linear_val_after_2015)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.573223247528


0.5732232475284266

In [139]:
evaluate_model_on_holdout_set(lasso_reg, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.637072653479


0.6370726534786059

## BayesianRidge

In [42]:
from sklearn.linear_model import BayesianRidge

In [46]:
linear_model = BayesianRidge(normalize=True)

In [44]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot))
kaggle_bayesia_ridge_all = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_bayesia_ridge_all

array([ 5072930.70733216,  8455056.98343063,  5342883.61615634, ...,
        4352316.09653251,  5061583.8424752 ,  8583082.56347125])

In [47]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year < 2013]))
kaggle_bayesia_ridge_lt_2013 = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_bayesia_ridge_lt_2013

array([ 5073646.86511655,  5347877.34348044,  5984324.79417417, ...,
        3853669.69833371,  5946474.85490824,  7283669.8976881 ])

In [48]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year >= 2013]))
kaggle_bayesia_ridge_gt_2013 = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_bayesia_ridge_gt_2013

array([ 4896599.78832916,  8886107.05283952,  5040037.03599187, ...,
        4249929.40380958,  4892064.77281312,  8367490.90122886])

In [63]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.407259145589


0.40725914558891596

In [64]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.389724269438


0.38972426943819344

In [65]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2014, linear_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.389942275359


0.3899422753591761

In [66]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2015, linear_val_after_2015)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  7.58975879887


7.589758798867985

In [67]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.486200072658


0.4862000726580878

## ElasticNet

In [99]:
from sklearn.linear_model import ElasticNet

In [100]:
linear_model = ElasticNet(normalize=True, random_state=seed)

In [101]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot))
kaggle_elasticnet_preds = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_elasticnet_preds

array([ 6013401.97824096,  6013401.97824096,  6013401.97824096, ...,
        6013401.97824096,  6013401.97824096,  6013401.97824096])

In [58]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year < 2013]))
kaggle_ridge_lt_2013_preds = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_ridge_lt_2013_preds

array([ 2666333.18500723,  1856988.92364304,  2840593.6201275 , ...,
          10339.72061306,    16429.11509716,    19049.81982119])

In [59]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year >= 2013]))
kaggle_ridge_gt_2013_preds = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_ridge_gt_2013_preds

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.41961161477e-19


array([ 4052131.32199135,  8042745.99926154,  4149170.80608331, ...,
        1407854.17254524,  1486680.00736567,  2643018.00315866])

In [70]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.589788905493


0.5897889054928692

In [71]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.58271082583


0.5827108258296179

In [72]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2014, linear_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.577253324596


0.577253324595594

In [73]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2015, linear_val_after_2015)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.573223247528


0.5732232475284266

In [74]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.637072653479


0.6370726534786059

## HuberRegressor

In [63]:
from sklearn.linear_model import HuberRegressor

In [64]:
linear_model = HuberRegressor()

In [65]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  2.85960401626


2.859604016262771

In [66]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.64817357584


0.6481735758397739

In [67]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  4.37539452953


4.375394529531269

## Lars

In [68]:
from sklearn.linear_model import Lars

In [217]:
linear_model = Lars(normalize=True)

In [218]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...




Predicting the hold-out validation set...
Scoring model...
RMSLE:  81.8215923492


81.82159234924114

In [219]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...




Predicting the hold-out validation set...
Scoring model...
RMSLE:  6.9164112614


6.916411261397162

In [220]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...




Predicting the hold-out validation set...
Scoring model...
RMSLE:  25365318.5468


25365318.546809282

## LassoLars

In [69]:
from sklearn.linear_model import LassoLars

In [70]:
linear_model = LassoLars(normalize=True)

In [71]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.589788905493


0.5897889054928692

In [72]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.58271082583


0.5827108258296179

In [73]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.637072653479


0.6370726534786059

## PassiveAggressiveRegressor

In [226]:
from sklearn.linear_model import PassiveAggressiveRegressor

In [227]:
linear_model = PassiveAggressiveRegressor(random_state=seed)

In [228]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  1.0067895609


1.0067895608998327

In [229]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  2.74111635008


2.7411163500765126

In [230]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  9.82195853725


9.821958537246783

## RANSACRegressor

In [74]:
from sklearn.linear_model import RANSACRegressor

In [75]:
linear_model = RANSACRegressor(random_state=seed)

In [233]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  1.08517298745


1.085172987446868

In [234]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  10.9210214058


10.921021405751018

In [235]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  150.972497865


150.97249786461978

## SGDRegressor

In [236]:
from sklearn.linear_model import SGDRegressor

In [237]:
linear_model = SGDRegressor()

In [238]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  2.13617116673e+23


2.136171166728495e+23

In [239]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  3.54946788431e+23


3.5494678843092246e+23

In [240]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  4.03260923277e+23


4.03260923276632e+23

## Ridge

In [50]:
from sklearn.linear_model import Ridge

In [51]:
linear_model = Ridge()

In [54]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot))
kaggle_ridge_preds = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_ridge_preds

array([ 4842170.677799  ,  8178797.90242163,  4789029.95825509, ...,
        3293022.17695443,  3484393.99417835,  6124221.85978975])

In [58]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year < 2013]))
kaggle_ridge_lt_2013_preds = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_ridge_lt_2013_preds

array([ 2666333.18500723,  1856988.92364304,  2840593.6201275 , ...,
          10339.72061306,    16429.11509716,    19049.81982119])

In [59]:
linear_model.fit(*clean_inputs(train_df_withmacro_onehot[train_df_withmacro_onehot.timestamp_year >= 2013]))
kaggle_ridge_gt_2013_preds = np.expm1(linear_model.predict(clean_inputs(test_df_withmacro_onehot, is_train=False)))
kaggle_ridge_gt_2013_preds

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.41961161477e-19


array([ 4052131.32199135,  8042745.99926154,  4149170.80608331, ...,
        1407854.17254524,  1486680.00736567,  2643018.00315866])

In [78]:
evaluate_model_on_holdout_set(linear_model, linear_train, linear_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.597056509628


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 1.5962897319e-19


0.5970565096277155

In [79]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2013, linear_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.407420487756


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 4.41029880506e-19


0.4074204877564264

In [80]:
evaluate_model_on_holdout_set(linear_model, linear_train_after_2014, linear_val_after_2014)
evaluate_model_on_holdout_set(linear_model, linear_train_after_2015, linear_val_after_2015)

Fitting model on training set...


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 7.57625770681e-19


Predicting the hold-out validation set...
Scoring model...
RMSLE:  2.08081613636
Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.524015603996


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 4.14172115989e-18


0.524015603995574

In [81]:
evaluate_model_on_holdout_set(linear_model, linear_train_before_2013, linear_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.642493114338


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 6.57215497714e-19


0.642493114338294

# Model Type 2: Tree Models

In [60]:
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor

In [61]:
tree_train, tree_val = split_into_train_and_val(train_df_with_macro_labelecoded)

In [62]:
tree_train_after_2013, tree_val_after_2013 = split_into_train_and_val(train_df_with_macro_labelecoded)
tree_train_after_2013 = tree_train_after_2013[tree_train_after_2013.timestamp_year >= 2013]

In [63]:
tree_train_after_2014, tree_val_after_2014 = split_into_train_and_val(train_df_with_macro_labelecoded)
tree_train_after_2014 = tree_train_after_2014[tree_train_after_2014.timestamp_year >= 2014]

In [64]:
tree_train_after_2015, tree_val_after_2015 = split_into_train_and_val(train_df_with_macro_labelecoded)
tree_train_after_2015 = tree_train_after_2015[tree_train_after_2015.timestamp_year >= 2015]

In [65]:
tree_train_before_2013, tree_val_before_2013 = split_into_train_and_val(train_df_with_macro_labelecoded)
tree_train_before_2013 = tree_train_before_2013[tree_train_before_2013.timestamp_year < 2013]

### Decision Tree Regressor

In [181]:
tree_model = DecisionTreeRegressor(random_state=seed, max_depth=5)

In [182]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.413698821596


0.4136988215958497

In [183]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.406549361789


0.40654936178851203

In [184]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.549659588745


0.5496595887451315

### Extra Trees Regressor

In [94]:
tree_model = ExtraTreeRegressor(random_state=seed, max_depth=7)

In [95]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.425424969122


0.4254249691222213

In [96]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.405009379885


0.4050093798852482

In [97]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2014, tree_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.410980573481


0.41098057348081163

In [98]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2015, tree_val_after_2015)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.460805136397


0.4608051363974241

In [99]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.73276169334


0.7327616933399277

### RF Regressor (+ TSFRESH, bagging)

In [100]:
tree_model = RandomForestRegressor(random_state=seed)

In [101]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.423356369114


0.42335636911406616

In [102]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.424404192268


0.42440419226755993

In [103]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2014, tree_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.406819086963


0.40681908696264024

In [104]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2015, tree_val_after_2015)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.413808415042


0.41380841504150145

In [121]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.782869061288


0.7828690612877902

#### Bagging

In [66]:
tree_model = BaggingRegressor(RandomForestRegressor(random_state=seed),random_state=seed, n_jobs=-1)

In [67]:
tree_model.fit(*clean_inputs(train_df_with_macro_labelecoded))

BaggingRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=2018,
           verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=-1, oob_score=False,
         random_state=2018, verbose=0, warm_start=False)

In [68]:
kaggle_preds_bagging_et_only = np.expm1(tree_model.predict(clean_inputs(test_df_with_macro_labelecoded, is_train=False)))
kaggle_preds_bagging_et_only

array([ 5530148.2031598 ,  8535245.46775945,  6080294.03709945, ...,
        4217339.13400513,  4811837.3970377 ,  6939269.36360733])

In [78]:
((kaggle_preds_bagging_et_only + kaggle_ridge_preds)/2.0)[:10]

array([ 5186159.4404794 ,  8357021.68509054,  5434661.99767727,
        6777532.50849252,  5139355.33530886,  9729432.03199963,
        4746683.09396864,  4079728.12685391,  4564851.5747225 ,
        5250551.50597019])

In [79]:
kaggle_preds_df = test_df_with_macro_labelecoded.copy()

In [80]:
kaggle_preds_df['kaggle_bayesia_ridge_all'] = kaggle_bayesia_ridge_all
kaggle_preds_df['kaggle_bayesia_ridge_gt_2013'] = kaggle_bayesia_ridge_gt_2013
kaggle_preds_df['kaggle_bayesia_ridge_lt_2013'] = kaggle_bayesia_ridge_lt_2013
kaggle_preds_df['kaggle_ridge_gt_2013_preds'] = kaggle_ridge_gt_2013_preds
kaggle_preds_df['kaggle_ridge_lt_2013_preds'] = kaggle_ridge_lt_2013_preds
kaggle_preds_df['kaggle_ridge_preds'] = kaggle_ridge_preds
kaggle_preds_df['kaggle_preds_bagging_et_only'] = kaggle_preds_bagging_et_only

In [84]:
kaggle_preds_df[['id'] + [c for c in kaggle_preds_df.columns if 'kaggle_' in c]].to_csv("kaggle_submission.csv", index=False)

In [106]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.395289145804


0.3952891458044496

In [107]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2015, tree_val_after_2015)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.397087136018


0.3970871360175275

In [108]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2014, tree_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.385612991503


0.3856129915025121

In [109]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.395517821881


0.39551782188098333

In [121]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.782869061288


0.7828690612877902

### GBM Regressor

In [111]:
tree_model = GradientBoostingRegressor(random_state=seed)

In [108]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.374294998657


0.37429499865703336

In [109]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.371540085863


0.3715400858631102

In [110]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2014, tree_val_after_2014)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.372608789312


0.3726087893123304

In [105]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2015, tree_val_after_2015)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.41449796736


0.41449796736021355

In [106]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.505132166054


0.5051321660537982

In [112]:
tree_model.fit(*clean_inputs(train_df_with_macro_labelecoded))
kaggle_gbm_all_preds = np.expm1(tree_model.predict(clean_inputs(test_df_with_macro_labelecoded, is_train=False)))
kaggle_gbm_all_preds

array([ 5506914.76210193,  8301425.87197396,  5022964.86436411, ...,
        4640324.66079217,  5438347.01933234,  8829637.0897267 ])

In [113]:
tree_model.fit(*clean_inputs(train_df_with_macro_labelecoded[train_df_with_macro_labelecoded.timestamp_year < 2013]))
kaggle_gbm_lt_2013_preds = np.expm1(tree_model.predict(clean_inputs(test_df_with_macro_labelecoded, is_train=False)))
kaggle_gbm_lt_2013_preds

array([ 4383420.51604563,  5616894.82528458,  4648018.56427856, ...,
        3556516.30492439,  4973710.17159678,  7846434.08623438])

In [114]:
tree_model.fit(*clean_inputs(train_df_with_macro_labelecoded[train_df_with_macro_labelecoded.timestamp_year >= 2013]))
kaggle_gbm_gt_2013_preds = np.expm1(tree_model.predict(clean_inputs(test_df_with_macro_labelecoded, is_train=False)))
kaggle_gbm_gt_2013_preds

array([ 5622788.73336377,  8318024.0445459 ,  5168531.67533866, ...,
        4441114.89722342,  5561661.03329708,  8939657.07496249])

In [115]:
kaggle_preds_df['kaggle_gbm_all_preds'] = kaggle_gbm_all_preds
kaggle_preds_df['kaggle_gbm_lt_2013_preds'] = kaggle_gbm_lt_2013_preds
kaggle_preds_df['kaggle_gbm_gt_2013_preds'] = kaggle_gbm_gt_2013_preds

In [118]:
kaggle_preds_df[['id'] + [c for c in kaggle_preds_df.columns if 'kaggle_' in c]].to_csv("kaggle_submission.csv", index=False)

#### Bagging GBM

In [116]:
tree_model = BaggingRegressor(GradientBoostingRegressor(random_state=seed),random_state=seed, n_jobs=-1)

In [117]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...
Predicting the hold-out validation set...
Scoring model...
RMSLE:  0.377865335333


0.37786533533285394

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2014, tree_val_after_2014)

In [119]:
tree_model.fit(*clean_inputs(train_df_with_macro_labelecoded))
kaggle_bagginng_gbm_all_preds = np.expm1(tree_model.predict(clean_inputs(test_df_with_macro_labelecoded, is_train=False)))
kaggle_bagginng_gbm_all_preds

array([ 5372726.72640079,  8258075.02676723,  5259091.04532245, ...,
        4376741.80753391,  5509944.31250031,  8769480.11869137])

In [120]:
tree_model.fit(*clean_inputs(train_df_with_macro_labelecoded[train_df_with_macro_labelecoded.timestamp_year < 2013]))
kaggle_bagginng_gbm_lt_2013_preds = np.expm1(tree_model.predict(clean_inputs(test_df_with_macro_labelecoded, is_train=False)))
kaggle_bagginng_gbm_lt_2013_preds

array([ 3613600.74665527,  4531660.40525576,  4006445.04311687, ...,
        2625868.69293379,  4366591.94700641,  6256571.80922525])

In [121]:
tree_model.fit(*clean_inputs(train_df_with_macro_labelecoded[train_df_with_macro_labelecoded.timestamp_year >= 2013]))
kaggle_bagginng_gbm_gt_2013_preds = np.expm1(tree_model.predict(clean_inputs(test_df_with_macro_labelecoded, is_train=False)))
kaggle_bagginng_gbm_gt_2013_preds

array([ 5494072.88691165,  8172539.84617735,  5168034.60786757, ...,
        4491155.67087552,  5524410.10157204,  9082169.89383017])

In [122]:
kaggle_preds_df['kaggle_bagginng_gbm_all_preds'] = kaggle_bagginng_gbm_all_preds
kaggle_preds_df['kaggle_bagginng_gbm_lt_2013_preds'] = kaggle_bagginng_gbm_lt_2013_preds
kaggle_preds_df['kaggle_bagginng_gbm_gt_2013_preds'] = kaggle_bagginng_gbm_gt_2013_preds

In [123]:
kaggle_preds_df[['id'] + [c for c in kaggle_preds_df.columns if 'kaggle_' in c]].to_csv("kaggle_submission.csv", index=False)

### Adaboost Regressor

In [262]:
tree_model = RandomForestRegressor(random_state=seed)

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...


In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)

### XGB Regressor

In [109]:
import xgboost as xgb



OSError: /usr/local/lib/python2.7/dist-packages/xgboost/./lib/libxgboost.so: invalid ELF header

In [125]:
! sudo gsutil cp kaggle_submission.csv gs://pencil-app/kaggle_sberbank/data_newest

Copying file://kaggle_submission.csv [Content-Type=text/csv]...
/ [1 files][  1.4 MiB/  1.4 MiB]                                                
Operation completed over 1 objects/1.4 MiB.                                      


In [262]:
tree_model = RandomForestRegressor(random_state=seed)

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...


In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)

### Bagging regressors
- Random Forests
- ET
- Linear model types
- DTs
- XGBoostRegressor

In [262]:
tree_model = RandomForestRegressor(random_state=seed)

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train, tree_val)

Fitting model on training set...


In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_after_2013, tree_val_after_2013)

In [None]:
evaluate_model_on_holdout_set(tree_model, tree_train_before_2013, tree_val_before_2013)