In [26]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

In [27]:
# Read the preprocessed training table, then separate the 'SalePrice' target
# from the remaining feature columns.
df = pd.read_csv('data/train_processed.csv')
y_train = df['SalePrice']
X_train = df.drop(columns=['SalePrice'])

# Preprocessed test features (this file has no target column to split off here).
X_test = pd.read_csv('data/test-processed.csv')

In [28]:
# Candidate hyper-parameter values sampled by the randomized search.
# 3 * 4 * 3 * 3 * 2 = 216 distinct combinations in total.
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [29]:
# Base estimator; the fixed seed keeps tree construction repeatable between runs.
forest_reg = RandomForestRegressor(random_state=42)

# Randomized hyper-parameter search: 100 sampled candidates x 5 folds = 500 fits,
# ranked by negated MSE. fit() returns the fitted search object, so it is chained
# directly onto the constructor.
random_search = RandomizedSearchCV(
    forest_reg,
    param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2,
).fit(X_train, y_train)

# Estimator refitted with the best parameter combination found by the search.
best_model = random_search.best_estimator_

# Re-score the selected model with 10-fold CV, then turn each fold's negated MSE
# into an RMSE so `scores` is in the same units as the target.
neg_mse_per_fold = cross_val_score(
    best_model,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)
scores = np.sqrt(-neg_mse_per_fold)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [30]:

# Fill gaps in the test features by carrying the previous row's value forward.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument pandas
# has deprecated; rebinding the name (instead of inplace=True) still leaves X_test
# filled for any later cell.
# NOTE(review): forward-fill cannot fill a NaN in the first row — confirm none remain.
X_test = X_test.ffill()

# Predict with the selected model and write an Id-indexed CSV of the results.
# NOTE(review): X_test, including its 'Id' column, is passed to predict() as-is —
# confirm the model was fitted on the same set of columns.
predictions = pd.DataFrame(
    {'Id': X_test['Id'], 'SalePrice': best_model.predict(X_test)}
).set_index('Id')
predictions.to_csv('data/predictions/random-forest.csv')

In [31]:
# Average of the per-fold values held in `scores`.
np.mean(scores)

29062.550732839365