In [11]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor

In [12]:
# Training table: every column except the target ('SalePrice') and the row
# identifier ('Id') is used as a feature.
df = pd.read_csv('data/train_processed.csv')
y_train = df['SalePrice']
X_train = df.drop(columns=['SalePrice', 'Id'])

# Test table is kept whole here; its 'Id' column is needed later to label the
# predictions and is dropped only at prediction time.
# NOTE(review): this file name uses a hyphen while the training file uses an
# underscore -- confirm both match the files on disk.
X_test = pd.read_csv('data/test-processed.csv')

In [13]:
# Fixed seed so that Restart & Run All reproduces the same sampled candidates,
# the same fitted trees and therefore the same selected model.
RANDOM_STATE = 42

# Base estimator; the tree itself is stochastic (e.g. splitter='random'), so it
# is seeded as well as the search.
tree_reg = DecisionTreeRegressor(random_state=RANDOM_STATE)

# Search space sampled by RandomizedSearchCV. Lists are sampled uniformly;
# scipy's randint(low, high) draws integers from [low, high), i.e. the upper
# bound is excluded.
# NOTE(review): the 'poisson' criterion requires non-negative targets --
# confirm SalePrice satisfies this.
params = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': randint(2, 100),
    'min_samples_split': randint(2, 10)
}

# 50 random candidates, each evaluated with 5-fold cross-validation and ranked
# by negated MSE (scikit-learn maximises scores, hence the negation).
random_search = RandomizedSearchCV(
    estimator=tree_reg,
    param_distributions=params,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=RANDOM_STATE
)
random_search.fit(X_train, y_train)

In [14]:
# Estimator carrying the best-scoring parameter combination found by the
# search; `refit` is left at its default above, so scikit-learn refits it on
# the data passed to `fit`.
best_model = random_search.best_estimator_

In [15]:
# 10-fold cross-validation of the selected model. The scorer returns negated
# MSE per fold, so flip the sign and take the square root to get per-fold RMSE.
# Caveat: the same training data was used to choose the hyper-parameters, so
# this estimate is likely optimistic.
neg_mse_per_fold = cross_val_score(
    best_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
scores = np.sqrt(-neg_mse_per_fold)

# Mean RMSE first, then the individual fold values as the cell's output.
print(scores.mean())
scores

38356.76525937781


array([32490.83870353, 38597.72407942, 34055.16686489, 45652.84252715,
       37344.12769279, 33572.18326521, 35892.9997251 , 34667.25660491,
       57674.49221621, 33620.02091457])

In [16]:
# Predict on the test features ('Id' is an identifier, not a feature) and
# build an Id-indexed table with a single 'SalePrice' column.
test_features = X_test.drop(columns=['Id'])
predictions = pd.DataFrame(
    {'Id': X_test['Id'], 'SalePrice': best_model.predict(test_features)}
).set_index('Id')

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails only after the long search.
output_path = Path('data/predictions/decision-tree.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(output_path)