In [7]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [8]:
# Load the housing CSV (resolved relative to the kernel's working directory)
# into a DataFrame.
data1 = pd.read_csv(filepath_or_buffer='California_Housing_Data_Part1.csv')

In [9]:
# Train-test split
# Separate the predictors from the target column "median_house_value".
data1_indep = data1.drop(columns="median_house_value")
data1_dep = data1["median_house_value"].copy()

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same rows land in train/test on every Restart & Run All; without it every
# downstream score changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data1_indep,
    data1_dep,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [10]:
# Scale y_train and y_test
# Standardize the target with statistics learned from the training rows only;
# the test target is transformed with those same statistics (no refit).
#
# The raw target values are looked up in data1_dep through the row labels that
# the split left on X_train / X_test, instead of being read from y_train /
# y_test. This cell rebinds y_train / y_test to 2-D NumPy arrays, so reading
# from them made a second run of the cell fail (an ndarray has no `.values`).
# NOTE(review): assumes data1's index labels are unique, which holds for the
# default index produced by pd.read_csv -- confirm if the index is ever changed.
scaler = StandardScaler()
y_train = scaler.fit_transform(
    data1_dep.loc[X_train.index].to_numpy().reshape(-1, 1)
)
y_test = scaler.transform(
    data1_dep.loc[X_test.index].to_numpy().reshape(-1, 1)
)

In [11]:
# Create a pipeline that standardizes the features, then fits a Random Forest
# Regressor. random_state fixes the forest's internal randomness so repeated
# fits on the same data give the same model and the same scores.
pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('rf_reg', RandomForestRegressor(random_state=42))
])

In [13]:
# Use GridSearchCV to find the best hyperparameters

# Hyperparameters to search. The 'rf_reg__' prefix routes each setting to the
# 'rf_reg' step of the pipeline; 3 x 4 = 12 combinations are evaluated.
param_grid = [
    {'rf_reg__n_estimators': [3, 10, 30], 'rf_reg__max_features': [2, 4, 6, 8]}
]

# 10-fold cross-validated grid search. The scorer is the *negated* MSE, so a
# larger (closer to zero) score is better.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# Fit to the training data; ravel() flattens the (n, 1) scaled target to 1-D.
grid_search.fit(X_train, y_train.ravel())

# Report the winning combination. best_score_ is negative MSE, so it is negated
# before taking the square root. Because the target was standardized earlier,
# this RMSE is expressed in standard deviations of the training target, not in
# the target's original units.
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
print("Best RMSE:", np.sqrt(-grid_search.best_score_))

Best hyperparameters: {'rf_reg__max_features': 2, 'rf_reg__n_estimators': 30}
Best score: -0.33690315042925
Best RMSE: 0.5804335883020986
