In [None]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

We train two regression algorithms on the California housing dataset. The goal is to predict the median house price of a California district from its features; in this notebook only two feature columns are used.

In [10]:
# Load the raw table and convert it to a plain ndarray so columns can be
# selected by position.
train = pd.read_csv("california_housing.csv")
a = train.to_numpy()

# Predictors are the columns at positions 0 and 5; the target is position 6.
# NOTE(review): the column names are not visible in this cell — confirm the
# positions against the CSV header before changing the feature set.
X, y = a[:, [0, 5]], a[:, 6]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.3)

# Standardize the features. The scaler is fitted on the training rows only and
# then re-used, unchanged, on the test rows.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Standardize the target the same way. The scaler expects a 2-D column, so the
# 1-D target is reshaped going in and flattened back to 1-D coming out.
sc_y = StandardScaler()
y_train_std = sc_y.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_std = sc_y.transform(y_test.reshape(-1, 1)).flatten()

print(X)
print(y)

[[41.      8.3252]
 [21.      8.3014]
 [52.      7.2574]
 ...
 [17.      1.7   ]
 [18.      1.8672]
 [16.      2.3886]]
[452600. 358500. 352100. ...  92300.  84700.  89400.]


Training sklearn's LinearRegression on the California housing dataset

In [15]:
# Fit ordinary least squares on the standardized training split
# (fit() returns the estimator itself, so the call can be chained).
est = LinearRegression().fit(X_train_std, y_train_std)

# Predict both splits; predictions are in standardized target units.
y_train_pred, y_test_pred = (
    est.predict(features) for features in (X_train_std, X_test_std))

# Score each split against its standardized ground truth.
mse_train = mean_squared_error(y_train_std, y_train_pred)
mse_test = mean_squared_error(y_test_std, y_test_pred)
r2_train = r2_score(y_train_std, y_train_pred)
r2_test = r2_score(y_test_std, y_test_pred)

print('MSE train: %.3f, test: %.3f' % (mse_train, mse_test))
print('R^2 train: %.3f, test: %.3f' % (r2_train, r2_test))

MSE train: 0.487, test: 0.491
R^2 train: 0.513, test: 0.500


Training sklearn's RandomForestRegressor on the California housing dataset

In [14]:
# Fit a 1000-tree random forest on the standardized training split.
#
# `criterion` is deliberately left at its default (squared error). The original
# cell passed criterion='mse', a spelling that was deprecated in scikit-learn 1.0
# and removed in 1.2, so it raises on current releases. The default selects the
# same loss on both old and new versions, so the results are unchanged.
est = RandomForestRegressor(n_estimators=1000,
                            random_state=1,  # fixed seed so the forest is repeatable
                            n_jobs=-1)       # train trees on all available cores
est.fit(X_train_std, y_train_std)

# Predict both splits; predictions are in standardized target units.
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

# Report error and explained variance per split. A large train/test gap here
# is the signal to look for when judging overfitting.
print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train_std, y_train_pred),
        mean_squared_error(y_test_std, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train_std, y_train_pred),
        r2_score(y_test_std, y_test_pred)))

MSE train: 0.085, test: 0.556
R^2 train: 0.915, test: 0.434


Neither algorithm is very good at predicting housing prices from the given features. Linear regression reached an R^2 of only about 0.513 on the training set (0.500 on the test set), while the random forest regressor reached about 0.915 on the training set. However, the RandomForestRegressor reached only about 0.434 on the test set, which indicates that it overfit the training data.