In [2]:
from sklearn import datasets

housing = datasets.fetch_california_housing()

print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [4]:
x = housing.data
x

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]], shape=(20640, 8))

In [6]:
y = housing.target
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,))

In [7]:
print(housing.feature_names)
print(x[0])
print(y[0])

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
4.526


In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=132)

In [10]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train, y_train)

In [11]:
y_pred = model.predict(x_test)
y_pred

array([2.77296098, 1.79309491, 2.93956271, ..., 1.748908  , 3.15014988,
       2.19999767], shape=(4128,))

In [12]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
r2

0.6206655206340936

In [14]:
from sklearn.preprocessing import PolynomialFeatures

housing = datasets.fetch_california_housing()

x = housing.data
y = housing.target

print(x.shape)
poly = PolynomialFeatures()
x = poly.fit_transform(x)
print(x.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=132)

model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)

print(r2)

(20640, 8)
(20640, 45)
0.05041774813792632


In [17]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor
)

housing = datasets.fetch_california_housing()

x = housing.data
y = housing.target

print(x.shape)
poly = PolynomialFeatures()
x = poly.fit_transform(x)
print(x.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=132)

LR = LinearRegression()
GBR = GradientBoostingRegressor()
HGBR = HistGradientBoostingRegressor()
RFR = RandomForestRegressor(n_jobs=-1)

for model in [LR, HGBR, GBR, RFR]:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    r2 = r2_score(y_test, y_pred)

    print(model, r2)

(20640, 8)
(20640, 45)
LinearRegression() 0.05041774813792632
HistGradientBoostingRegressor() 0.8421075207106447
GradientBoostingRegressor() 0.7996692336629234
RandomForestRegressor(n_jobs=-1) 0.8110615895772423


In [19]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor
)

housing = datasets.fetch_california_housing()

x = housing.data
y = housing.target

print(x.shape)
poly = PolynomialFeatures()
x = poly.fit_transform(x)
print(x.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=132)

for i in [100, 150, 200, 250, 300, 350]:
    model = HistGradientBoostingRegressor(
        max_iter=i
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    r2 = r2_score(y_test, y_pred)

    print(i, r2)

(20640, 8)
(20640, 45)
100 0.8398098821950215
150 0.8433832913948086
200 0.8454495378683611
250 0.8464113088303457
300 0.8453680232955336
350 0.8508346252105137


In [20]:
for j in [0.1, 0.05, 0.001]:
    for i in [100, 150, 200, 250, 300, 350]:
        model = HistGradientBoostingRegressor(
            max_iter=i,
            learning_rate=j
        )
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        r2 = r2_score(y_test, y_pred)
    
        print(j, i, r2)

0.1 100 0.8425832507213322
0.1 150 0.8440693552122956
0.1 200 0.8494476241456695
0.1 250 0.8422960743677793
0.1 300 0.8499721520437822
0.1 350 0.8432017820467544
0.05 100 0.8291932442519575
0.05 150 0.8385208413782308
0.05 200 0.841770853790527
0.05 250 0.8455254734511402
0.05 300 0.8458576920173548
0.05 350 0.8421188936298266
0.001 100 0.12012228135929282
0.001 150 0.17205655096204053
0.001 200 0.22165006838102674
0.001 250 0.2641818227066177
0.001 300 0.30604949332594167
0.001 350 0.34267619952562167


In [21]:
import joblib

model = HistGradientBoostingRegressor(
    max_iter=300,
    learning_rate=0.1
)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)

joblib.dump(model, "my_model.joblib")

print(r2)

0.8456844587181626


In [23]:
local_model = joblib.load("my_model.joblib")

ypred = local_model.predict(x_test)
r2 = r2_score(y_test, ypred)

print(r2)

0.8456844587181626
