# A multivariate regression problem

In [1]:
import pandas
import numpy as np
import sklearn.linear_model as lm
from sklearn.model_selection import KFold
from sklearn import preprocessing as pre
import random
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

In [2]:
housing = pandas.read_csv('data/housing.csv')
housing['1h_ocean'] = [1 if i=='<1H OCEAN' else 0 for i in housing.ocean_proximity.values]
housing['island'] = [1 if i=='ISLAND' else 0 for i in housing.ocean_proximity.values]
housing['inland'] = [1 if i=='INLAND' else 0 for i in housing.ocean_proximity.values]
housing['near_ocean'] = [1 if i=='NEAR OCEAN' else 0 for i in housing.ocean_proximity.values]
housing['near_bay'] = [1 if i=='NEAR BAY' else 0 for i in housing.ocean_proximity.values]
housing.drop(columns=['ocean_proximity'], inplace=True)
notna = housing.total_bedrooms.notna()
model = lm.LinearRegression()
model.fit(housing.total_rooms.values[notna].reshape(-1,1), housing.total_bedrooms.values[notna].reshape(-1,1))
model.score(housing.total_rooms.values[notna].reshape(-1,1), housing.total_bedrooms.values[notna].reshape(-1,1))
isna = housing.total_bedrooms.isna()
missing_bedrooms = model.predict(housing.total_rooms.values[isna].reshape(-1,1))
housing.total_bedrooms.loc[isna] = np.squeeze(missing_bedrooms)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
display(housing)

In [3]:
tmp_housing = housing
housing = tmp_housing.copy()

In [4]:
def get_bucket_boundaries(feature_values, num_buckets):
    boundaries = np.arange(0, num_buckets + 1) / num_buckets
    quantiles = feature_values.quantile(boundaries)
    quantiles = [quantiles[q] for q in quantiles.keys()]
    return zip(quantiles[:-1], quantiles[1:])

def bucketize(data, columns):
    for column, buckets in columns:
        if data[column].dtypes == "float64":
            ranges = get_bucket_boundaries(data[column], buckets)
            for r in ranges:
                data[f"{column}%f_to_%f" % r] = data[column].apply(
                    lambda l: 1.0 if l >= r[0] and l < r[1] else 0.0)
        data.drop(columns=[column], inplace=True)

def bucketize_keep(data, columns):
    for column, buckets in columns:
        if data[column].dtypes == "float64":
            ranges = get_bucket_boundaries(data[column], buckets)
            for r in ranges:
                data[f"{column}%f_to_%f" % r] = data[column].apply(
                    lambda l: 1.0 if l >= r[0] and l < r[1] else 0.0)

In [5]:
def preprocess__and_fit(num_buckets, keep=False):
    housing = tmp_housing.copy()

    columns = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

    buckets = np.empty(len(columns))
    buckets.fill(num_buckets)

    if keep:
        bucketize_keep(housing, zip(columns, buckets))
    else:
        bucketize(housing, zip(columns, buckets))
    
    # First, extract the data into arrays
    y = housing.median_house_value.values.reshape(-1,1)
    X = housing.drop(columns=['median_house_value'], inplace=False).values
    # Pull out 1000 values into a holdout set
    holdout = random.sample(range(0,10640),1000)
    X_holdout = X[holdout]
    y_holdout = y[holdout]
    Xt = np.delete(X, holdout, 0)
    yt = np.delete(y, holdout, 0)

    Model = lm.LinearRegression()
    n_splits=10
    kf = KFold(n_splits=n_splits, shuffle=True)
    train_err = 0
    test_err = 0
    test_err_deneg = 0
    non_neg_count = 0
    for train_index, test_index in kf.split(Xt):
        X_train, X_test = Xt[train_index], Xt[test_index]
        y_train, y_test = yt[train_index], yt[test_index]
        Model.fit(X_train, y_train)
        train_err += Model.score(X_train, y_train)
        test = Model.score(X_test, y_test)
        test_err += test
        if(test > 0):
            test_err_deneg += test
            non_neg_count += 1
        
    print("\nNumber of buckets: " + str(num_buckets))
    print('Average Training Error: ' + str(train_err / n_splits))
    if(non_neg_count > 0):
        print('Average Testing Error: ' + str(test_err_deneg / non_neg_count))
    print('Negative results: ' + str(n_splits - non_neg_count))

In [6]:
#1 to 128 buckets per variable with only one-hot encodings
for i in range(8):
    preprocess__and_fit(2**i)


Number of buckets: 1
Average Training Error: 0.26122765530670167
Average Testing Error: 0.2635063471885331
Negative results: 4

Number of buckets: 2
Average Training Error: 0.46750257758015845
Average Testing Error: 0.46111234334269197
Negative results: 4

Number of buckets: 4
Average Training Error: 0.5900523997721289
Average Testing Error: 0.5848649784238242
Negative results: 3

Number of buckets: 8
Average Training Error: 0.6728780458699222
Average Testing Error: 0.6683638297612626
Negative results: 2

Number of buckets: 16
Average Training Error: 0.7216921025271169
Average Testing Error: 0.7164246282233564
Negative results: 4

Number of buckets: 32
Average Training Error: 0.7541922982308339
Average Testing Error: 0.7451226687726715
Negative results: 3

Number of buckets: 64
Average Training Error: 0.770395416651297
Average Testing Error: 0.7550180986274012
Negative results: 3

Number of buckets: 128
Average Training Error: 0.7805799717912072
Average Testing Error: 0.75272954710391

In [7]:
#25 Buckets per variable keeping original data.
preprocess__and_fit(25, True)


Number of buckets: 25
Average Training Error: 0.7514140960856017
Average Testing Error: 0.7446987931636696
Negative results: 0
