In [10]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from housing.csv (path is relative to the working directory).
df = pd.read_csv('housing.csv')

In [30]:
# Feature columns: every float64 column except the prediction target.
numeric_columns = [
    name
    for name in df.select_dtypes(include='float64').columns
    if name != 'median_house_value'
]
numeric_columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

# 1

In [5]:
# Per column: does it contain at least one missing value?
df.isna().any(axis=0)

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [6]:
# Number of missing entries in the total_bedrooms column.
df['total_bedrooms'].isna().sum()

207

# 2

In [8]:
# Median of the population column.
df['population'].median()

1166.0

# 3

In [17]:
# Shuffle the rows reproducibly, then cut them into 60% train / 20% val / 20% test.
np.random.seed(42)

n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - (n_val + n_test)  # train absorbs any rounding remainder

idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Contiguous slices of the shuffled frame; .copy() so later column deletions
# do not touch df_shuffled.
val_end = n_train + n_val
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:val_end].copy()
df_test = df_shuffled.iloc[val_end:].copy()

In [18]:
# Raw targets for each split, kept alongside their log1p-transformed versions.
y_train_orig, y_val_orig, y_test_orig = (
    frame['median_house_value'].values for frame in (df_train, df_val, df_test)
)
y_train, y_val, y_test = (
    np.log1p(raw) for raw in (y_train_orig, y_val_orig, y_test_orig)
)

# Drop the target from the feature frames so it cannot be used as an input.
for _frame in (df_train, df_val, df_test):
    del _frame['median_house_value']

In [69]:
def prepare_X(df, use_mean=False):
    """Build the feature matrix from the columns listed in ``numeric_columns``.

    Parameters
    ----------
    df : DataFrame containing at least the ``numeric_columns`` columns.
    use_mean : if True, missing values are replaced by the column means of
        ``df`` itself; otherwise they are replaced by 0.

    Returns
    -------
    2-D NumPy array with one column per entry of ``numeric_columns``.
    """
    features = df[numeric_columns]
    fill_value = df.mean(numeric_only=True) if use_mean else 0
    return features.fillna(fill_value).values

In [53]:
def train_linear_regression(X, y):
    """Fit ordinary least squares with the normal equation.

    A column of ones is prepended to ``X`` so the intercept is learned as the
    first weight.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets.

    Returns
    -------
    (w_0, w) : the intercept and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError if the Gram matrix cannot be inverted.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T.dot(design)
    weights = np.linalg.inv(gram).dot(design.T).dot(y)
    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [54]:
def rmse(y, y_pred):
    """Root mean squared error between targets ``y`` and predictions ``y_pred``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    return np.sqrt(((y_pred - y) ** 2).mean())

In [73]:
def count_rmse(use_mean):
    """Train on the current split and return the validation RMSE.

    Reads ``df_train``, ``df_val``, ``y_train`` and ``y_val`` from notebook
    scope.

    Parameters
    ----------
    use_mean : if True, missing values are imputed with the TRAINING column
        means in both the training and the validation features; if False,
        they are filled with 0 in both.

    Returns
    -------
    RMSE of the model's predictions on the validation split.
    """
    X_train = prepare_X(df_train, use_mean)
    w_0, w = train_linear_regression(X_train, y_train)

    if use_mean:
        # Validation rows must receive the same imputation as the training
        # rows. The statistics come from the training split only, so nothing
        # from the validation data leaks into preprocessing.
        train_means = df_train[numeric_columns].mean()
        X_val = prepare_X(df_val.fillna(train_means))
    else:
        X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)

    return rmse(y_val, y_pred)

In [75]:
# True when filling gaps with 0 yields a lower validation RMSE than filling with the mean.
rmse_zero_fill = count_rmse(use_mean=False)
rmse_mean_fill = count_rmse(use_mean=True)
rmse_zero_fill < rmse_mean_fill

False

# 4

In [83]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit linear regression with ``r`` added to the Gram-matrix diagonal.

    A column of ones is prepended to ``X`` so the intercept is learned as the
    first weight. ``r`` is added to every diagonal entry, including the one
    belonging to the intercept column.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets.
    r : regularisation strength; 0.0 gives the plain normal equation.

    Returns
    -------
    (w_0, w) : the intercept and the array of per-feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T.dot(design)
    penalised = gram + r * np.eye(gram.shape[0])
    weights = np.linalg.inv(penalised).dot(design.T).dot(y)
    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [84]:
# Feature matrices for the current split, with missing values filled with 0.
# X_val is built here as well: later cells read it at notebook scope, while
# the only other assignment to that name is a local inside count_rmse.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

In [103]:
# Validation RMSE for each candidate regularisation strength, in the same
# order as reg_params.
reg_params = [0, 1e-6, 1e-4, 1e-3]
rmse_list = []
for r in reg_params:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = X_val.dot(w) + w_0
    rmse_list.append(rmse(y_val, y_pred))

In [104]:
# Regularisation strength with the lowest validation RMSE. np.argmin returns
# the first minimum, so ties resolve to the earliest entry of reg_params, and
# it works whether the scores are NumPy scalars or plain Python floats.
reg_params[int(np.argmin(rmse_list))]

0.001

# 5

In [107]:
# Seeds 0 through 9, one train/val/test split per seed.
seeds = list(range(10))

In [110]:
def score_std(y, y_pred):
    """Standard deviation of the per-sample squared errors.

    The spread is taken over the individual squared errors. Reducing them to
    their mean first leaves a single scalar, whose standard deviation is
    always 0.0.

    Parameters
    ----------
    y : array of targets.
    y_pred : array of predictions, same shape as ``y``.

    Returns
    -------
    Population standard deviation (NumPy default, ddof=0) of ``(y_pred - y) ** 2``.
    """
    error = y_pred - y
    squared_error = error ** 2
    return np.std(squared_error)

In [119]:
rmse_scores = []

# Split sizes depend only on len(df), so they are computed once up front.
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

for seed in seeds:
    # Re-shuffle the rows with this seed and redo the train/val/test split.
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()

    # pop() hands back the target column and removes it from the frame.
    y_train_orig = df_train.pop('median_house_value').values
    y_val_orig = df_val.pop('median_house_value').values
    y_test_orig = df_test.pop('median_house_value').values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    y_test = np.log1p(y_test_orig)

    # count_rmse reads df_train / df_val / y_train / y_val from notebook
    # scope, which is why the split is assigned to those names directly.
    rmse_scores.append(count_rmse(use_mean=False))

In [122]:
# Spread of the validation RMSE across the seeds, to three decimals.
rmse_spread = np.std(rmse_scores)
round(rmse_spread, 3)

0.004

# 6

In [123]:
# NOTE(review): no cell visible in this section reads `seed` after this
# assignment — confirm whether the split was meant to be redone with it.
seed = 9

In [128]:
# Rebuild both feature matrices from the split currently held in
# df_train / df_val so their rows line up with y_train / y_val. The seed loop
# re-splits the data and reassigns those names, so a feature matrix built
# before the loop would pair features and targets from different shuffles.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

w_0, w = train_linear_regression_reg(X_train, y_train, r=0.001)
y_pred = w_0 + X_val.dot(w)
round(rmse(y_val, y_pred), 3)

0.337