In [1]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.linear_model import LinearRegression

In [2]:
# Load the training and test splits of the house data from CSV files
# located in the notebook's working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

## Assignment 1 ##


### Feature Engineering ###

Consider transformations of existing variables, e.g. the log of the square feet or even "interaction" variables such as the product of bedrooms and bathrooms. Add 4 new variables in both your train_data and test_data. 

- ‘bedrooms_squared’ = ‘bedrooms’*‘bedrooms’
- ‘bed_bath_rooms’ = ‘bedrooms’*‘bathrooms’
- ‘log_sqft_living’ = log(‘sqft_living’)
- ‘lat_plus_long’ =  ‘lat’ + ‘long’

Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) since 1^2 = 1 but 4^2 = 16. Consequently this variable will mostly affect houses with many bedrooms.

Bedrooms times bathrooms is what's called an "interaction" variable. It is large when both of them are large.

Taking the log of square feet has the effect of bringing large values closer together and spreading out small values.

Adding latitude to longitude is nonsensical, but we will do it anyway (you'll see why).

In [3]:
def add_variables(df):
    """
    Append the four engineered feature columns to df (modifies df in place, returns nothing)
    :param df: pandas dataframe holding the columns 'bedrooms', 'bathrooms',
               'sqft_living', 'lat' and 'long'
    """
    bedrooms = df['bedrooms']

    # Every new column is derived from the original columns only, so all of
    # them can be computed first and attached afterwards in a fixed order.
    engineered = {
        'bedrooms_squared': bedrooms.mul(bedrooms),
        'bed_bath_rooms': bedrooms.mul(df['bathrooms']),
        'log_sqft_living': np.log(df['sqft_living']),
        'lat_plus_long': df['lat'].add(df['long']),
    }
    for column_name, values in engineered.items():
        df[column_name] = values


In [4]:
def calculate_rss(y_actual, y_predict):
    """
    Residual sum of squares between observed and predicted values
    :param y_actual: observed values
    :param y_predict: predicted values, subtracted element-wise from y_actual
    :return: the squared differences added up with the built-in sum
    """
    residuals = y_actual - y_predict
    squared_residuals = residuals**2
    return sum(squared_residuals)

In [5]:
# Engineer the extra columns on both splits; add_variables modifies its argument in place.
for split in (train_data, test_data):
    add_variables(split)

# Mean of each engineered feature on the test split, rounded to two decimals.
engineered_columns = ['bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long']
test_data[engineered_columns].apply(np.mean).round(2)

bedrooms_squared    12.45
bed_bath_rooms       7.50
log_sqft_living      7.55
lat_plus_long      -74.65
dtype: float64

### Regression Model ###

- Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’
- Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’
- Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’
(The three models here are “nested” in that all of the features of the Model 1 are in Model 2 and all of the features of Model 2 are in Model 3.)  

Learn all three models on the TRAINING data set.


In [6]:
# Feature lists for the three nested models. The order inside each list fixes
# the order of the printed coefficients.
base_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = base_features + ['bed_bath_rooms']
model_3_features = base_features + ['bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long']

# All three models are fitted against the same training target.
train_price = train_data['price']

model_1 = LinearRegression().fit(train_data[base_features], train_price)
model_2 = LinearRegression().fit(train_data[model_2_features], train_price)
model_3 = LinearRegression().fit(train_data[model_3_features], train_price)

for fitted_model in (model_1, model_2, model_3):
    print(fitted_model.coef_)


[ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
 -3.09374351e+05]
[ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
 -2.94298969e+05  2.55796520e+04]
[ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
 -4.06750711e+05 -6.78858667e+03 -8.57050439e+03 -5.61831484e+05
  1.27334900e+05]


## Assignment 2 ##


### Gradient Descent ###


In [7]:
def get_numpy_data(df, features, output):
    """
    Takes a data set df, a list of features, and an output name,
    returns a 2D feature matrix and the output as a column vector.
    The input dataframe is left unmodified.
    :param df: pandas dataframe of the data set
    :param features: list of the feature names to be selected (a single name is also accepted)
    :param output: name of the output column
    :return features_matrix: a 2D array of shape (N, 1 + number of features) whose
                             first column is the constant feature (all 1s)
    :return output_array: the output values as a 2D column of shape (N, 1)
    """
    # accept a bare column name as a one-element feature list
    if isinstance(features, str):
        features = [features]

    # Work on a copy of the selected columns: writing the constant column into
    # df itself would silently change the caller's dataframe on every call.
    design = df[list(features)].copy()
    # prepend the constant feature so that weights[0] acts as the intercept
    design.insert(0, 'constant', 1, allow_duplicates=True)

    # convert the features into a numpy matrix
    features_matrix = design.to_numpy()

    # convert the output into a numpy array and force it into an (N, 1) column
    output_array = df[output].to_numpy()
    output_array = output_array.reshape(output_array.shape[0], 1)

    return features_matrix, output_array


If the features matrix (including a column of 1s for the constant) is stored as a 2D array (or matrix) and the regression weights are stored as a 1D array then the predicted output is just the dot product between the features matrix and the weights (with the weights on the right).

In [8]:
def predict_outcome(feature_matrix, weights):
    """
    Compute the linear-model predictions for every row of the feature matrix
    :param feature_matrix: 2D matrix of features, N observations by D features (N*D)
    :param weights: array of the D regression weights
    :return predictions: the N predicted outcomes as a column of shape (N*1)
    """
    # row-wise dot product of the features with the weights
    raw_predictions = np.dot(feature_matrix, weights)
    n_observations = raw_predictions.shape[0]
    # always hand back a column, whatever shape the dot product produced
    return raw_predictions.reshape(n_observations, 1)


If we have the values of a single input feature in an array ‘feature’ and the prediction ‘errors’ (predictions - output), then the derivative of the regression cost function with respect to the weight of ‘feature’ is just twice the dot product between ‘feature’ and ‘errors’.

In [9]:
def feature_derivative(errors, feature):
    """
    Given the prediction errors and the values of one feature across all
    observations, return minus twice their dot product.
    NOTE: with errors = predictions - output this is the NEGATIVE of the
    derivative of the RSS with respect to that feature's weight, which is why
    callers in this notebook add (rather than subtract) step_size times it.
    :param errors: array of prediction errors (N*1)
    :param feature: 1D array holding one feature's values for the N observations
    :return : -2 * (feature . errors); a one-element array when errors has shape (N*1)
    """
    feature_dot_errors = np.dot(np.transpose(feature), errors)
    return -2 * feature_dot_errors

Although we can compute the derivative for all the features simultaneously (the gradient) we will explicitly loop over the features individually for simplicity. Write a gradient descent function that does the following:
- Accepts a numpy feature_matrix 2D array, a 1D output array, an array of initial weights, a step size and a convergence tolerance.
- While not converged updates each feature weight by subtracting the step size times the derivative for that feature given the current weights
- At each step computes the magnitude/length of the gradient (square root of the sum of squared components)
- When the magnitude of the gradient is smaller than the input tolerance returns the final weight vector.

In [10]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """
    Find weights for the regression model through gradient descent
    :param feature_matrix: 2D matrix of features with a shape of N observations * D features (N*D)
    :param output: actual output, either a flat array of N values or a column (N*1)
    :param initial_weights: array-like of the D initial weights (not modified)
    :param step_size: number value given to the step size of gradient descent
    :param tolerance: gradient magnitude below which the descent counts as converged
    :param max_iterations: optional cap on the number of passes; None (the default)
                           keeps iterating until the tolerance is reached
    :return weights: array of final regression model weights (the current weights
                     if max_iterations is exhausted before convergence)
    :raises ValueError: if the gradient magnitude becomes infinite or NaN, i.e. the
                        descent diverged (typically because step_size is too large)
    """
    # Copy as float: integer initial weights would otherwise produce an integer
    # array and every fractional update would be truncated.
    weights = np.array(initial_weights, dtype=float)
    # Force a column so it lines up with the (N, 1) predictions; a flat array
    # would broadcast the subtraction below into an (N, N) matrix.
    target = np.asarray(output).reshape(-1, 1)

    iterations_done = 0
    while max_iterations is None or iterations_done < max_iterations:
        # compute the predictions based on feature_matrix and weights
        predictions = predict_outcome(feature_matrix, weights)  # shape: N*1

        # compute the errors as predictions - output
        errors = predictions - target  # shape: N*1

        gradient_sum_squares = 0.0  # running sum of the squared derivatives

        # update each weight individually; errors stay fixed during the pass
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i].
            # feature_derivative hands back a one-element array, so unwrap it to a
            # plain float instead of relying on implicit array-to-scalar conversion.
            derivative = np.asarray(feature_derivative(errors, feature_matrix[:, i])).item()

            # add the squared derivative to the gradient sum of squares
            gradient_sum_squares += derivative * derivative

            # feature_derivative in this notebook returns the negative of the
            # derivative, so adding step_size * derivative moves downhill
            weights[i] = weights[i] + step_size * derivative
        iterations_done += 1

        gradient_magnitude = sqrt(gradient_sum_squares)
        if not np.isfinite(gradient_magnitude):
            # NaN/inf never compares below the tolerance, so without this check
            # a diverging run would loop forever
            raise ValueError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size"
            )
        if gradient_magnitude < tolerance:
            break
    return weights


#### Model 1 with only one feature: "sqft_living" ####

In [11]:
# Single-feature model: price explained by a constant plus sqft_living.
# The output is passed as a column name (not a one-element list), matching
# get_numpy_data's documented contract and the model 2 cells.
feature_matrix, output = get_numpy_data(train_data, ['sqft_living'], 'price')
initial_weights = np.array([-47000., 1.])  # [intercept, sqft_living weight]
step_size = 7e-12
tolerance = 2.5e7
model_1_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [12]:
# Display the learned weights; the first entry belongs to the constant column
# that get_numpy_data prepends, the second to 'sqft_living'.
model_1_weights

array([-46999.88716555,    281.91211918])

In [13]:
# Use model 1's weights to predict house prices on test data
test_model_1_feature_matrix, test_output = get_numpy_data(test_data, ['sqft_living'], ['price'])
model_1_test_predictions = predict_outcome(test_model_1_feature_matrix, model_1_weights)


print("Predicted price of the first house with model 1 weights: ", model_1_test_predictions[0])

model_1_rss = np.dot((test_output - model_1_test_predictions).T, (test_output - model_1_test_predictions)) 
print("Model 1 RSS on test data: ", model_1_rss)

Predicted price of the first house with model 1 weights:  [356134.443255]
Model 1 RSS on test data:  [[2.75400045e+14]]


#### Model 2 with two features: "sqft_living" and "sqft_living15" ####


In [14]:
# Two-feature model: a constant plus sqft_living and sqft_living15.
feature_matrix, output = get_numpy_data(train_data, ['sqft_living', 'sqft_living15'], 'price')

# One starting weight per column of the feature matrix, constant first.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

model_2_weights = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

In [15]:
# Use model 2's weights to predict house prices on test data
test_model_2_feature_matrix, test_output = get_numpy_data(test_data, ['sqft_living', 'sqft_living15'], 'price')
model_2_test_predictions = predict_outcome(test_model_2_feature_matrix, model_2_weights)

# The labels name model 2: this cell evaluates model_2_weights, not model 1.
print("Predicted price of the first house with model 2 weights: ", model_2_test_predictions[0])

# Residuals form an (N, 1) column, so residuals.T . residuals is the 1x1 matrix
# holding the residual sum of squares. Compute the residuals only once.
model_2_residuals = test_output - model_2_test_predictions
model_2_rss = np.dot(model_2_residuals.T, model_2_residuals)
print("Model 2 RSS on test data: ", model_2_rss)

Predicted price of the first house with model 1 weights:  [366651.41162949]
Model 1 RSS on test data:  [[2.70263444e+14]]
