In [1]:
import pandas as pd
import numpy as np
import lxml

df = pd.read_csv('AB_NYC_2019.csv')

df = df[['latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365'
]]
df.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,40.64749,-73.97237,149,1,9,0.21,6,365
1,40.75362,-73.98377,225,1,45,0.38,2,355
2,40.80902,-73.9419,150,3,0,,1,365
3,40.68514,-73.95976,89,1,270,4.64,1,194
4,40.79851,-73.94399,80,10,9,0.1,1,0


Q1) Find a feature with missing values. How many missing values does it have?

In [2]:
df.isnull().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

Q2) What's the median (50% percentile) for variable 'minimum_nights'?

In [3]:
df['minimum_nights'].median()

3.0

In [4]:
n = len(df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [5]:
n_val, n_test, n_train

(9779, 9779, 29337)

In [6]:
df_train = df.iloc[n_train:]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [7]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

In [8]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

df_train.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
879,40.64354,-73.97777,89,3,62,0.71,1,189
44383,40.70666,-73.90779,30,21,0,,1,73
15394,40.76116,-73.99016,120,2,17,0.43,1,0
43230,40.70763,-74.0105,470,2,5,1.88,327,272
16332,40.79658,-73.93287,199,2,30,0.8,1,30


In [9]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [10]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [11]:
y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

In [12]:
del df_train['price']
del df_val['price']
del df_test['price']

In [13]:
len(y_train)

29337

We need to deal with missing values for the column from Q1.
We have two options: fill it with 0 or with the mean of this variable.
Try both options. For each, train a linear regression model without regularization using the code from the lesssons.
For computing the mean, use the training only!
Use the validation dataset to evaluate the models and compare the RMSE of each option.
Round the RMSE scores to 2 decimal digits using round(score, 2)
Which option gives better RMSE?

In [14]:
# df.fillna(value = 0, inplace = True)
df.isnull().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [15]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [16]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [17]:
rpm_mean = df['reviews_per_month'].mean()
rpm_mean 

1.3732214298586884

In [18]:
x_train_0 = df_train.fillna({'reviews_per_month':0})
x_train_mean = df_train.fillna({'reviews_per_month': rpm_mean})
w0, w = train_linear_regression(x_train_0, y_train)
w0, w = train_linear_regression(x_train_mean, y_train)


In [19]:
x_val_0 = df_val.fillna({'reviews_per_month':0})
x_val_mean = df_val.fillna({'reviews_per_month': rpm_mean})
y_pred_0 = w0 + x_val_0.dot(w)
y_pred_mean = w0 + x_val_mean.dot(w)


In [20]:
rmse_with_mean = rmse(y_val, y_pred_mean)
rmse_with_0 = rmse(y_val, y_pred_0)
print("RMSE with fill mean:",round(rmse_with_mean,2))
print("RMSE with fill zero:",round(rmse_with_0,2))

RMSE with fill mean: 0.64
RMSE with fill zero: 0.64


Now let's train a regularized linear regression.
For this question, fill the NAs with 0.
Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
Use RMSE to evaluate the model on the validation dataset.
Round the RMSE scores to 2 decimal digits.
Which r gives the best RMSE?
If there are multiple options, select the smallest r.

In [21]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(x_train_0, y_train, r=r)
    y_pred = w0 + x_val_0.dot(w)
    
    print('When r is {}, the RMSE value is {}'.format(r,round(rmse(y_val, y_pred),2)))    

When r is 0, the RMSE value is 0.64
When r is 1e-06, the RMSE value is 0.64
When r is 0.0001, the RMSE value is 0.64
When r is 0.001, the RMSE value is 0.64
When r is 0.01, the RMSE value is 0.66
When r is 0.1, the RMSE value is 0.68
When r is 1, the RMSE value is 0.68
When r is 5, the RMSE value is 0.68
When r is 10, the RMSE value is 0.68


We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
For each seed, do the train/validation/test split with 60%/20%/20% distribution.
Fill the missing values with 0 and train a model without regularization.
For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
Round the result to 3 decimal digits (round(std, 3))

In [22]:
score = []

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train + n_val]]
    df_test = df.iloc[idx[n_train + n_val:]]

    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    

    y_train = np.log1p(df_train.price.values)
    y_val = np.log1p(df_val.price.values)
    y_test = np.log1p(df_test.price.values)

    del df_train['price']
    del df_val['price']
    del df_test['price']

    x_train_0 = df_train.fillna({'reviews_per_month':0})
    w0, w = train_linear_regression(x_train_0, y_train)

    x_val_0 = df_val.fillna({'reviews_per_month':0})
    y_pred_0 = w0 + x_val_0.dot(w)

    rmse_with_0 = rmse(y_val, y_pred_0)

    print('Seed: ', seed, 'RMSE: ', rmse_with_0)
    score.append(rmse_with_0)

print('Standard Deviation Of All The Scores: ', round(np.std(score),3))

Seed:  0 RMSE:  0.6430337788516721
Seed:  1 RMSE:  0.6549779960786503
Seed:  2 RMSE:  0.6462523685788484
Seed:  3 RMSE:  0.6476558177033449
Seed:  4 RMSE:  0.637514507042558
Seed:  5 RMSE:  0.644580908331267
Seed:  6 RMSE:  0.6305809996814798
Seed:  7 RMSE:  0.629785191803214
Seed:  8 RMSE:  0.6506184266456353
Seed:  9 RMSE:  0.6489780353287041
Standard Deviation Of All The Scores:  0.008


Split the dataset like previously, use seed 9.
Combine train and validation datasets.
Fill the missing values with 0 and train a model with r=0.001.
What's the RMSE on the test dataset?

In [23]:
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

df_full_train = pd.concat([df_train, df_val])
df_full_train.reset_index(drop = True)

# df_full_train = df_full_train.fillna(0)
# df_test = df_test.fillna(0)

def prepare_X(df, val):
    df = df.copy()

    df = df.fillna(val)
    X = df.values
    
    return X

x_full_train = prepare_X(df_full_train, 0)
y_full_train = np.concatenate([y_train, y_val])

w0, w = train_linear_regression_reg(x_full_train, y_full_train, 0.001)

x_test = prepare_X(df_test, 0)

y_pred = w0 + x_test.dot(w)

score = rmse(y_test, y_pred)

print('The RMSE on the test dataset is:', score)

The RMSE on the test dataset is: 0.645277134845326
