In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("housing.csv")

In [84]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [60]:
df=df[['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value']]

In [61]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


## Q1 solution

In [62]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

## Q2 solution

In [63]:
df.population.quantile(0.5)

1166.0

In [73]:
df.total_bedrooms.mean()

537.8705525375618

In [16]:
def prepare_datax(df, seedval,fillnavalue):
    
    df=df[['latitude',
            'longitude',
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income',
            'median_house_value']]
    # Splitting of dataset into the train 60%, validation 20%, and the test 20 % dataset
    df = df.copy()
    n = len(df)
    n_val = int(n * 0.2)            # Validation dataset
    n_test = int(n * 0.2)           # Test dataset 
    n_train = n - n_val - n_test    # Train dataset
    
    #filling with zero in missing values
    
    #print(df.total_bedrooms.value_counts())
    df["total_bedrooms"] = df["total_bedrooms"].fillna(fillnavalue)
    #print(df.total_bedrooms.value_counts())
    
    # suffle index
    idx = np.arange(n)
    np.random.seed(seedval)
    np.random.shuffle(idx)
    
    #get train/val/test dataset
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    
    #reset index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    
    
    # Transformation of the y values
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    return df_train,df_val,df_test,y_train,y_val,y_test


In [17]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [18]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [26]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [33]:
def run_main(df, seed,fillnavalue,regularization=False,r=0.0):
    df_train,df_val,df_test,y_train,y_val,y_test = prepare_datax(df,seed,fillnavalue)
    if(regularization==False):
        w_0, w = train_linear_regression(df_train.values,y_train)
    else:
        print("R:",r)
        w_0, w = train_linear_regression_reg(df_train.values,y_train, r)
    y_valid_pred = w_0 + df_val.values.dot(w)
    return rmse(y_val,y_valid_pred)
    
    

## Q3 Solution

In [34]:
#fill na with zero
seed=42
print("seed used:",seed)
fillvalue=0
print("missing value fill with :",fillvalue)
print("RMSE:",run_main(df,seed,fillvalue,False,0.0))
print("Round of RMSE:",round(run_main(df,seed,fillvalue,False,0.0), 2))

seed used: 42
missing value fill with : 0
RMSE: 0.32953303652313465
Round of RMSE: 0.33


In [35]:
#fill na with mean
seed=42
print("seed used:",seed)
fillvalue=int(df.total_bedrooms.mean())
print("missing value fill with :",fillvalue)
print("RMSE:",run_main(df,seed,fillvalue,False,0.0))
print("Round of RMSE:",round(run_main(df,seed,fillvalue,False,0.0), 2))

seed used: 42
missing value fill with : 537
RMSE: 0.3290194136277196
Round of RMSE: 0.33


## Q4 solution

In [36]:
#fill na with zero
seed=42
print("seed used:",seed)
fillvalue=0
print("missing value fill with :",fillvalue)

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    RMSE=run_main(df,seed,fillvalue,True,r)
    print("RMSE:",RMSE)
    print("Round of RMSE:",round(RMSE, 2))

seed used: 42
missing value fill with : 0
R: 0
RMSE: 0.32953303652313465
Round of RMSE: 0.33
R: 1e-06
RMSE: 0.3295330361647731
Round of RMSE: 0.33
R: 0.0001
RMSE: 0.329533000970407
Round of RMSE: 0.33
R: 0.001
RMSE: 0.3295327038672534
Round of RMSE: 0.33
R: 0.01
RMSE: 0.32953193659894686
Round of RMSE: 0.33
R: 0.1
RMSE: 0.32969472053996735
Round of RMSE: 0.33
R: 1
RMSE: 0.33378872200041393
Round of RMSE: 0.33
R: 5
RMSE: 0.33924853455042614
Round of RMSE: 0.34
R: 10
RMSE: 0.34060638078084016
Round of RMSE: 0.34


## Q5 solution

In [51]:

#fill na with zero and seed variable

fillvalue=0
print("Missing value fill with :",fillvalue,"\n")

RMSE_VALUE_list=[]
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    RMSE=run_main(df,seed,fillvalue,False,r=0.0)
    print("SEED:",seed)
    print("RMSE:",RMSE)
    print("Round of RMSE:",round(RMSE, 3))
    RMSE_VALUE_list.append(RMSE)
    print("\n")
print("ALL RMSE",RMSE_VALUE_list)
print("Standard deviation",np.std(RMSE_VALUE_list))

Missing value fill with : 0 

SEED: 0
RMSE: 0.33884304805295884
Round of RMSE: 0.339


SEED: 1
RMSE: 0.3362387255955875
Round of RMSE: 0.336


SEED: 2
RMSE: 0.33209123188440265
Round of RMSE: 0.332


SEED: 3
RMSE: 0.3405153609035516
Round of RMSE: 0.341


SEED: 4
RMSE: 0.33890240665726906
Round of RMSE: 0.339


SEED: 5
RMSE: 0.34348667257195153
Round of RMSE: 0.343


SEED: 6
RMSE: 0.34519809530989876
Round of RMSE: 0.345


SEED: 7
RMSE: 0.33959899274043825
Round of RMSE: 0.34


SEED: 8
RMSE: 0.3466230873192809
Round of RMSE: 0.347


SEED: 9
RMSE: 0.3365926124192126
Round of RMSE: 0.337


ALL RMSE [0.33884304805295884, 0.3362387255955875, 0.33209123188440265, 0.3405153609035516, 0.33890240665726906, 0.34348667257195153, 0.34519809530989876, 0.33959899274043825, 0.3466230873192809, 0.3365926124192126]
Standard deviation 0.004170771946558344


## Q6 solution : 0.3453149507173144

In [53]:


seedval=9
fillnavalue=0
r=0.001

df=df[['latitude',
        'longitude',
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
        'median_house_value']]

# Splitting of dataset into the train 60%, validation 20%, and the test 20 % dataset
df = df.copy()
n = len(df)
n_val = int(n * 0.2)            # Validation dataset
n_test = int(n * 0.2)           # Test dataset 
n_train = n - n_val - n_test    # Train dataset

#filling with zero in missing values

#print(df.total_bedrooms.value_counts())
df["total_bedrooms"] = df["total_bedrooms"].fillna(fillnavalue)
#print(df.total_bedrooms.value_counts())

# suffle index
idx = np.arange(n)
np.random.seed(seedval)
np.random.shuffle(idx)

#get train/val/test dataset
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

#reset index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


# Transformation of the y values
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']


In [59]:
#combine train and val data
df_train_comb=df_train.append(df_val)
y_train_comb= np.concatenate((y_train, y_val))


In [60]:
len(df_train_comb),len(df_train),len(df_val)

(16512, 12384, 4128)

In [61]:
len(y_train_comb),len(y_train),len(y_val)

(16512, 12384, 4128)

In [62]:
w_0, w = train_linear_regression(df_train_comb.values,y_train_comb)

In [63]:
y_test_pred = w_0 + df_test.values.dot(w)

In [64]:
rmse(y_test,y_test_pred)

0.3453149507173144