In [128]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
!pip install fast_ml
!pip install sklearn

--2022-09-15 21:15:25--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv.2’


2022-09-15 21:15:25 (19.7 MB/s) - ‘housing.csv.2’ saved [1423529/1423529]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge
from fast_ml.model_development import train_valid_test_split
from math import sqrt
%matplotlib inline

In [130]:
# Read the CSV and expand it with pd.get_dummies in one expression.
df = pd.get_dummies(pd.read_csv('housing.csv'))
# Put the target on a log scale: log1p(x) = log(1 + x).
df['median_house_value'] = np.log1p(df['median_house_value'])

# Missing-value count per column, largest first.
nan_counts = df.isna().sum().sort_values(ascending=False)
print(f'Column with most Nans:\n {nan_counts.head(1)}\n\n')
population_median = df['population'].median()
print(f'Median of Population: {population_median}')


Column with most Nans:
 total_bedrooms    207
dtype: int64


Median of Population: 1166.0


In [131]:
def _impute_bedrooms(frame, fill_value, imputed_col):
    """Return a copy of `frame` with `total_bedrooms` replaced by an imputed column.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing a `total_bedrooms` column.
    fill_value : scalar
        Value substituted for every NaN in `total_bedrooms`.
    imputed_col : str
        Name of the new column holding the filled values. It is appended as
        the last column and the original `total_bedrooms` column is dropped.

    Returns
    -------
    pd.DataFrame
        A new frame; `frame` itself is not modified.
    """
    filled = frame['total_bedrooms'].fillna(value=fill_value)
    return frame.assign(**{imputed_col: filled}).drop(columns=['total_bedrooms'])


def _split_60_20_20(frame):
    """Split `frame` 60/20/20 into train/valid/test with random_state=42.

    Returns the six objects produced by `train_valid_test_split`, unpacked by
    the callers below as X_train, y_train, X_valid, y_valid, X_test, y_test.
    """
    return train_valid_test_split(frame, target='median_house_value', train_size=0.6, valid_size=0.2, test_size=0.2, random_state=42)


def _rmse(y_true, y_pred):
    """Root-mean-squared error, rounded to 5 decimal places."""
    return round(sqrt(mean_squared_error(y_true, y_pred)), 5)


# One frame per imputation strategy; each imputed column is named after the
# strategy that actually filled it.
# NOTE(review): the mean and median are computed on the full dataset before
# splitting, so validation/test rows contribute to the fill value. Consider
# computing them on the training split only.
df_mean = _impute_bedrooms(df, df.total_bedrooms.mean(), 'total_bedrooms_mean')
df_median = _impute_bedrooms(df, df.total_bedrooms.median(), 'total_bedrooms_median')
df_zero = _impute_bedrooms(df, 0, 'total_bedrooms_zero')

X_train_mean, y_train_mean, X_valid_mean, y_valid_mean, X_test_mean, y_test_mean = _split_60_20_20(df_mean)
X_train_median, y_train_median, X_valid_median, y_valid_median, X_test_median, y_test_median = _split_60_20_20(df_median)
X_train_zero, y_train_zero, X_valid_zero, y_valid_zero, X_test_zero, y_test_zero = _split_60_20_20(df_zero)

# NOTE(review): the strategies are compared on the test split and the
# validation split is never used in this cell -- confirm this is intended.
model_mean = LinearRegression().fit(X_train_mean, y_train_mean)
y_pred_mean = model_mean.predict(X_test_mean)
print(f'rmse with nans replaced with Mean:{_rmse(y_test_mean, y_pred_mean)}')

model_median = LinearRegression().fit(X_train_median, y_train_median)
y_pred_median = model_median.predict(X_test_median)
print(f'rmse with nans replace with Median: {_rmse(y_test_median, y_pred_median)}')

model_zero = LinearRegression().fit(X_train_zero, y_train_zero)
y_pred_zero = model_zero.predict(X_test_zero)
print(f'rmse with nans replace with 0: {_rmse(y_test_zero, y_pred_zero)}')

# In the recorded run, median imputation gives a marginally lower test RMSE
# than mean or zero imputation.


rmse with nans replaced with Mean:0.33229
rmse with nans replace with Median: 0.33224
rmse with nans replace with 0: 0.33362


In [133]:
# Sweep the Ridge regularisation strength on the zero-imputed split and record
# the test RMSE (rounded to 3 decimals) for every value tried.
r = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]
score = {}

for alpha in r:
    model_ridge = Ridge(alpha=alpha)
    model_ridge.fit(X_train_zero, y_train_zero)
    y_pred = model_ridge.predict(X_test_zero)
    test_rmse = sqrt(mean_squared_error(y_test_zero, y_pred))
    score[alpha] = round(test_rmse, 3)
    print(f'rmse at r = {alpha} ---> {score[alpha]}')

# In the recorded run alpha = 0 yields the lowest rounded RMSE, i.e. the ridge
# penalty brings no improvement here.

rmse at r = 0 ---> 0.333
rmse at r = 1e-06 ---> 0.334
rmse at r = 0.0001 ---> 0.334
rmse at r = 0.001 ---> 0.334
rmse at r = 0.01 ---> 0.334
rmse at r = 0.1 ---> 0.334
rmse at r = 1 ---> 0.334
rmse at r = 5 ---> 0.334
rmse at r = 10 ---> 0.334


In [137]:
# Zero-impute `total_bedrooms`, then check how much the test RMSE of a plain
# linear regression moves across ten different split seeds.
dff = (
    df
    .assign(total_bedrooms_zero=df.total_bedrooms.fillna(value=0))
    .drop(columns=['total_bedrooms'])
)
states = list(range(10))
rmse = []

for s in states:
    X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
        dff,
        'median_house_value',
        train_size=0.6,
        valid_size=0.2,
        test_size=0.2,
        random_state=s,
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred_ = model.predict(X_test)
    rmse.append(sqrt(mean_squared_error(y_test, y_pred_)))

# Standard deviation of the RMSE across seeds; displayed as the cell output.
round(np.std(rmse), 3)


0.006

In [138]:
# Final model: refit on train + validation and score once on the test split.
r = 0.001

# Build the random_state=9 split explicitly from `dff` (the zero-imputed frame)
# so this cell does not depend on whatever an earlier loop happened to leave
# bound to X_train / X_valid / X_test.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(dff, 'median_house_value', train_size=0.6, valid_size=0.2, test_size=0.2, random_state=9)

# Combine the training and validation sets
X = pd.concat([X_train, X_valid], axis=0)
y = pd.concat([y_train, y_valid], axis=0)

# Lasso (L1) regularisation here, with alpha set to 0.001
model_final = Lasso(alpha=r)
model_final.fit(X, y)
Y_pred = model_final.predict(X_test)

# RMSE on the test split
print(sqrt(mean_squared_error(y_test, Y_pred)))




0.3356214915705247
