The Boston Housing Dataset

The Boston Housing Dataset is a derived from information collected by the U.S. Census Service concerning housing in the area of Boston MA. The following describes the dataset columns:

CRIM - per capita crime rate by town
ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS - proportion of non-retail business acres per town.
CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
NOX - nitric oxides concentration (parts per 10 million)
RM - average number of rooms per dwelling
AGE - proportion of owner-occupied units built prior to 1940
DIS - weighted distances to five Boston employment centres
RAD - index of accessibility to radial highways
TAX - full-value property-tax rate per $10,000
PTRATIO - pupil-teacher ratio by town
B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
LSTAT - % lower status of the population
MEDV - Median value of owner-occupied homes in $1000's


In [2]:
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style = "darkgrid")

import warnings
warnings.filterwarnings("ignore", category = Warning)

from sklearn.datasets import load_boston
boston = load_boston()

In [4]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
col_names = boston.feature_names
df = pd.DataFrame(boston.data, columns = col_names)
df['MEDV'] = boston.target

# Make sure the data is loaded correctly.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
df.shape

(506, 14)

In [6]:
# 2.Model Input / Output

x = df.iloc[:,:-1]
y = df["MEDV"]


In [7]:
x

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [8]:
validate_labels = df["MEDV"]
validate_labels

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64

In [9]:
import random
import string
import pandas as pd
import numpy as np
import itertools
import re
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model.label_model import LabelModel

In [10]:
%pip install snorkel

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:

# Housing price range is predicted 
# low_prace range is MIDV less than 25
# and high price range is MIDV greater than 25
# based on domain knowledge rules are implemented using Labelling functions.
# the following labelling functions are examples only. 
# any numbers of rules can be defined and converted to Labelling functions.



ABSTAIN = -1
high_price = 1
low_price = 0


    
@labeling_function()
def age(record):
    if record['AGE'] > 70 :
        return low_price
    #elif record['AGE'] >50 and record['AGE'] <70:
        #return md_price
    else:
        return  high_price

In [12]:
train_df = df.iloc[:,:-1]

In [13]:
@labeling_function()
def rooms(record):
    if record['RM'] < 7:
        return low_price
    #elif record['AGE'] >50 and record['AGE'] <70:
        #return md_price
    else:
        return  high_price

In [14]:
@labeling_function()
def taxrate(record):
    if record['TAX'] < 290:
        return low_price
    #elif record['AGE'] >50 and record['AGE'] <70:
        #return md_price
    else:
        return  high_price

In [15]:
@labeling_function()
def lowerstatus(record):
    if record['LSTAT'] > 8:
        return low_price
    #elif record['AGE'] >50 and record['AGE'] <70:
        #return md_price
    else:
        return  high_price

In [16]:
train_df.head

<bound method NDFrame.head of         CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  
0       15.3  396.90   4.98  
1       1

In [17]:
lfs = [age,rooms,taxrate,lowerstatus]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train_df)
#L_validate = applier.apply(df=validate_df)

100%|██████████| 506/506 [00:00<00:00, 56290.42it/s]


In [18]:
L_train

array([[1, 0, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 0, 1],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1]])

In [19]:
LFAnalysis(L_train, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
age,0,"[0, 1]",1.0,1.0,0.938735
rooms,1,"[0, 1]",1.0,1.0,0.938735
taxrate,2,"[0, 1]",1.0,1.0,0.938735
lowerstatus,3,"[0, 1]",1.0,1.0,0.938735


In [20]:
label_model = LabelModel(verbose=False)
label_model.fit(L_train=L_train, n_epochs=1000, seed=100)
train_df['Labels'] = label_model.predict(L=L_train)
#preds_valid_label = label_model.predict(L=L_validate)

100%|██████████| 1000/1000 [00:00<00:00, 2667.46epoch/s]


In [21]:
train_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Labels
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,1
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,1
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,1
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,1
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,1
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,1
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,1
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,1


In [22]:
train_df['Labels'].value_counts()

0    270
1    236
Name: Labels, dtype: int64

In [23]:
train_df['Labels']

0      1
1      1
2      1
3      1
4      1
      ..
501    1
502    1
503    1
504    1
505    1
Name: Labels, Length: 506, dtype: int64