In [1]:
import pandas as pd
import numpy as np

### Load and prepare data

In [2]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [3]:
df = pd.read_csv('housing.csv')
columns = [
    'latitude', 'longitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'median_house_value', 'ocean_proximity'
]
df = df[columns].fillna(0)

# create new features
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

In [4]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   latitude                  20640 non-null  float64
 1   longitude                 20640 non-null  float64
 2   housing_median_age        20640 non-null  float64
 3   total_rooms               20640 non-null  float64
 4   total_bedrooms            20640 non-null  float64
 5   population                20640 non-null  float64
 6   households                20640 non-null  float64
 7   median_income             20640 non-null  float64
 8   median_house_value        20640 non-null  float64
 9   ocean_proximity           20640 non-null  object 
 10  rooms_per_household       20640 non-null  float64
 11  bedrooms_per_room         20640 non-null  float64
 12  population_per_household  20640 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.0+ MB


In [6]:
numerical_features = [
    'latitude', 'longitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'median_house_value'
]
categorical_features = ['ocean_proximity']

### Question 1

In [7]:
df['ocean_proximity'].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

### Question 2

In [8]:
corr_matrix = df[numerical_features].corr()
corr_matrix

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,1.0,-0.924664,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,-0.14416
longitude,-0.924664,1.0,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.045967
housing_median_age,0.011173,-0.108197,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,-0.0361,0.044568,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.134153
total_bedrooms,-0.065318,0.068082,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.049148
population,-0.108785,0.099773,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.02465
households,-0.071035,0.05531,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,0.065843
median_income,-0.079809,-0.015176,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.688075
median_house_value,-0.14416,-0.045967,0.105623,0.134153,0.049148,-0.02465,0.065843,0.688075,1.0


In [9]:
corr_matrix.replace(1, np.nan).max().sort_values(ascending=False)

total_bedrooms        0.966507
households            0.966507
total_rooms           0.920196
population            0.907222
median_income         0.688075
median_house_value    0.688075
housing_median_age    0.105623
longitude             0.099773
latitude              0.011173
dtype: float64

### Binary transformation

In [10]:
avg_value = df['median_house_value'].mean()
print(avg_value)
df['above_average'] = (df['median_house_value'] > avg_value).astype('int')

206855.81690891474


In [11]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,1
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842,1
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226,1
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945,1
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467,1


### Split the data

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full,
                                    test_size=0.25,
                                    random_state=42)

y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

df_train.drop(['above_average', 'median_house_value'], axis=1, inplace=True)
df_val.drop(['above_average', 'median_house_value'], axis=1, inplace=True)
df_test.drop(['above_average', 'median_house_value'], axis=1, inplace=True)

### Question 3

In [14]:
from sklearn.metrics import mutual_info_score

In [15]:
mi_score = mutual_info_score(df_train['ocean_proximity'], y_train)
round(mi_score, 2)

0.1

### Question 4

In [16]:
# one-hot encoding
from sklearn.feature_extraction import DictVectorizer

train_dict = df_train.to_dict(orient='records')
train_dict[0]

{'latitude': 34.43,
 'longitude': -119.67,
 'housing_median_age': 39.0,
 'total_rooms': 1467.0,
 'total_bedrooms': 381.0,
 'population': 1404.0,
 'households': 374.0,
 'median_income': 2.3681,
 'ocean_proximity': '<1H OCEAN',
 'rooms_per_household': 3.9224598930481283,
 'bedrooms_per_room': 0.25971370143149286,
 'population_per_household': 3.7540106951871657}

In [17]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

In [18]:
# train model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="liblinear",
                           C=1.0,
                           max_iter=1000,
                           random_state=42)

model.fit(X_train, y_train)

In [19]:
# evaluate on valuation set
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict(X_val)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
acc_score = accuracy_score(y_val, y_pred)
round(acc_score, 2)

0.84

### Question 5

In [22]:
numerical_features = [
    'latitude', 'longitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income'
]

acc_dict = {}

for f in numerical_features + categorical_features:

    train_dict = df_train.drop(f, axis=1).to_dict(orient='records')

    # preprocessing
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)

    # train model
    model = LogisticRegression(solver="liblinear",
                           C=1.0,
                           max_iter=1000,
                           random_state=42)
    model.fit(X_train, y_train)

    # evaluate on valuation set
    val_dict = df_val.drop(f, axis=1).to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    diff = np.abs(acc_score - acc)
    acc_dict[f] = diff

In [23]:
pd.DataFrame.from_dict(acc_dict, orient='index').sort_values(by=0)

Unnamed: 0,0
total_bedrooms,0.000242
longitude,0.000969
total_rooms,0.001453
latitude,0.002422
households,0.002665
housing_median_age,0.005329
population,0.00969
ocean_proximity,0.016957
median_income,0.050872


### Question 6

In [24]:
df['median_house_value_log'] = np.log1p(df['median_house_value'])

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full,
                                    test_size=0.25,
                                    random_state=42)

y_train = df_train['median_house_value_log'].values
y_val = df_val['median_house_value_log'].values
y_test = df_test['median_house_value_log'].values

df_train.drop(
    ['above_average', 'median_house_value', 
    'median_house_value_log'],
    axis=1,
    inplace=True)
df_val.drop(['above_average', 'median_house_value', 
'median_house_value_log'],
            axis=1,
            inplace=True)
df_test.drop(['above_average', 'median_house_value', 
'median_house_value_log'],
             axis=1,
             inplace=True)


In [25]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [26]:
# preprocessing
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

rmse_dict = {}

# train model
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)

    # evaluate on valuation set
    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)
    rmse = round(mean_squared_error(y_val, y_pred), 3)

    rmse_dict[a] = rmse


In [27]:
rmse_dict

{0: 0.275, 0.01: 0.275, 0.1: 0.275, 1: 0.275, 10: 0.275}