In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Dataset
In this homework, we will use the California Housing Prices from Kaggle.

Here's a wget-able link:

wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

We'll keep working with the 'median_house_value' variable, and we'll transform it to a classification task.

## Features
For the rest of the homework, you'll need to use only these columns:
-   `'latitude'`,
-   `'longitude'`,
-   `'housing_median_age'`,
-   `'total_rooms'`,
-   `'total_bedrooms'`,
-   `'population'`,
-   `'households'`,
-   `'median_income'`,
-   `'median_house_value'`,
-   `'ocean_proximity'`,

## Data preparation
-   Select only the features from above and fill in the missing values with 0.
-   Create a new column `rooms_per_household` by dividing the column `total_rooms` by the column `households` from dataframe.
-   Create a new column `bedrooms_per_room` by dividing the column `total_bedrooms` by the column `total_rooms` from dataframe.
-   Create a new column `population_per_household` by dividing the column `population` by the column `households` from dataframe.

In [2]:
df_housing = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')
print(df_housing.shape)
df_housing.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
df_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
features = [
    'latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms',
    'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity'
]
df = df_housing[features]
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
df.isna().any()

latitude              False
longitude             False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [6]:
df_filled = df.fillna(0)
df_filled.isna().any().sum()

0

In [7]:
df_filled['rooms_per_household'] = df_filled.total_rooms/df_filled.households
df_filled['bedrooms_per_room'] = df_filled.total_bedrooms/df_filled.total_rooms
df_filled['populations_per_household'] = df_filled.population/df_filled.households
df_filled.head().T

Unnamed: 0,0,1,2,3,4
latitude,37.88,37.86,37.85,37.85,37.85
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY


## Question 1
What is the most frequent observation (mode) for the column ocean_proximity?

Options:
-   `NEAR BAY`
-   `<1H OCEAN`
-   `INLAND`
-   `NEAR OCEAN`

In [8]:
df_filled.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

## Make median_house_value binary

-   We need to turn the `median_house_value` variable from numeric into binary.
-   Let's create a variable `above_average` which is 1 if the `median_house_value` is above its mean value and 0 otherwise.

In [9]:
average = df_filled.median_house_value.mean()
above_average = (df_filled.median_house_value > average).astype('int64')
df_classify = df_filled.copy()
df_classify.median_house_value = above_average
df_classify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   latitude                   20640 non-null  float64
 1   longitude                  20640 non-null  float64
 2   housing_median_age         20640 non-null  float64
 3   total_rooms                20640 non-null  float64
 4   total_bedrooms             20640 non-null  float64
 5   population                 20640 non-null  float64
 6   households                 20640 non-null  float64
 7   median_income              20640 non-null  float64
 8   median_house_value         20640 non-null  int64  
 9   ocean_proximity            20640 non-null  object 
 10  rooms_per_household        20640 non-null  float64
 11  bedrooms_per_room          20640 non-null  float64
 12  populations_per_household  20640 non-null  float64
dtypes: float64(11), int64(1), object(1)
memory usa

## Split the data

-   Split your data in train/val/test sets, with 60%/20%/20% distribution.
-   Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
-   Make sure that the target value (median_house_value) is not in your dataframe.

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
df_full_train, df_test = train_test_split(df_classify, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(df_full_train, test_size=0.25, random_state=42)
print(len(df_train), len(df_valid), len(df_test))

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

df_train.head().T

12384 4128 4128


Unnamed: 0,0,1,2,3,4
latitude,34.43,33.74,39.13,34.24,37.52
longitude,-119.67,-118.32,-121.62,-118.63,-122.3
housing_median_age,39.0,24.0,41.0,9.0,38.0
total_rooms,1467.0,6097.0,1317.0,4759.0,2769.0
total_bedrooms,381.0,794.0,309.0,924.0,387.0
population,1404.0,2248.0,856.0,1884.0,994.0
households,374.0,806.0,337.0,915.0,395.0
median_income,2.3681,10.1357,1.6719,4.8333,5.5902
median_house_value,1,1,0,1,1
ocean_proximity,<1H OCEAN,NEAR OCEAN,INLAND,<1H OCEAN,NEAR OCEAN


In [12]:
y_train = df_train.median_house_value.values
y_valid = df_valid.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_valid['median_house_value']
del df_test['median_house_value']

## Question 2
-   Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your train dataset.
    -   In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
-   What are the two features that have the biggest correlation in this dataset?

Options:

-   `total_bedrooms` and `households`
-   `total_bedrooms` and `total_rooms`
-   `population` and `households`
-   `population_per_household` and `total_rooms`

In [13]:
numeric = list(df_train.columns)
numeric.remove('ocean_proximity')
categorical = ['ocean_proximity']

In [14]:
df_train_num = df_train[numeric]
df_train_num.corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,populations_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


## Question 3
-   Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
-   What is the value of mutual information?
-   Round it to 2 decimal digits using round(score, 2)

Options:
-   0.26
-   0
-   0.10
-   0.16


In [15]:
from sklearn.metrics import mutual_info_score

In [16]:
round(mutual_info_score(df_train.ocean_proximity, y_train), 2)

0.1

## Question 4

-   Now let's train a logistic regression
-   Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
-   Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)`
-   Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

Options:
-   0.60
-   0.72
-   0.84
-   0.95

In [17]:
from sklearn.feature_extraction import DictVectorizer

In [18]:
dv = DictVectorizer(sparse=False)

train_features = numeric+categorical

dict_train = df_train[train_features].to_dict(orient='records')
X_train = dv.fit_transform(dict_train)

dict_valid = df_valid[train_features].to_dict(orient='records')
X_valid = dv.fit_transform(dict_valid)

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [21]:
lr_score = model.score(X_valid, y_valid)
round(lr_score,2)

0.84

## Question 5
-   Let's find the least useful feature using the feature elimination technique.
-   Train a model with all these features (using the same parameters as in Q4).
-   Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
-   For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
-   Which of following feature has the smallest difference?
    -   total_rooms
    -   total_bedrooms
    -   population
    -   households

Note: the difference doesn't have to be positive


In [22]:
score_diffs = {}

for feature in train_features:
    
    sub_features = train_features.copy()
    sub_features.remove(feature)
    
    dict_train = df_train[sub_features].to_dict(orient='records')
    X_train = dv.fit_transform(dict_train)
    
    dict_valid = df_valid[sub_features].to_dict(orient='records')
    X_valid = dv.fit_transform(dict_valid)
    
    model.fit(X_train, y_train)
    score_diffs[feature] = model.score(X_valid, y_valid) - lr_score
    
sorted_score_diffs = dict(sorted(score_diffs.items(), key=lambda item: abs(item[1])))
pd.Series(sorted_score_diffs)

populations_per_household   -0.000242
total_rooms                  0.000242
bedrooms_per_room            0.000242
rooms_per_household         -0.000727
total_bedrooms               0.001211
households                  -0.001938
latitude                    -0.003634
longitude                   -0.004118
housing_median_age          -0.004360
population                  -0.009690
ocean_proximity             -0.015746
median_income               -0.050630
dtype: float64

## Question 6
-   For this question, we'll see how to use a linear regression model from Scikit-Learn
-   We'll need to use the original column `'median_house_value'`. Apply the logarithmic transformation to this column.
-   Fit the Ridge regression model (`model = Ridge(alpha=a, solver="sag", random_state=42)`) on the training data.
-   This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
-   Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
If there are multiple options, select the smallest alpha.

Options:
-   0
-   0.01
-   0.1
-   1
-   10

In [23]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [24]:
df_full_train, df_test = train_test_split(df_filled, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(df_full_train, test_size=0.25, random_state=42)
print(len(df_train), len(df_valid), len(df_test))

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

df_train.head().T

12384 4128 4128


Unnamed: 0,0,1,2,3,4
latitude,34.43,33.74,39.13,34.24,37.52
longitude,-119.67,-118.32,-121.62,-118.63,-122.3
housing_median_age,39.0,24.0,41.0,9.0,38.0
total_rooms,1467.0,6097.0,1317.0,4759.0,2769.0
total_bedrooms,381.0,794.0,309.0,924.0,387.0
population,1404.0,2248.0,856.0,1884.0,994.0
households,374.0,806.0,337.0,915.0,395.0
median_income,2.3681,10.1357,1.6719,4.8333,5.5902
median_house_value,241400.0,500001.0,64100.0,277200.0,417000.0
ocean_proximity,<1H OCEAN,NEAR OCEAN,INLAND,<1H OCEAN,NEAR OCEAN


In [25]:
y_train = np.log1p(df_train.median_house_value)
y_valid = np.log1p(df_valid.median_house_value)
y_test = np.log1p(df_test.median_house_value)

del df_train['median_house_value']
del df_valid['median_house_value']
del df_test['median_house_value']

In [26]:
dv = DictVectorizer(sparse=False)

train_features = numeric+categorical

dict_train = df_train[train_features].to_dict(orient='records')
X_train = dv.fit_transform(dict_train)

dict_valid = df_valid[train_features].to_dict(orient='records')
X_valid = dv.fit_transform(dict_valid)

In [27]:
alphas = [0, 0.01, 0.1, 1, 10]
rmse = {}

for a in alphas:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    rmse[a] = round(mean_squared_error(y_valid, y_pred, squared=False), 3)

rmse

{0: 0.524, 0.01: 0.524, 0.1: 0.524, 1: 0.524, 10: 0.524}