In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
SEED = 42

In [3]:
df = pd.read_csv("./housing.csv")

In [4]:
def remove_spaces(df):
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    string_columns = list(df.dtypes[df.dtypes == 'object'].index)

    for col in string_columns:
        df[col] = df[col].str.lower().str.replace(' ', '_')
    return df

In [5]:
columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity'
]

In [6]:
df = df[columns]

In [7]:
df = remove_spaces(df)

In [8]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,near_bay
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,near_bay
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,near_bay
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,near_bay
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,near_bay


In [9]:
df.fillna(0, inplace = True)

In [10]:
df['rooms_per_household'] = df.total_rooms/df.households
df['bedrooms_per_room'] = df.total_bedrooms/df.total_rooms
df['population_per_household'] = df.population/df.households

In [11]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,near_bay,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,near_bay,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,near_bay,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,near_bay,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,near_bay,6.281853,0.172096,2.181467


Question 1: Most frequent value for ocean_proximity

In [12]:
df.ocean_proximity.mode()

0    <1h_ocean
Name: ocean_proximity, dtype: object

Question 2
- Create the correlation matrix for the numerical features of your train dataset.
    - In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SEED)

In [15]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train_org = df_train.median_house_value.values
y_val_org = df_val.median_house_value.values
y_test_org = df_test.median_house_value.values

y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [16]:
df_train.corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


- total_bedrooms and households - 0.979399
- total_bedrooms and total_rooms - 0.931546	
- population and households - 0.906841	
- population_per_household and total_rooms - -0.029452	

Question 3
- Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
- What is the value of mutual information?
- Round it to 2 decimal digits using round(score, 2)

In [17]:
data_mean = y_train.mean()
data_mean

206807.7419250646

In [18]:
y_train = [1 if i > data_mean else 0 for i in y_train]

In [19]:
y_val = [1 if i > data_mean else 0 for i in y_val]

In [20]:
y_test = [1 if i > data_mean else 0 for i in y_test]

In [21]:
from sklearn.metrics import mutual_info_score

In [22]:
mutual_info_score(y_train, df_train.ocean_proximity)

0.10138385763624205

Question 4
- Now let's train a logistic regression
- Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [23]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [24]:
cat_cols = ['ocean_proximity']

In [25]:
cont_cols = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'rooms_per_household', 
    'bedrooms_per_room',
    'population_per_household'
]

In [26]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[cat_cols + cont_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[cat_cols + cont_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [27]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=SEED)

In [28]:
model.fit(X_train, y_train)

In [29]:
def get_accuracy(model, X, y):
    y_pred = model.predict_proba(X)[:, 1]
    above_average_decision = (y_pred >= 0.5)
    return (y == above_average_decision).mean()

In [30]:
get_accuracy(model, X_train, y_train)

0.8296188630490956

In [31]:
get_accuracy(model, X_val, y_val)

0.8359980620155039

Question 5: Feature selection - the smallest difference in accuracy
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?
    - total_rooms
    - total_bedrooms
    - population
    - households

In [32]:
remove_features = ['total_rooms', 'total_bedrooms', 'population', 'households']

In [33]:
for f in remove_features:
    temp_conts = cont_cols.copy()
    temp_conts.remove(f)

    dv = DictVectorizer(sparse=False)

    train_dict = df_train[cat_cols + temp_conts].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[cat_cols + temp_conts].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=SEED)
    model.fit(X_train, y_train)

    model_accuracy = get_accuracy(model, X_val, y_val)
    original_accuracy = 0.8340600775193798

    difference = original_accuracy - model_accuracy

    print(f'Feature removed: {f}, Accuracy Difference: {difference}')

Feature removed: total_rooms, Accuracy Difference: -0.0021802325581395943
Feature removed: total_bedrooms, Accuracy Difference: -0.0031492248062016115
Feature removed: population, Accuracy Difference: 0.007751937984496138
Feature removed: households, Accuracy Difference: 0.0


If removing a feature improves accuracy i.e. model_accuracy > original_accuracy then that feature is not useful.

Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
<br>If there are multiple options, select the smallest alpha.

In [37]:
y_train = np.log1p(y_train_org)
y_val = np.log1p(y_val_org)
y_test = np.log1p(y_test_org)

In [42]:
from sklearn.linear_model import Ridge

In [43]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[cat_cols + cont_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[cat_cols + cont_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [45]:
def rmse(y_pred, y):
    return np.sqrt(np.mean((y_pred - y) ** 2))

In [51]:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=SEED)
    model.fit(X_train, y_train)
    rmse_score = rmse(model.predict(X_val), y_val)
    print(f'Alpha: {a}, RMSE: {rmse_score}')

Alpha: 0, RMSE: 0.524063570701514
Alpha: 0.01, RMSE: 0.524063570718629
Alpha: 0.1, RMSE: 0.5240635708812071
Alpha: 1, RMSE: 0.5240635725155536
Alpha: 10, RMSE: 0.5240635888333284
