In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('housing.csv')

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
cols = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population',
        'households','median_income','median_house_value','ocean_proximity']

In [5]:
df = df[cols]

In [6]:
# Select only the features from above and fill in the missing values with 0.
df = df.fillna(0)

In [7]:
# Create a new column `rooms_per_household` by dividing the column `total_rooms` by the column `households` from dataframe.
df = df.assign(rooms_per_household = df['total_rooms'] / df['households'])

In [8]:
# Create a new column `bedrooms_per_room` by dividing the column `total_bedrooms` by the column `total_rooms` from dataframe. 
df = df.assign(bedrooms_per_room = df['total_bedrooms'] / df['total_rooms'])

In [9]:
# Create a new column `population_per_household` by dividing the column `population` by the column `households` from dataframe. 
df = df.assign(population_per_household = df['population'] / df['households'])

# 1

What is the most frequent observation (mode) for the column `ocean_proximity`?

In [10]:
df['ocean_proximity'].value_counts().index[0]

'<1H OCEAN'

# 2

- Create the correlation matrix for the numerical features of your train dataset
- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [11]:
df.dtypes

latitude                    float64
longitude                   float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [12]:
cor_matrix = df.drop(['latitude', 'longitude', 'ocean_proximity'], axis=1).corr().abs()

In [13]:
s = cor_matrix.unstack()
so = s.sort_values(kind="quicksort")
so.drop_duplicates(inplace=True)

In [14]:
so[so != 1].index[-1]

('households', 'total_bedrooms')

In [15]:
so[so != 1].index[-2]

('total_bedrooms', 'total_rooms')

# 3

Create a variable `above_average` which is `1` if the `median_house_value` is above its mean value and `0` otherwise.

Split your data in train/val/test sets, with 60%/20%/20% distribution. Use Scikit-Learn for that (the `train_test_split` function) and set the seed to 42.

Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
What is the value of mutual information?
Round it to 2 decimal digits using `round(score, 2)`

In [16]:
median_median_house_value = df['median_house_value'].median()

In [17]:
df['above_average'] = [1 if x > median_median_house_value else 0 for x in df['median_house_value']]

In [18]:
y = df['above_average']
X = df.drop(['above_average', 'median_house_value'], axis=1)

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [20]:
from sklearn.metrics import mutual_info_score
round(mutual_info_score(X_train.ocean_proximity, pd.Series(y_train)), 2)

0.12

# 4

Fit the logistic regression model on the training dataset.

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [21]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

OH_cols_train = pd.DataFrame(ohe.fit_transform(X_train[['ocean_proximity']]), columns=ohe.get_feature_names_out())
OH_cols_test = pd.DataFrame(ohe.transform(X_test[['ocean_proximity']]), columns=ohe.get_feature_names_out())
OH_cols_valid = pd.DataFrame(ohe.transform(X_val[['ocean_proximity']]), columns=ohe.get_feature_names_out())

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index
OH_cols_valid.index = X_val.index

# Remove categorical columns (will replace with one-hot encoding)
numeric_X_train = X_train.drop('ocean_proximity', axis=1)
numeric_X_test = X_test.drop('ocean_proximity', axis=1)
numeric_X_valid = X_val.drop('ocean_proximity', axis=1)

# Add one-hot encoded columns to numerical features
X_train = pd.concat([numeric_X_train, OH_cols_train], axis=1)
X_test = pd.concat([numeric_X_test, OH_cols_test], axis=1)
X_val = pd.concat([numeric_X_valid, OH_cols_valid], axis=1)

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
results = model.fit(X_train, y_train)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
accuracy = accuracy_score(y_val, results.predict(X_val))
round(accuracy, 2)

0.83

# 5

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [26]:
df_train = X_train.copy()
df_val = X_val.copy()

In [32]:
from sklearn.feature_extraction import DictVectorizer

orig_score = accuracy
features = df_train.columns.tolist()

res = []
for c in features:
    subset = features.copy()
    subset.remove(c)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    score_new = orig_score - score
    res.append(score_new)
    print(c, score_new, score)

latitude -0.0019379844961240345 0.8343023255813954
longitude 0.0004844961240310086 0.8318798449612403
housing_median_age 0.002422480620155043 0.8299418604651163
total_rooms -0.0029069767441860517 0.8352713178294574
total_bedrooms 0.0021802325581395943 0.8301841085271318
population 0.005087209302325646 0.8272771317829457
households 0.004844961240310086 0.8275193798449613
median_income 0.057412790697674465 0.7749515503875969
rooms_per_household 0.0016957364341085857 0.8306686046511628
bedrooms_per_room 0.0016957364341085857 0.8306686046511628
population_per_household 0.0009689922480620172 0.8313953488372093
ocean_proximity_<1H OCEAN 0.00024224806201555982 0.8321220930232558
ocean_proximity_INLAND 0.0007267441860465684 0.8316375968992248
ocean_proximity_ISLAND 0.0004844961240310086 0.8318798449612403
ocean_proximity_NEAR BAY 0.0021802325581395943 0.8301841085271318
ocean_proximity_NEAR OCEAN 0.0014534883720930258 0.8309108527131783


In [33]:
features[np.where(res == min(res))[0][0]]

'total_rooms'

# 6

In [50]:
df = df.assign(median_house_value = np.log10(df['median_house_value']))

y = df['above_average']
X = df.drop(['above_average', 'median_house_value'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

OH_cols_train = pd.DataFrame(ohe.fit_transform(X_train[['ocean_proximity']]), columns=ohe.get_feature_names_out())
OH_cols_test = pd.DataFrame(ohe.transform(X_test[['ocean_proximity']]), columns=ohe.get_feature_names_out())
OH_cols_valid = pd.DataFrame(ohe.transform(X_val[['ocean_proximity']]), columns=ohe.get_feature_names_out())

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index
OH_cols_valid.index = X_val.index

# Remove categorical columns (will replace with one-hot encoding)
numeric_X_train = X_train.drop('ocean_proximity', axis=1)
numeric_X_test = X_test.drop('ocean_proximity', axis=1)
numeric_X_valid = X_val.drop('ocean_proximity', axis=1)

# Add one-hot encoded columns to numerical features
X_train = pd.concat([numeric_X_train, OH_cols_train], axis=1)
X_test = pd.concat([numeric_X_test, OH_cols_test], axis=1)
X_val = pd.concat([numeric_X_valid, OH_cols_valid], axis=1)

In [52]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [58]:
scores = []
alphas = [0, 0.01, 0.1, 1, 10]
for a in alphas:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append(score)
    print(a, round(score, 3))

0 0.468
0.01 0.468
0.1 0.468
1 0.468
10 0.468


In [63]:
print('The best alpha parameter is:',  alphas[np.where(scores == min(scores))[0][0]])

The best alpha parameter is: 0
