## California Housing Prices 

* Data link https://www.kaggle.com/datasets/camnugent/california-housing-prices 

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = 'data/housing.csv'
df = pd.read_csv(data)

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df.shape

(20640, 10)

## Data preparation

* Select only the features from above and fill in the missing values with 0.
* Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
* Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
* Create a new column population_per_household by dividing the column population by the column households from dataframe.

In [5]:
features = ['latitude', 'longitude', 'housing_median_age',
            'total_rooms', 'total_bedrooms', 'population',
            'households', 'median_income', 'median_house_value',
            'ocean_proximity']

In [6]:
df = df[features]

In [7]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Only column 'total_bedrooms' contains missing values. Fill missing values by 0

In [8]:
df = df.fillna(0)

In [9]:
df['rooms_per_household'] = df.total_rooms / df.households

In [10]:
df['population_per_household'] = df.population / df.households

In [11]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,2.181467


## Question 1

* What is the most frequent observation (mode) for the column ocean_proximity?

In [12]:
ocean_proximity_mode = df.ocean_proximity.mode()[0]
ocean_proximity_mode

'<1H OCEAN'

## Split the data

* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value (median_house_value) is not in your dataframe.

In [13]:
from sklearn.model_selection import train_test_split 

In [14]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state=42)

In [15]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)

In [16]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [17]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
y_train = df_train.median_house_value
y_val = df_val.median_house_value
y_test = df_test.median_house_value

In [19]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [20]:
df_train.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
0,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.92246,3.754011
1,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,2.789082
2,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,2.540059
3,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,2.059016
4,37.52,-122.3,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,2.516456


## Question 2

* Create the correlation matrix for the numerical features of your train dataset.
* In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?

In [21]:
numerical = ['latitude', 'longitude', 'housing_median_age',
            'total_rooms', 'total_bedrooms', 'population',
            'households', 'median_income']

In [30]:
num_numerical = len(numerical)
corr_matrix = np.zeros((num_numerical, num_numerical))
for i in range(num_numerical):
    for j in range(num_numerical):
        corr_matrix[i, j] = df_train[numerical[i]].corr(df_train[numerical[j]])

In [34]:
position = abs(corr_matrix - np.eye(num_numerical)).argmax()

In [35]:
numerical[position // num_numerical], numerical[position % num_numerical]

('total_bedrooms', 'households')

In [36]:
# another approach 
df_train[numerical].corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0


In [37]:
corr_matr = df_train[numerical].corr().abs().unstack()
corr_matr[corr_matr == 1] = 0
corr_matr.sort_values(ascending = False).drop_duplicates().head(1)

households  total_bedrooms    0.979399
dtype: float64

## Make median_house_value binary
* We need to turn the median_house_value variable from numeric into binary.
* Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [39]:
mean_med_val = df.median_house_value.mean()
above_average_train = (y_train > mean_med_val).astype(int)
above_average_val = (y_val > mean_med_val).astype(int)
above_average_test = (y_test > mean_med_val).astype(int)

In [41]:
mean_med_val

206855.81690891474

In [44]:
y_train.mean()

206807.7419250646

## Question 3

* Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
* What is the value of mutual information?
* Round it to 2 decimal digits using round(score, 2)

In [45]:
from sklearn.metrics import mutual_info_score

In [46]:
categorical = ['ocean_proximity']

In [47]:
score = mutual_info_score(df_train[categorical[0]], above_average_train)
round(score, 2)

0.1

## Question 4

* Now let's train a logistic regression
* Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
* Fit the model on the training dataset.
* To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [49]:
from sklearn.feature_extraction import DictVectorizer

In [50]:
train_dicts = df_train[numerical + categorical].to_dict(orient='records')

In [51]:
train_dicts[0]

{'latitude': 34.43,
 'longitude': -119.67,
 'housing_median_age': 39.0,
 'total_rooms': 1467.0,
 'total_bedrooms': 381.0,
 'population': 1404.0,
 'households': 374.0,
 'median_income': 2.3681,
 'ocean_proximity': '<1H OCEAN'}

In [52]:
dv = DictVectorizer(sparse=False)

In [53]:
X_train = dv.fit_transform(train_dicts)

In [54]:
X_train

array([[ 374.  ,   39.  ,   34.43, ..., 1404.  ,  381.  , 1467.  ],
       [ 806.  ,   24.  ,   33.74, ..., 2248.  ,  794.  , 6097.  ],
       [ 337.  ,   41.  ,   39.13, ...,  856.  ,  309.  , 1317.  ],
       ...,
       [ 602.  ,   18.  ,   32.74, ..., 1952.  ,  611.  , 3341.  ],
       [ 350.  ,   16.  ,   33.84, ...,  730.  ,  354.  , 1545.  ],
       [ 215.  ,   35.  ,   33.91, ...,  640.  ,  197.  ,  940.  ]])

In [55]:
dv.get_feature_names_out()

array(['households', 'housing_median_age', 'latitude', 'longitude',
       'median_income', 'ocean_proximity=<1H OCEAN',
       'ocean_proximity=INLAND', 'ocean_proximity=ISLAND',
       'ocean_proximity=NEAR BAY', 'ocean_proximity=NEAR OCEAN',
       'population', 'total_bedrooms', 'total_rooms'], dtype=object)

In [64]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [65]:
model.fit(X_train, above_average_train)

In [86]:
model.classes_

array([0, 1])

In [66]:
model.coef_[0].round(3)

array([ 0.004,  0.036,  0.12 ,  0.088,  1.205,  0.472, -1.744,  0.018,
        0.297,  0.874, -0.002,  0.002, -0.   ])

In [67]:
model.intercept_[0]

-0.08287204140687451

Validation of the model

In [80]:
val_dicts = df_val[numerical + categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [81]:
y_pred = model.predict(X_val)

In [87]:
original_accuracy = (above_average_val == y_pred).mean()
round(original_accuracy, 2)

0.84

## Question 5

* Let's find the least useful feature using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
* Which of following feature has the smallest difference?

total_rooms
total_bedrooms
population
households


In [85]:
features_list = ['households', 'housing_median_age', 'latitude', 'longitude',
       'median_income', 'ocean_proximity=<1H OCEAN',
       'ocean_proximity=INLAND', 'ocean_proximity=ISLAND',
       'ocean_proximity=NEAR BAY', 'ocean_proximity=NEAR OCEAN',
       'population', 'total_bedrooms', 'total_rooms']

In [91]:
accuracy_score = np.zeros(len(features_list))
for i in range(len(features_list)):
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    X_train_part = np.delete(X_train, i, 1)
    X_val_part = np.delete(X_val, i, 1)
    model.fit(X_train_part, above_average_train)
    y_pred = model.predict(X_val_part)
    acc_score = (above_average_val == y_pred).mean()
    accuracy_score[i] = acc_score

In [103]:
accuracy_difference = abs(accuracy_score - original_accuracy)
accuracy_difference

array([0.00436047, 0.00557171, 0.00169574, 0.00193798, 0.05256783,
       0.00072674, 0.00193798, 0.00072674, 0.0004845 , 0.00072674,
       0.01550388, 0.00072674, 0.00096899])

In [121]:
list_of_diffs = np.array(list(zip(features_list, accuracy_difference)))

In [116]:
chosen = ['total_rooms', 'total_bedrooms', 'population', 'households']
ind = []
for i in chosen:
    ind.append(features_list.index(i))

In [127]:
chosen_diffs = list_of_diffs[ind]

In [128]:
chosen_diffs[chosen_diffs[:, 1].argsort()]

array([['total_bedrooms', '0.0007267441860464574'],
       ['total_rooms', '0.0009689922480620172'],
       ['households', '0.0043604651162790775'],
       ['population', '0.015503875968992276']], dtype='<U32')

## Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
* This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

In [134]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [132]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)
y_test_log = np.log1p(y_test)

In [135]:
alphas = [0, 0.01, 0.1, 1, 10]
rmse = []
for alpha in alphas:
    model = Ridge(alpha=alpha, solver="sag", random_state=42)
    model.fit(X_train, y_train_log)
    y_pred = model.predict(X_val)
    rmse.append( mean_squared_error(y_val, y_pred))

In [143]:
rmse

[57138138687.31905,
 57138138687.31905,
 57138138687.31913,
 57138138687.319855,
 57138138687.32708]

As we can see that the least rmse is when alpha = 0 or alpha = 0.01

In [144]:
alphas[np.array(rmse).argmin()]

0