In [1]:
from pathlib import Path
import sys
import os
import re
import warnings
import scipy 
from collections import Counter
from datetime import datetime
from dateutil import relativedelta

import pandas as pd
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score
from configparser import ConfigParser
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


%load_ext autoreload
%autoreload 

In [2]:
housing_df = pd.read_csv('housing.csv')

In [3]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
housing_df_marked = housing_df.fillna(-1)

In [5]:
housing_df_marked[housing_df_marked['total_bedrooms'] != -1]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [6]:
def adjusted_r2(r2, y, X):
    return 1 - ( 1- r2 ) * ( len(y) - 1 ) / ( len(y) - X.shape[1] - 1 )

## There are rows with null value for total_bedrooms feature. We need to fix it

### Approach 1. Remove these rows

In [7]:
housing_df_nulls_skipped = housing_df_marked[housing_df_marked['total_bedrooms'] != -1]

### Approach 2. Fill it with some default value, for example average value for this column

In [8]:
def impute_missing_values(df, imputer):
    imputer.fit(df['total_bedrooms'].values.reshape(-1, 1))
    total_bedrooms_imputed = imputer.transform(df['total_bedrooms'].values.reshape(-1, 1))
    housing_df_with_impute = df.copy()
    housing_df_with_impute['total_bedrooms'] = total_bedrooms_imputed
    housing_df_with_impute['total_bedrooms'] = housing_df_with_impute['total_bedrooms'].apply(lambda x: int(x))
    return housing_df_with_impute

In [9]:
housing_df_with_mean = impute_missing_values(housing_df, SimpleImputer(missing_values=np.nan, strategy='mean'))
housing_df_with_most_freq = impute_missing_values(housing_df, SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
housing_df_with_knn = impute_missing_values(housing_df, KNNImputer())

In [10]:
housing_df_marked[housing_df_marked.total_bedrooms == -1]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
290,-122.16,37.77,47.0,1256.0,-1.0,570.0,218.0,4.3750,161900.0,NEAR BAY
341,-122.17,37.75,38.0,992.0,-1.0,732.0,259.0,1.6196,85100.0,NEAR BAY
538,-122.28,37.78,29.0,5154.0,-1.0,3741.0,1273.0,2.5762,173400.0,NEAR BAY
563,-122.24,37.75,45.0,891.0,-1.0,384.0,146.0,4.9489,247100.0,NEAR BAY
696,-122.10,37.69,41.0,746.0,-1.0,387.0,161.0,3.9063,178400.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20267,-119.19,34.20,18.0,3620.0,-1.0,3171.0,779.0,3.3409,220500.0,NEAR OCEAN
20268,-119.18,34.19,19.0,2393.0,-1.0,1938.0,762.0,1.6953,167400.0,NEAR OCEAN
20372,-118.88,34.17,15.0,4260.0,-1.0,1701.0,669.0,5.1033,410700.0,<1H OCEAN
20460,-118.75,34.29,17.0,5512.0,-1.0,2734.0,814.0,6.6073,258100.0,<1H OCEAN


In [11]:
housing_df.iloc[290]

longitude              -122.16
latitude                 37.77
housing_median_age        47.0
total_rooms             1256.0
total_bedrooms             NaN
population               570.0
households               218.0
median_income            4.375
median_house_value    161900.0
ocean_proximity       NEAR BAY
Name: 290, dtype: object

In [12]:
housing_df_with_mean.iloc[290]

longitude              -122.16
latitude                 37.77
housing_median_age        47.0
total_rooms             1256.0
total_bedrooms             537
population               570.0
households               218.0
median_income            4.375
median_house_value    161900.0
ocean_proximity       NEAR BAY
Name: 290, dtype: object

In [13]:
housing_df_with_most_freq.iloc[290]

longitude              -122.16
latitude                 37.77
housing_median_age        47.0
total_rooms             1256.0
total_bedrooms             280
population               570.0
households               218.0
median_income            4.375
median_house_value    161900.0
ocean_proximity       NEAR BAY
Name: 290, dtype: object

In [14]:
housing_df_with_knn.iloc[290]

longitude              -122.16
latitude                 37.77
housing_median_age        47.0
total_rooms             1256.0
total_bedrooms             537
population               570.0
households               218.0
median_income            4.375
median_house_value    161900.0
ocean_proximity       NEAR BAY
Name: 290, dtype: object

### Approach 3. Let's predict the missing values

In [11]:
bedrooms = housing_df_nulls_skipped['total_bedrooms']
total_rooms = housing_df_nulls_skipped[['total_rooms', 'population', 'median_house_value', 'households', 'median_income']]
X_bed_train, X_bed_test, y_bed_train, y_bed_test = train_test_split(total_rooms, bedrooms, test_size=0.33, random_state=42)
bedroom_regression = LinearRegression()
bedroom_regression.fit(X_bed_train, y_bed_train)
bed_predict = bedroom_regression.predict(X_bed_test)
r2 = r2_score(y_bed_test, bed_predict)
adjusted_r2_score = adjusted_r2(r2, y_bed_test, X_bed_test)
adjusted_r2_score

0.9731269383588343

In [12]:
house_df_without_beds = housing_df_marked[housing_df_marked.total_bedrooms == -1]
bed_predictors = house_df_without_beds[['total_rooms', 'population', 'median_house_value', 'households', 'median_income']]
missed_bedrooms = bedroom_regression.predict(bed_predictors)

In [13]:
house_df_without_beds['total_bedrooms'] = missed_bedrooms 

In [14]:
house_df_full = pd.concat([house_df_without_beds, housing_df_nulls_skipped])
house_df_full['total_bedrooms'] = house_df_full['total_bedrooms'].apply(lambda x: int(x))

## Let's try different regressions on our data

In [15]:
def prepare_data_and_fit(df, regression):
    y = df['median_house_value']
    ocean_proximity = pd.get_dummies(df['ocean_proximity'])
    X_clean = pd.concat([df, ocean_proximity], axis=1).drop(columns=['ocean_proximity', 'median_house_value'   ])
    X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.33, random_state=42)
    regr = make_pipeline(StandardScaler(), regression)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, y_test, X_test)
    mse = mean_squared_error(y_test, y_pred)
    return adj_r2, mse

In [16]:
def prepare_data_and_fit_with_grid(df, regression, param_grid):
    y = df['median_house_value']
    ocean_proximity = pd.get_dummies(df['ocean_proximity'])
    X_clean = pd.concat([df, ocean_proximity], axis=1).drop(columns=['ocean_proximity', 'median_house_value'   ])
    X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.33, random_state=42)
    pipe = make_pipeline(StandardScaler(), regression)
    search = GridSearchCV(pipe, param_grid, n_jobs=2)
    search.fit(X_train, y_train)
    y_pred = search.best_estimator_.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, y_test, X_test)
    return adj_r2, mse, search.best_params_

In [36]:
def try_diff_models(housing_data_set):
    svr_grid = {
        'svr__C': [  100000, 100000],
        'svr__epsilon': [0.0001, 0.00001]
    }
    svr = SVR()
    svr_rbf_r2, svr_rbf_mse, best_params = prepare_data_and_fit_with_grid(housing_data_set, svr, svr_grid)
    print(f"SVR RBF: adjusted r^2: {svr_rbf_r2}, MSE: {svr_rbf_mse}, best params: {best_params}")

    svr_linear_grid = {
        'linearsvr__C': [  100000, 100000],
        'linearsvr__epsilon': [0.0001, 0.00001]
    }
    linearsvr = LinearSVR()
    svr_lin_r2, svr_lin_mse, best_params = prepare_data_and_fit_with_grid(housing_data_set, linearsvr, svr_linear_grid)
    print(f"SVR Linear: adjusted r^2: {svr_rbf_r2}, MSE: {svr_rbf_mse}, best params: {best_params}")
    
    lin_reg_r2, lin_reg_mse = prepare_data_and_fit(housing_data_set, LinearRegression())
    print(f"Linear regression: adjusted r^2: {lin_reg_r2}, MSE: {lin_reg_mse}")
    
    ridge_param_grid = {
        "ridge__alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    }
    ridge = Ridge()
    ridge_r2, ridge_mse, best_params = prepare_data_and_fit_with_grid(housing_data_set, ridge, ridge_param_grid)
    print(f"Ridge: adjusted r^2: {ridge_r2}, MSE: {ridge_mse}, best params: {best_params}")
    
    alpas = [0.0001, 0.001, 0.01, 0.1, 1]
    lasso_param_grid = {
        "lasso__alpha": alpas,
    }
    lasso = Lasso()
    lasso_r2, lasso_mse, best_params = prepare_data_and_fit_with_grid(housing_data_set, lasso, lasso_param_grid)
    print(f"Lasso: adjusted r^2: {lasso_r2}, MSE: {lasso_mse}, best params: {best_params}")

In [41]:
housing_datasets = {
    'null_skipped': housing_df_nulls_skipped, 
    'regressed_values': house_df_full, 
    'knn_impute': housing_df_with_knn, 
    'mean_impute': housing_df_with_mean,
    'most_freq_impute': housing_df_with_most_freq}

In [43]:
%%time
for h in housing_datasets.keys():
    print(f"{h} results:")
    try_diff_models(housing_datasets[h])
    print('*'*100)

null_skipped results:
SVR RBF: adjusted r^2: 0.7545609342051872, MSE: 3248084908.6048865, best params: {'svr__C': 100000, 'svr__epsilon': 0.0001}
SVR Linear: adjusted r^2: 0.7545609342051872, MSE: 3248084908.6048865, best params: {'linearsvr__C': 100000, 'linearsvr__epsilon': 0.0001}
Linear regression: adjusted r^2: 0.6558299230676682, MSE: 4554668708.737059
Ridge: adjusted r^2: 0.6558234390293949, MSE: 4554754517.028392, best params: {'ridge__alpha': 1}
Lasso: adjusted r^2: 0.6558276971309164, MSE: 4554698166.278984, best params: {'lasso__alpha': 1}
****************************************************************************************************
regressed_values results:
SVR RBF: adjusted r^2: 0.7590501133964429, MSE: 3231476996.7973084, best params: {'svr__C': 100000, 'svr__epsilon': 1e-05}
SVR Linear: adjusted r^2: 0.7590501133964429, MSE: 3231476996.7973084, best params: {'linearsvr__C': 100000, 'linearsvr__epsilon': 0.0001}
Linear regression: adjusted r^2: 0.6478385762722201, M