In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [15]:
df = pd.read_csv('housing.csv')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20637 non-null  float64
 3   total_rooms         20635 non-null  float64
 4   total_bedrooms      20429 non-null  float64
 5   population          20628 non-null  float64
 6   households          20630 non-null  float64
 7   median_income       20631 non-null  float64
 8   median_house_value  20632 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [17]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [18]:
df[df.isnull().any(axis=1)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
290,-122.16,37.77,47.0,1256.0,,570.0,218.0,4.3750,161900.0
341,-122.17,37.75,38.0,992.0,,732.0,259.0,1.6196,85100.0
532,-122.27,37.78,52.0,1408.0,,718.0,265.0,2.6806,207900.0
538,-122.28,37.78,29.0,5154.0,,3741.0,1273.0,2.5762,173400.0
563,-122.24,37.75,45.0,891.0,,384.0,146.0,4.9489,247100.0
...,...,...,...,...,...,...,...,...,...
20268,-119.18,34.19,19.0,2393.0,,1938.0,762.0,1.6953,167400.0
20372,-118.88,34.17,15.0,4260.0,,1701.0,669.0,5.1033,410700.0
20460,-118.75,34.29,17.0,5512.0,,2734.0,814.0,6.6073,258100.0
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0


In [19]:
from sklearn.neighbors import KNeighborsRegressor

# function that imputes a dataframe
def impute_knn(df):

    ''' inputs: pandas df containing feature matrix '''
    ''' outputs: dataframe with NaN imputed '''
    # imputation with KNN unsupervised method

    # separate dataframe into numerical/categorical
    ldf = df.select_dtypes(include=[np.number])           # select numerical columns in df
    ldf_putaside = df.select_dtypes(exclude=[np.number])  # select categorical columns in df
    # define columns w/ and w/o missing data
    cols_nan = ldf.columns[ldf.isna().any()].tolist()         # columns w/ nan
    cols_no_nan = ldf.columns.difference(cols_nan).values     # columns w/o nan

    for col in cols_nan:
        imp_test = ldf[ldf[col].isna()]   # indicies which have missing data will become our test set
        imp_train = ldf.dropna()          # all indicies which which have no missing data
        model = KNeighborsRegressor(n_neighbors=5)  # KNR Unsupervised Approach
        knr = model.fit(imp_train[cols_no_nan], imp_train[col])
        ldf.loc[df[col].isna(), col] = knr.predict(imp_test[cols_no_nan])

    return pd.concat([ldf,ldf_putaside],axis=1)

In [20]:
# Call function that imputes missing data
df2 = impute_knn(df)
# looks like we have a full feature matrix
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


##Splitting The Data into Train and Test

In [25]:
X = df2.drop('median_house_value', axis=1)
y = df2['median_house_value']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

##Using Simple Linear Regression to create the model

In [27]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Evaluate the model using Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE on test set:", rmse)

RMSE on test set: 71022.46110700462


##Using Random Forest to create Model

In [31]:
randomForest_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor())
])

randomForest_pipeline.fit(X_train, y_train)

y_pred = randomForest_pipeline.predict(X_test)

# Evaluate the model using Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Random Forest RMSE on test set:", rmse)

Random Forest RMSE on test set: 49580.06947285523


Since RMSE on Random Forest was lesser (\$50,000 Random Forest < $70,000
 Linear Regression) it will be utilised

In [33]:
import joblib

joblib.dump(randomForest_pipeline, 'rf_model.pkl')

['rf_model.pkl']