In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [3]:
import joblib

In [4]:
from sklearn.datasets import fetch_california_housing

In [5]:
# Fetch the California housing dataset through scikit-learn. The returned
# object's .data, .feature_names, .target and .target_names attributes are
# used by the cells below.
housing_data = fetch_california_housing()

In [6]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the
# dataset's own feature names.
data = pd.DataFrame(
    data=housing_data.data,
    columns=housing_data.feature_names,
)

In [7]:
# Eyeball five randomly chosen rows of the feature table.
data.sample(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
857,2.9706,36.0,4.184783,1.021739,295.0,3.206522,37.6,-122.09
17282,5.4127,25.0,6.041791,0.931343,907.0,2.707463,34.42,-119.73
9090,4.9,17.0,6.501706,1.129693,1867.0,3.186007,34.66,-118.22
12199,3.235,12.0,6.09465,1.090535,757.0,3.115226,33.64,-117.19
3836,5.0,37.0,4.368876,0.991354,983.0,2.832853,34.19,-118.44


In [8]:
# Display the target name(s) that come with the dataset object.
housing_data.target_names

['MedHouseVal']

In [9]:
# Attach the regression target to the feature table as a 'MedHouseVal' column.
data = data.assign(MedHouseVal=housing_data.target)

In [10]:
# Eyeball five random rows again, now including the 'MedHouseVal' column.
data.sample(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
4958,2.3942,42.0,4.329787,1.18617,511.0,2.718085,34.01,-118.29,1.232
18489,4.3227,16.0,5.711704,1.007745,3575.0,3.076592,37.01,-121.59,2.515
7229,1.9688,44.0,3.561404,1.093567,1175.0,3.435673,34.02,-118.16,1.739
17060,3.7463,30.0,4.976431,0.993266,1191.0,2.005051,37.49,-122.24,4.276
15129,2.5365,10.0,4.873518,1.104084,2310.0,3.043478,32.86,-116.91,1.395


In [11]:
# Remove the 'AveBedrms' feature before modelling.
# The result is rebound to `data` instead of using inplace=True, and
# errors='ignore' makes a repeated execution of this cell a no-op; the
# original in-place drop raised KeyError when the cell was run a second time.
data = data.drop(columns=['AveBedrms'], errors='ignore')

In [12]:
# Separate the target (y) from the predictor columns (X).
y = data['MedHouseVal']
X = data.drop(columns=['MedHouseVal'])

In [13]:
# Hold out 20% of the rows for evaluation; shuffling with a fixed seed keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=666,
)

In [14]:
# Preview the first five held-out rows.
X_test.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup,Latitude,Longitude
6015,3.7933,34.0,5.327869,870.0,4.754098,34.07,-117.79
16296,2.401,37.0,4.492582,1121.0,3.326409,37.97,-121.22
20597,2.1111,33.0,4.927273,888.0,2.306494,39.16,-121.58
11111,3.5329,31.0,4.5,1524.0,3.356828,33.84,-117.9
3300,2.3179,14.0,6.138298,638.0,2.262411,38.94,-122.62


In [15]:
# Same five rows as a plain NumPy array (no index or column labels).
X_test.head().to_numpy()

array([[   3.7933    ,   34.        ,    5.32786885,  870.        ,
           4.75409836,   34.07      , -117.79      ],
       [   2.401     ,   37.        ,    4.4925816 , 1121.        ,
           3.3264095 ,   37.97      , -121.22      ],
       [   2.1111    ,   33.        ,    4.92727273,  888.        ,
           2.30649351,   39.16      , -121.58      ],
       [   3.5329    ,   31.        ,    4.5       , 1524.        ,
           3.35682819,   33.84      , -117.9       ],
       [   2.3179    ,   14.        ,    6.13829787,  638.        ,
           2.26241135,   38.94      , -122.62      ]])

In [16]:
# Baseline model: a decision-tree regressor with default hyper-parameters.
# scikit-learn permutes the candidate features at every split, so without a
# seed the fitted tree (and its R² score) can differ between runs. Pinning
# random_state makes the result reproducible; the seed matches the one used
# for the train/test split. Any previously recorded score should be
# regenerated after this change.
dtree = DecisionTreeRegressor(random_state=666)

In [17]:
# Train the decision tree on the training split.
dtree.fit(X_train, y_train)

In [18]:
# Predict house values for the held-out rows with the fitted tree.
y_pred = dtree.predict(X_test)

In [19]:
from sklearn.metrics import r2_score

In [20]:
# Coefficient of determination (R²) of the tree's predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.5916859378859114

In [21]:
# Second model: k-nearest-neighbours regressor with k=9.
# n_jobs=-1 asks scikit-learn to use all available processors for the
# neighbour search.
knn = KNeighborsRegressor(n_neighbors=9, n_jobs=-1)

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
knn.fit(X_train, y_train)

In [25]:
y_pred = knn.predict(X_test)

In [26]:
# Coefficient of determination (R²) of the KNN predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.6722840202333993

In [27]:
# Persist the fitted scaler and the fitted KNN model as two separate joblib
# files. The model was trained on scaler-transformed features, so a consumer
# has to load both and apply the scaler before calling the model.
# NOTE(review): 'Calofornia' looks like a typo for 'California'. It is left
# unchanged because code outside this notebook presumably loads the file by
# this exact name — confirm before renaming.
joblib.dump(scaler, 'KNN_Standard_scaler.pkl')
joblib.dump(knn, 'KNN_Calofornia_housing.pkl')

['KNN_Calofornia_housing.pkl']