In [1]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
from sklearn.datasets import fetch_california_housing

In [28]:
# NOTE: despite the variable name, this holds the California housing dataset
# (the object returned by fetch_california_housing), not the Boston dataset.
boston=fetch_california_housing()

In [29]:
# List the top-level fields available on the object returned by fetch_california_housing().
boston.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [30]:
# Print the description text bundled with the dataset (attributes, source, target units).
print(boston.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [40]:
# Show the feature column names and the target name, one list per line.
print(boston.feature_names, boston.target_names, sep="\n")

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
['MedHouseVal']


In [32]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names.
df = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
)

In [36]:
# Attach the regression target to the feature frame as a `price` column.
# (A leading `df.head()` was removed: it was not the cell's last expression,
# so its result was computed and silently discarded.)
df['price']=boston.target

In [37]:
# Preview the first rows of df, which now includes the `price` column.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,quality,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,3.422


In [65]:
## from here we can check correlation between two variables
# `quality` is not created by any cell visible in this notebook (it only appears in
# saved output from an earlier kernel state), so an unconditional drop raises
# KeyError on a fresh Restart & Run All. errors='ignore' makes the cleanup a no-op
# when the column is absent, and rebinding instead of inplace=True keeps the cell
# safe to re-run.
df=df.drop(columns=['quality'],errors='ignore')
# Keep the correlation matrix as the last expression so the cell actually displays it.
df.corr()

In [66]:
### independent and dependent features
# Target is the `price` column; every remaining column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

In [67]:
### test and train
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [68]:
## feature scaling and feature transformation
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [69]:
## Model creation
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train_scaled, y_train)
# Bare expression so the cell still displays the fitted estimator.
model


In [70]:
## model testing
# Predict targets for the scaled test features; y_test is not used here and is
# only compared against these predictions in the scoring cells.
y_pred=model.predict(X_test_scaled)


In [71]:
## Model accuracy score
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

# Print the mean squared error first, then the mean absolute error (unlabelled, one per line).
for metric in (mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))


0.5305677824766752
0.5272474538305955


In [72]:
# R^2 (coefficient of determination) of the predictions against the held-out test targets.
score=r2_score(y_test,y_pred)

In [73]:
# Display the stored r2_score result.
score

0.5957702326061664

In [74]:
## new data prediction
## reshape the data from one dimension to two dimensions
# A single row of boston.data is 1-D; inspect its shape before reshaping it for the model.
boston.data[0].shape

(8,)

In [77]:

# Take the first sample as a one-row DataFrame: slicing with [:1] keeps it 2-D
# (1 row x 8 features), and the column labels match the ones the scaler was
# fitted with. Passing a bare ndarray to a scaler fitted on a DataFrame makes
# scikit-learn warn that "X does not have valid feature names".
data_point = pd.DataFrame(boston.data[:1], columns=boston.feature_names)
# Apply the already-fitted training-set scaling; do not refit on new data.
scaler_data=sc.transform(data_point)



In [79]:
# Predict for the single scaled sample; the result is a length-1 array.
model.predict(scaler_data)

array([4.14333441])

In [80]:
## Pickling the model for deployment
import pickle

In [82]:
import pickle

filename="house_price.sav"
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open() previously left the handle unclosed.
with open(filename,'wb') as model_file:
    pickle.dump(model,model_file)
# NOTE(review): only the regressor is saved. It was trained on features scaled by
# `sc`, so whoever loads this file must apply the same fitted scaler to raw inputs;
# consider persisting `sc` alongside the model.