In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Load the raw housing data; the relative path means housing.csv must be in the working directory.
data=pd.read_csv("housing.csv")

In [None]:
# Preview the first rows to see which columns are present.
data.head()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts (reveals columns with missing values).
data.info()

In [None]:
# Number of rows in each ocean_proximity category.
data['ocean_proximity'].value_counts()

In [None]:
%matplotlib inline
# One 50-bin histogram per numeric column of the raw data.
data.hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Purely random 80/20 split with a fixed seed.
# NOTE(review): train_set/test_set are not referenced again in the cells shown;
# the stratified split further down is the one actually used.
train_set,test_set= train_test_split(data,test_size=0.2, random_state=20)

In [None]:
# Bucket median_income into five labelled categories (1-5); used below as the stratification key.
data['income_cat']=pd.cut(data["median_income"],bins=[0.,1.5,3.0,4.5,6.,np.inf], labels=[1,2,3,4,5])

In [None]:
# Distribution of the income categories.
data['income_cat'].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split on income_cat so both sets keep the income distribution
# of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=20)

# n_splits=1, so there is exactly one (train, test) pair to unpack.
train_index, test_index = next(split.split(data, data['income_cat']))

# split() yields *positional* indices, so select with .iloc; .loc only gives the
# same rows while the DataFrame index happens to be the default 0..n-1 range.
# .copy() makes both sets independent frames that are safe to modify later.
strat_train_set = data.iloc[train_index].copy()
strat_test_set = data.iloc[test_index].copy()

In [None]:
# Share of each income category in the stratified test set.
strat_test_set['income_cat'].value_counts()/len(strat_test_set)

### Removing income_cat

In [None]:
# income_cat was only needed for stratification; remove it from both sets.
# Rebinding (instead of inplace=True) avoids mutating a frame derived from `data`,
# and errors="ignore" keeps the cell idempotent: re-running it no longer raises
# KeyError once the column is gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

### Visualizing Geographical Data

In [None]:
# Explore on a copy so the stratified training set itself stays untouched.
housing=strat_train_set.copy()

In [None]:
# Low alpha makes regions with many overlapping points stand out.
housing.plot(kind='scatter',x='longitude', y='latitude',alpha=0.1)

In [None]:
# Bubble map: marker size encodes population (scaled down by 100),
# marker colour encodes median_house_value on the 'jet' colormap.
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    figsize=(10, 7),
    alpha=0.4,
    s=housing['population'] / 100,
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
    label='population',
)
plt.legend()

### Correlation

In [None]:
# Keep only numeric columns before correlating: ocean_proximity is text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead of
# silently skipping them as older versions did.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# Correlation of each attribute with the target, largest first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for a hand-picked subset of attributes (the target plus three predictors).
attributes=['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12,8))

In [None]:
# Closer look at median_income against the target.
housing.plot(kind='scatter',x='median_income', y='median_house_value', alpha=0.1)

### Attribute Combinations

In [None]:
# Derive per-household and per-room ratios from the raw totals.
housing = housing.assign(
    rooms_per_household=housing['total_rooms'] / housing['households'],
    bedrooms_per_room=housing['total_bedrooms'] / housing['total_rooms'],
    population_per_household=housing['population'] / housing['households'],
)

In [None]:
# Recompute the correlations now that the ratio attributes exist. Only numeric
# columns are used: ocean_proximity is text, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

#### revert to clean training set
###### separate the predictors from the labels, since we don't want to apply the same transformations to both

In [None]:
# Predictors: every training column except the target.
housing = strat_train_set.drop(columns=['median_house_value'])
# Labels: an independent copy of the target column.
housing_labels = strat_train_set['median_house_value'].copy()

### Data Cleaning

In [None]:
from sklearn.impute import SimpleImputer

# Imputer that will replace missing values with each column's median.
imputer=SimpleImputer(strategy='median')

In [None]:
# The median imputer only works on numeric columns, so set the text attribute aside.
housing_num = housing.drop(columns=["ocean_proximity"])

In [None]:
# Learn the per-column medians from the numeric training data.
imputer.fit(housing_num)

In [None]:
# Medians learned by the imputer, one per column of housing_num.
imputer.statistics_

In [None]:
# The same medians computed directly with pandas, for comparison with imputer.statistics_.
housing_num.median().values

In [None]:
# Fill the missing values with the learned medians.
X=imputer.transform(housing_num)

In [None]:
# Wrap the imputed values in a DataFrame again, reusing housing_num's column names and index.
housing_tr=pd.DataFrame(X,columns=housing_num.columns,index=housing_num.index)

In [None]:
# Double brackets keep ocean_proximity as a one-column DataFrame rather than a Series.
housing_cat=housing[['ocean_proximity']]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder   # to convert categories from text to numbers

# Map each ocean_proximity category to a numeric code and show the first ten encoded rows.
ordinal_encoder= OrdinalEncoder()
housing_cat_encoded=ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
# Categories found during fit; a category's position in the list is its encoded value.
ordinal_encoder.categories_

In [None]:
# apply one-hot encoding so the model does not read the ordinal codes above as an ordering between categories

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One binary column per ocean_proximity category.
cat_encoder=OneHotEncoder()
housing_cat_1hot=cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
# The result above is stored as a SciPy sparse matrix; toarray() converts it to a dense NumPy array.
housing_cat_1hot.toarray()

### custom transformers

In [None]:
from sklearn.base import BaseEstimator,TransformerMixin

# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably total_rooms, total_bedrooms, population and households
# in the housing column order -- confirm if the input columns ever change.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio attributes to a 2-D array of housing features.

    Always adds rooms-per-household and population-per-household; optionally
    adds bedrooms-per-room as well. The source columns are located through the
    module-level ``*_ix`` constants.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the bedrooms-per-room column after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` can convert to a
        2-D array (for example a DataFrame), so the positional slicing below
        works for both.
        """
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        extra_columns = [rooms_per_household, population_per_household]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # np.c_ turns each 1-D ratio into a column and stacks it to the right of X.
        return np.c_[(X, *extra_columns)]

# Try the transformer on the raw training values, here without the bedrooms_per_room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs= attr_adder.transform(housing.values)

## Feature Scaling

In [None]:
# Inspect the ranges of the numeric attributes before scaling them.
housing.describe()

### transformation Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Numeric preprocessing in order: median imputation, ratio attributes, then standardization.
num_pipeline=Pipeline([
                ('imputer', SimpleImputer(strategy="median")),
                ('attribs_adder', CombinedAttributesAdder()),
                ('std_scaler', StandardScaler())
])

# Fit the pipeline on the numeric training columns and transform them in one call.
housing_num_tr=num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups: every numeric column goes through num_pipeline,
# the single text column is one-hot encoded.
num_attribs = housing_num.columns.tolist()
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer(
    [
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

# Fit on the training predictors and build the model-ready matrix.
housing_prepared = full_pipeline.fit_transform(housing)

## Selecting and Training  a Model

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary linear regression on the prepared training data.
reg=LinearRegression()
reg.fit(housing_prepared,housing_labels)

In [None]:
# Take the first five training rows and push them through the already-fitted pipeline
# (transform only, no refit).
some_data=housing.iloc[:5]
some_labels= housing_labels.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)

In [None]:
# Compare the model's predictions with the true labels for those five rows.
print("Predictions:", reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

### Finding Root Mean Square Error

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model measured on the training set itself.
housing_predictions=reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_labels,housing_predictions)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

In [None]:
#the above is an example of model underfitting the training set

### Trying Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed (same value as the data splits above) so the fitted tree, and every
# score derived from it, is reproducible across kernel restarts.
tree_reg = DecisionTreeRegressor(random_state=20)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# RMSE of the decision tree measured on the same data it was trained on.
housing_predictions=tree_reg.predict(housing_prepared)
tree_mse=mean_squared_error(housing_labels,housing_predictions)
tree_rmse=np.sqrt(tree_mse)
tree_rmse

In [None]:
#the above might be the case of overfitting 

### Using Cross-Validation 

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation. The scorer returns *negative* MSE (higher is better),
# so negate the scores before taking the square root to get one RMSE per fold.
scores= cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error",cv=10)
tree_rmse_scores=np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print the given scores, then their mean and their standard deviation.

    `scores` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    print("scores: " + str(scores))
    print("mean score: " + str(scores.mean()))
    print("std: " + str(scores.std()))
    
# Per-fold RMSE of the decision tree, with mean and spread.
display_scores(tree_rmse_scores)
         

In [None]:
# the cross-validated error above is not good either: the decision tree is overfitting the training set

### Trying Random Forests

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forests are stochastic (bootstrap samples and random feature subsets), so fix
# the seed (same value as the data splits above) to make the scores reproducible.
forest_reg = RandomForestRegressor(random_state=20)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
# RMSE of the random forest measured on the training set itself.
housing_predictions=forest_reg.predict(housing_prepared)
forest_mse=mean_squared_error(housing_labels,housing_predictions)
forest_rmse=np.sqrt(forest_mse)
forest_rmse

In [None]:
# 10-fold cross-validation of the random forest; negate the negative-MSE scores to get RMSE per fold.
scores= cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error",cv=10)
forest_rmse_scores=np.sqrt(-scores)

In [None]:
# display_scores is already defined in the decision-tree cross-validation section;
# reuse it here instead of redefining an identical copy.
display_scores(forest_rmse_scores)

### Saving work

In [None]:
import joblib

### Fine Tuning Model 

#### grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 combinations with the default bootstrap setting, then
# 2 x 3 combinations with bootstrap disabled.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Fixed seed (same value as the data splits above): otherwise differences between
# parameter combinations are mixed with run-to-run randomness and best_params_
# can change between kernel restarts.
forest_reg = RandomForestRegressor(random_state=20)

# 5-fold CV per combination, scored with negative MSE; training scores are kept
# in cv_results_ as well.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# The estimator refitted with the best parameter combination.
grid_search.best_estimator_

In [None]:
# Full cross-validation results for every parameter combination tried.
cvres=grid_search.cv_results_

In [None]:
# Scores are negative MSE, so negate before the square root to print an RMSE per combination.
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

In [None]:
# Importance of each input feature according to the best forest.
feature_importance=grid_search.best_estimator_.feature_importances_
feature_importance

In [None]:
# Build the feature names in the order full_pipeline emits the columns: the numeric
# columns, the three ratio attributes appended by CombinedAttributesAdder (default
# settings), then the one-hot categories. Then list them by decreasing importance.
extra_attribs = ['rooms_per_hhold','pop_per_hhold','bedrooms_per_room']
cat_encoder= full_pipeline.named_transformers_['cat']
cat_one_hot_attribs= list(cat_encoder.categories_[0])
attributes=num_attribs+extra_attribs+cat_one_hot_attribs
sorted(zip(feature_importance,attributes), reverse=True)

### Evaluate Your System on the Test Set

In [None]:
# Use the best estimator found by the grid search as the final model.
final_model=grid_search.best_estimator_

In [None]:
# Split the held-out test set into predictors and labels.
X_test=strat_test_set.drop('median_house_value', axis=1)
Y_test=strat_test_set['median_house_value'].copy()

In [None]:
# transform only (no fit): the test data must go through the pipeline as fitted on the training set.
X_test_prepared=full_pipeline.transform(X_test)

In [None]:
# Predictions of the final model on the prepared test set.
final_predictions=final_model.predict(X_test_prepared)

In [None]:
# RMSE of the final model on the test set.
final_mse=mean_squared_error(Y_test, final_predictions)
final_rmse=np.sqrt(final_mse)
final_rmse

In [None]:
from scipy import stats

# 95% confidence interval for the test RMSE: a t-interval around the mean squared
# error (n-1 degrees of freedom, standard error of the mean as scale), followed by
# the square root of both bounds.
confidence= 0.95
squared_errors=(final_predictions - Y_test)**2
np.sqrt(stats.t.interval(confidence,len(squared_errors)-1,
                        loc=squared_errors.mean(),
                        scale=stats.sem(squared_errors)))