In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tarfile
import urllib

# Getting data: remote location of the archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL the ``housing.tgz`` archive is downloaded from.
    housing_path : str
        Local directory that receives the archive and its extracted
        contents; it is created when it does not exist yet.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

# Make sure the archive is on disk before unpacking it: on a fresh checkout
# nothing has been downloaded yet and tarfile.open() would fail.
tgz_path = os.path.join(HOUSING_PATH, "housing.tgz")
if not os.path.isfile(tgz_path):
    # fetch_housing_data() downloads the archive and extracts it as well.
    fetch_housing_data()
else:
    # Archive already present: extract it, closing the handle even on error.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(HOUSING_PATH)

#Retrieving the data after download
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
    
# Housing data read from the default location (HOUSING_PATH).
housing = load_housing_data()

In [None]:
#Data Visualization
# Histograms of the housing columns, 50 bins each, on a 20x15 figure.
housing.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
#Splitting data into train/test sets
#Random Splitting
def split_train_test(data,ratio,seed=None):
    """Randomly split ``data`` into a test part and a train part.

    Parameters
    ----------
    data : pandas object supporting ``len()`` and ``.iloc``
        Rows to split.
    ratio : float
        Fraction of the rows, between 0 and 1, assigned to the test part.
    seed : int, optional
        Seed for a private random generator, making the split repeatable.
        When omitted, NumPy's global random state is used, exactly as
        before this parameter existed.

    Returns
    -------
    tuple
        ``(test_rows, train_rows)`` -- note that the test part comes first.

    Raises
    ------
    ValueError
        If ``ratio`` is not within ``[0, 1]``; a negative ratio would
        otherwise silently produce a wrong split through negative slicing.
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got %r" % (ratio,))

    n_rows = len(data)
    if seed is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # A dedicated generator keeps the global random state untouched.
        shuffled_indices = np.random.RandomState(seed).permutation(n_rows)

    test_set_size = int(n_rows * ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[test_indices], data.iloc[train_indices]


# Seed NumPy's global generator so the random split is the same on every run.
np.random.seed(42)
# split_train_test returns the test part first, then the train part.
housing_test, housing_train = split_train_test(housing, .2)

In [None]:
#Splitting by Index
from zlib import crc32
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data,test_ratio,id_colum):
    """Split ``data`` into ``(train, test)`` based on each row's identifier.

    ``id_colum`` names the column that holds the identifiers. A row goes to
    the test part when ``test_set_check`` returns True for its identifier
    and ``test_ratio``; every other row goes to the train part.
    """
    def _goes_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_colum].apply(_goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part
    
# The 'index' column of the re-indexed frame serves as the row identifier.
housing_indexed = housing.reset_index()
train_set, test_set = split_train_test_by_id(
    housing_indexed, test_ratio=.2, id_colum='index'
)

In [None]:
#Basic data features
# Scatter of latitude against longitude: marker size is population/100 and
# marker colour is median_house_value on the 'jet' colormap.
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=.4,
    s=housing['population']/100,
    label='population',
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
    figsize=(10,7),
)

In [None]:
# Correlations between every column except ocean_proximity.
corr_matrix = housing.drop(columns='ocean_proximity').corr()

In [None]:
# Correlation of each column with median_house_value, largest first.
corr_matrix.loc[:, 'median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for a handful of selected columns.
l = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'latitude',
]
scatter_matrix(housing[l], figsize=(12, 9))

In [None]:
# median_house_value against median_income; low alpha so dense regions stand out.
housing.plot.scatter(x='median_income', y='median_house_value', alpha=0.1)

In [None]:
#Data preprocessing
# Drop rows with any missing value, then separate the features from the target.
housing_train_set = train_set.dropna()
# NOTE: this rebinds housing_train, which earlier held the train part of the
# random split.
housing_train = housing_train_set.drop('median_house_value', axis=1)
housing_labels = housing_train_set['median_house_value'].copy()
# Missing values were already removed above, so this dropna() removes nothing.
housing_num_train = housing_train.dropna()
# Take the categorical column from the training rows rather than the full
# dataset, so it lines up row-for-row with housing_train and housing_labels.
housing_cat = housing_train[['ocean_proximity']]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the ocean_proximity categories as ordinal codes.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
# Model features: everything except the id, the coordinates and the category column.
prepared = housing_num_train.drop(
    columns=['index', 'longitude', 'latitude', 'ocean_proximity']
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# housing_num, num_pipeline and OneHotEncoder were referenced here without
# being defined or imported anywhere, so the cell failed on a fresh kernel.

# Numeric feature columns: everything except the categorical column and the
# target, so the target never ends up among the prepared features.
housing_num = housing.drop(columns=["ocean_proximity", "median_house_value"])

# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
# NOTE(review): the transformer is fitted on the whole `housing` frame, test
# rows included -- confirm whether it should be fitted on training rows only.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
from sklearn.preprocessing import StandardScaler

# Scaler applied to the selected training features in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the selected training features and scale them in one step.
std_housing_num_train = scaler.fit_transform(prepared)

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression on the scaled training features against the training labels.
lin_reg = LinearRegression()
lin_reg.fit(std_housing_num_train, housing_labels)

In [None]:
#Data Cleaning & Preparation
#Pipeline
#Make custom transformers: num_scaling, category
#Model Selection and Fitting


In [None]:
# Reuse the loader instead of repeating the CSV location as a hard-coded
# path; it reads housing.csv from HOUSING_PATH.
housing_data = load_housing_data()

In [None]:
# Histograms of the housing_data columns, 20 bins each, on a 20x15 figure.
housing_data.hist(figsize=(20, 15), bins=20)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix the estimator's random_state so repeated runs build the same tree.
tre_reg = DecisionTreeRegressor(random_state=42)

In [None]:
# X and y were never defined in this notebook; train on the same scaled
# training features and labels that lin_reg was fitted on.
# NOTE(review): confirm this is the intended training data for the tree.
tre_reg.fit(std_housing_num_train, housing_labels)