In [None]:
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

In [None]:
# Load the housing table from the relative path ./housing/housing.csv.
housing = pd.read_csv('./housing/housing.csv')

In [None]:
# Peek at the first rows to see the columns and some sample values.
housing.head()

In [None]:
# Column dtypes and non-null counts: a quick way to spot missing values.
housing.info()

In [None]:
# Number of rows in each ocean_proximity category.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics for the columns of the frame.
housing.describe()

In [None]:
# Histogram of every plottable column, 50 bins each, on a 20x15 canvas.
housing.hist(
    figsize=(20, 15),
    bins=50,
)
plt.show()

## split dataset

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
# X = housing.drop(['median_house_value'],axis=1)
# y = housing['median_house_value']

In [None]:
# Purely random 80/20 split; the fixed random_state keeps it repeatable.
train_set, test_set = train_test_split(
    housing,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Bucket median_income into five labelled categories (1-5) using fixed bin
# edges; the last bin is open-ended (up to +inf).
housing['income_cat'] = pd.cut(
    housing['median_income'],
    labels=[1, 2, 3, 4, 5],
    bins=[0, 1.5, 3.0, 4.5, 6, np.inf],
)

In [None]:
# Distribution of the income categories over the whole dataset.
housing['income_cat'].hist(bins = 20)

In [None]:
# Check the category assigned to the first rows.
housing['income_cat'].head()

# stratify data

### We need to stratify the data according to the median income category so that the test set has the same proportions of income categories as the whole dataset. Then, when we compare the income-category histogram of the whole dataset with that of the test set, the two will look similar.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# A single stratified 80/20 shuffle split, seeded for repeatability.
split = StratifiedShuffleSplit(
    n_splits=1,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Build the stratified train/test frames, stratifying on income_cat.
# split.split() yields *positional* row indices, so rows are selected with
# .iloc. Label-based .loc only gives the same rows while the index is the
# default 0..n-1 range and would silently pick wrong rows (or fail) otherwise.
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Fraction of the stratified test set that falls in each income category.
strat_test_set['income_cat'].value_counts().div(len(strat_test_set))

In [None]:
# Draw the same income_cat histogram for the whole dataset and for both
# stratified subsets so the category proportions can be compared by eye.
# A loop replaces three copy-pasted stanzas and makes sure every figure,
# including the last one, is finished with plt.show().
for title, frame in (
    ("whole dataset", housing),
    ("test", strat_test_set),
    ('train', strat_train_set),
):
    frame['income_cat'].hist(bins=20)
    plt.title(title)
    plt.show()

# explore data
## visualise for insights

In [None]:
# Explore a copy so that changes made below do not touch strat_train_set.
housing = strat_train_set.copy()

In [None]:
# Plain geographic scatter: one point per row, longitude vs. latitude.
housing.plot(kind='scatter',x='longitude',y='latitude')

In [None]:
# Same scatter with low alpha so areas with many overlapping points stand out.
housing.plot(kind='scatter',x='longitude',y='latitude',alpha = 0.1)

In [None]:
# Geographic scatter where marker size follows population and marker colour
# follows median_house_value (jet colormap, with a colorbar).
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=housing['population'] / 100,
    label='population',
    figsize=(15, 12),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

## correlations

In [None]:
# Pairwise correlations between the numeric columns only.
# The frame also holds the text column ocean_proximity (and the categorical
# income_cat); pandas >= 2.0 no longer drops non-numeric columns silently in
# corr() and raises instead, so select the numeric columns explicitly.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# Show the full correlation table.
corr_matrix

In [None]:
# Correlation of every column with median_house_value, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# Pairwise scatter plots for the target and three selected attributes.
attr = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]
scatter_matrix(housing[attr], figsize=(12, 8))

In [None]:
# Zoom in on median_income vs. median_house_value; low alpha reveals density.
housing.plot(kind = 'scatter',x='median_income',y='median_house_value',alpha = 0.1)

### yaya part start

In [None]:
import seaborn as sns 

In [None]:
# Heatmap view of the correlation matrix computed above.
sns.heatmap(corr_matrix)

In [None]:
# median_income vs. median_house_value, coloured by housing_median_age.
sns.scatterplot(
    data=housing,
    x='median_income',
    y='median_house_value',
    hue='housing_median_age',
)

### yaya part end

## attr. combinations

In [None]:
# Remind ourselves of the available columns before combining them.
housing.head()

In [None]:
# Derive three ratio attributes from the raw totals.
# (Column names are kept exactly as originally spelled.)
housing["rooms_per_houshold"] = housing['total_rooms'].div(housing['households'])
housing["bedrooms_per_room"] = housing['total_bedrooms'].div(housing['total_rooms'])
housing["population_per_houshold"] = housing['population'].div(housing['households'])

In [None]:
# Recompute the correlations now that the ratio attributes exist.
# Only numeric columns are used: the frame still contains the text column
# ocean_proximity (and the categorical income_cat), and pandas >= 2.0 raises
# on non-numeric columns in corr() instead of dropping them silently.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# Rank all columns, including the new ratios, by correlation with the target.
corr_matrix['median_house_value'].sort_values(ascending=False)

### let's revert for a clean dataset and then split to features and labels

In [None]:
# Start again from the untouched stratified training set:
# labels are the target column, features are everything else.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns="median_house_value")

# data cleaning

In [None]:
# Non-null counts per column show which attributes have missing values.
housing.info()

In [None]:
# First option for the NA values in total_bedrooms: fill them with the
# median of the column itself. Alternatively we could drop the rows that
# contain NA values, or drop the whole column.
total_bedrooms_median = housing["total_bedrooms"].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, which
# pandas warns about and which leaves `housing` unchanged under Copy-on-Write.
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(total_bedrooms_median)

In [None]:
# Confirm that total_bedrooms no longer has missing entries.
housing.info()

In [None]:
# Summary statistics after the median fill.
housing.describe()

In [None]:
# Reset features and labels from the stratified training set once more,
# this time to try imputation with scikit-learn.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [None]:
# After the reset the missing values should be visible again.
housing.info()

## Imputation using sklearn SimpleImputer

In [None]:
from sklearn.impute import SimpleImputer
# Imputer configured with the "median" strategy.
imputer = SimpleImputer(strategy="median")

In [None]:
# The median imputer only handles numerical values, so we need a frame without
# any categorical attributes. Selecting by dtype (rather than dropping
# ocean_proximity by name) also leaves out income_cat, the categorical helper
# column that was added only for stratification and is still present here.
housing_num = housing.select_dtypes(include="number")

In [None]:
# The imputer has a fit() method; call it on the whole numeric frame (housing_num).
imputer.fit(housing_num)

In [None]:
# Per-column statistics stored by fit() (medians, given strategy="median").
imputer.statistics_

In [None]:
# Sanity check of the imputer's work: compare these medians with imputer.statistics_.
housing_num.median().to_numpy()

In [None]:
# Now that the imputer is fitted, use its transform() method on the training frame.
X = imputer.transform(housing_num)

In [None]:
# The returned X is a plain NumPy array holding the transformed values.
X

In [None]:
# Wrap the array back into a DataFrame, reusing housing_num's column names
# and row index.
housing_tr = pd.DataFrame(
    X,
    index=housing_num.index,
    columns=housing_num.columns,
)

## scikit-learn DESIGN

There are a lot of design principles, but most objects fall into one of these roles:

-- estimators ( objects that use the fit() function )

-- transformers ( the transform() method, used for transforming a dataframe as in the imputer example )
                      { there is also fit_transform() as an optimized method }

-- predictors   ( the predict() method, used for predicting on a dataset of new instances, for example,
                    and then there's the score() method that measures the quality of the predictions )

## handling text and categorical data

In [None]:
# Double brackets keep the result a one-column DataFrame rather than a Series.
housing_cat = housing[["ocean_proximity"]]

In [None]:
# First ten values of the categorical column.
housing_cat.head(10)

In [None]:
# Row count for each distinct ocean_proximity value.
housing_cat.value_counts()

In [None]:
# ocean_proximity is categorical text, i.e. it can be divided into categories.
# Ordinal encoding assigns one number to each category.
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(housing_cat)
housing_cat_encoded = ordinal_encoder.transform(housing_cat)

In [None]:
# First ten encoded values.
housing_cat_encoded[:10]

In [None]:
# Categories learned by the encoder during fitting.
ordinal_encoder.categories_

### The problem with the ordinal encoder is that an ML algorithm or model may infer a relationship from the ordering of the numbers and their closeness to each other, which is not true in this case: nearby codes such as 0 and 1 are not necessarily more closely related than distant ones such as 1 and 4. The solution here is one-hot encoding, which is a binary representation of the categories.

In [None]:
# One-hot encode ocean_proximity: fit the encoder, then transform the same frame.
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
cat_encoder.fit(housing_cat)
housing_cat_1hot = cat_encoder.transform(housing_cat)

In [None]:
# Show the encoder output object (a sparse matrix; see the next cell).
housing_cat_1hot

In [None]:
# The result is stored as a sparse matrix to save memory;
# toarray() converts it to a dense array for inspection.
housing_cat_1hot.toarray()

In [None]:
# Categories learned by the one-hot encoder during fitting.
cat_encoder.categories_