In [1]:
import os
import numpy as np
import pandas as pd

In [4]:
def load_housing_data(data_path='input/hands-on-machine-learning-housing-dataset/housing.csv'):
    """Read the housing CSV file into a DataFrame.

    Parameters
    ----------
    data_path : str or path-like, optional
        Location of the CSV file. Defaults to the dataset path used in this
        notebook, so calling ``load_housing_data()`` with no arguments
        behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(data_path)

In [5]:
# Read the housing CSV into the DataFrame that the cells below explore.
housing = load_housing_data()

# Studying the data:

In [6]:
# Preview the first rows to see which columns exist and what typical values look like.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Per-column dtype and non-null count; a lower non-null count reveals missing values.
housing.info()

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))

# Creating a test set, 1st way:
**This way of splitting the test and training sets is not good: every time the code runs, the test set changes, so the new test set contains instances that were previously in the training set. Over successive runs the test set becomes contaminated with training data.**

In [None]:
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; they are selected by position with ``iloc``.
    test_ratio : float
        Fraction of the rows, between 0 and 1, to put in the test set.
        The test-set size is ``int(len(data) * test_ratio)`` (rounded down).
    random_state : int, optional
        Seed for the shuffle. When given, the same seed always produces the
        same split. When omitted, the shuffle uses NumPy's global random
        state, so the split differs from run to run.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``; the two sets share no rows and together
        contain every row of ``data``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    # A negative or >1 ratio would make the slices below silently produce a
    # nonsensical split, so reject it up front.
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)  # shuffled array of 0..n_rows-1
    else:
        # A dedicated, seeded generator keeps the split reproducible without
        # touching the global random state.
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

## np.random.permutation(n) returns a shuffled array of 0..n-1; given a DataFrame, it returns a NumPy array of its rows in shuffled order.
#df_test = pd.DataFrame({'column_1':[1,2,3,4], 'column_2':[5,6,7,8]})
#print(df_test)
#np.random.permutation(df_test)
#np.random.permutation(10)

**A few new things:**

In [None]:
# Show the documentation of the two APIs that split_train_test() relies on.
help(np.random.permutation)
help(pd.DataFrame.iloc)

**Using the split_train_test() function:**

In [None]:
# Hold out 20% of the rows as the test set.
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
# Number of rows that ended up in the training set.
len(train_set)

In [None]:
# Number of rows that ended up in the test set.
len(test_set)

# Creating a test set, 2nd way:
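**This way, an instance that has been in the training set will never end up in the test set, because membership is decided by a hash of the instance's identifier rather than by a random shuffle.**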

In [None]:
from zlib import crc32
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test sets using each row's identifier.

    Every value in ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns ``True`` go to the test set, all others to the
    training set.

    Returns ``(train_set, test_set)``.
    """
    # Boolean Series: True where the row's identifier is assigned to the test set.
    test_mask = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio)
    )
    train_part = data.loc[~test_mask]  # ``~`` flips the boolean mask element-wise (logical NOT)
    test_part = data.loc[test_mask]
    return train_part, test_part

## The lambda keyword is used to create small anonymous functions.
## A lambda function can take any number of arguments, but can only have one expression.
## The expression is evaluated and the result is returned.

**A few new things**

In [None]:
# .loc: access a group of rows and columns by label(s) or a boolean array.
help(pd.DataFrame.loc)
# .apply: apply a function along an axis of the DataFrame.
# Note: split_train_test_by_id() calls .apply on a single column (a Series),
# where the function is applied to each value.
help(pd.DataFrame.apply)

Creating a new DataFrame that includes an id column:

In [None]:
# reset_index() adds an 'index' column holding the row index; that column is
# used as the row identifier for the hash-based split.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
# Preview only the first rows instead of rendering the whole DataFrame.
housing_with_id.head()

**But those indices are not reliable unique identifiers: for example, they change if the data gets changed (rows deleted or reordered). One must choose a more stable unique characteristic; for example, a district's longitude and latitude won't change, so a better approach is to combine them into an id.**