# ML

In [None]:
import os
import tarfile
from six.moves import urllib

In [None]:
def fetch(url, dest):
    """Download ``url`` into directory ``dest`` unless it is already there.

    The local file name is the last path component of ``url``. An existing
    file with that name is treated as a cache hit and is never overwritten.

    Args:
        url: Address of the file to download.
        dest: Directory the file is stored in; created if missing.

    Raises:
        OSError: If the directory cannot be created or the download fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    name = os.path.basename(url)
    csv_path = os.path.join(dest, name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dest, exist_ok=True)
    if os.path.exists(csv_path):
        return
    # Download under a temporary name and rename afterwards, so that an
    # interrupted transfer never leaves a truncated file that later calls
    # would mistake for a complete download.
    partial_path = csv_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, csv_path)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [None]:
# URLs always use "/" regardless of platform, so the address is written as a
# plain string (adjacent literals are concatenated) instead of being assembled
# with os.path.join, which is meant for filesystem paths.
url = (
    "https://raw.githubusercontent.com/ageron/handson-ml/master/"
    "datasets/housing/housing.csv"
)

# Local download directory: a "housing" folder under the current working dir.
dest = os.path.join(os.getcwd(), "housing")

# Download housing.csv into dest (skipped when the file already exists).
fetch(url, dest)

In [None]:
import pandas as pd

In [None]:
# Load the downloaded housing CSV from the dest directory into a DataFrame.
csv_path = os.path.join(dest, "housing.csv")
df=pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Print a summary of the columns: dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count how many rows fall into each distinct "ocean_proximity" value.
df["ocean_proximity"].value_counts()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
from matplotlib import pyplot as plt
# Plot a 50-bin histogram for each numeric column; the trailing semicolon keeps
# the notebook from echoing the returned array of axes.
df.hist(bins=50,figsize=(20,15));

In [None]:
import random
import numpy as np
# Seed both Python's and NumPy's global random generators so that code relying
# on them (e.g. np.random.permutation) gives the same result on every run.
random.seed(42)
np.random.seed(42)

In [None]:
# Fraction of the dataset intended for the test set.
# NOTE(review): the name looks like a typo for "test_ratio", and the visible
# cells never read it (they pass 0.2 literally) - confirm before renaming.
teset_ratio=0.2

In [None]:
def split_dataset(data, test_ratio):
    """Randomly split ``data`` into a training part and a test part.

    Rows are assigned through a random permutation drawn from NumPy's global
    random generator; seed it with ``np.random.seed`` beforehand to get a
    reproducible split.

    Args:
        data: pandas DataFrame or Series to split (rows are selected by
            position via ``.iloc``).
        test_ratio: Fraction of rows to put in the test part. The test size
            is ``int(len(data) * test_ratio)``, i.e. rounded down.

    Returns:
        A ``(train, test)`` tuple; together the two parts contain every row
        of ``data`` exactly once.
    """
    shuffled_indices = np.random.permutation(len(data))
    test_size = int(len(data) * test_ratio)
    # Select rows through the permutation. Slicing ``data`` directly would
    # ignore the shuffle and always pick the first rows as the test set.
    test_indices = shuffled_indices[:test_size]
    train_indices = shuffled_indices[test_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows as a test set, then print the absolute sizes of the
# two parts followed by their share of the full dataset.
train,test=split_dataset(data=df,test_ratio=0.2)
print(len(train),len(test))
print(len(train)/len(df),len(test)/len(df))

In [None]:
# Sanity check: the two parts together must account for every row of df.
assert len(train)+len(test)==len(df)

In [None]:
import hashlib

In [None]:
# Read the CSV again, rebinding df to a freshly loaded DataFrame
# (presumably to start the hash-based split from unmodified data).
csv_path = os.path.join(dest, "housing.csv")
df=pd.read_csv(csv_path)

In [None]:
def split_with_hash(data, test_ratio, target_column, hash=hashlib.md5):
    """Split ``data`` into train and test parts based on a hash of an id column.

    Each value of ``target_column`` is converted to ``np.int64`` and hashed;
    the row goes to the test part when the last byte of the digest is below
    ``256 * test_ratio``. The assignment depends only on the identifier, so a
    given id always lands in the same part.

    Args:
        data: pandas DataFrame to split.
        test_ratio: Approximate fraction of rows to place in the test part.
        target_column: Name of the column holding the row identifiers.
        hash: Hash constructor (hashlib-style) applied to each identifier.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    # The last digest byte ranges over 0..255; ids whose byte is under this
    # cut-off are assigned to the test part.
    cutoff = 256 * test_ratio

    def belongs_to_test(identifier):
        last_byte = hash(np.int64(identifier)).digest()[-1]
        return last_byte < cutoff

    test_mask = data[target_column].apply(belongs_to_test)
    return data[~test_mask], data[test_mask]

In [None]:
# reset_index() moves the current row index into a regular "index" column,
# which is then used as the row identifier for the hash-based 20% split.
df_with_id=df.reset_index()
train,test=split_with_hash(df_with_id,0.2,"index")