In [None]:
import tarfile
import urllib.request
from pathlib import Path
from zlib import crc32

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the housing dataset from a CSV file resolved relative to the working directory.
housing = pd.read_csv("housing.csv")

In [None]:
# Preview the first few rows to get a feel for the columns and their values.
housing.head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
housing.info()

In [None]:
# Tally how many rows carry each distinct value of the ocean_proximity column.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing.describe()

In [None]:
# Draw a 50-bin histogram for each numeric column on a 12x8 figure, then render it.
housing.hist(bins=50, figsize=(12, 8))
plt.show()

In [None]:
def shuffle_and_split_data(data, test_ratio, random_state=None):
    """Randomly partition ``data`` into a train subset and a test subset.

    Parameters
    ----------
    data : pandas.DataFrame (or any object supporting ``len`` and ``.iloc``)
        The rows to split.
    test_ratio : float
        Fraction of rows to place in the test subset; must lie in [0, 1].
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    random_state : int or None, optional
        When given, the shuffle uses a dedicated generator seeded with this
        value, so the split is repeatable regardless of global RNG state.
        When ``None`` (the default) the shuffle draws from NumPy's global
        RNG, exactly as before, and can be controlled via ``np.random.seed``.

    Returns
    -------
    (train, test) : tuple
        Two ``data.iloc[...]`` selections that together contain every row
        exactly once.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside [0, 1].
    """
    # A negative ratio would otherwise slice from the end of the permutation
    # and a ratio above 1 would silently empty the train set.
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        # Backward-compatible path: uses (and advances) the global RNG.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Seed NumPy's global RNG right before the random split so that re-running the
# notebook from a fresh kernel yields the same train/test partition.
np.random.seed(42)
train_set, test_set = shuffle_and_split_data(housing, 0.2)
len(train_set)

In [None]:
# Number of rows that ended up in the test subset of the random split.
len(test_set)

In [None]:
# Seed NumPy's legacy global RNG with a fixed value so that np.random calls
# executed after this point are repeatable.
# NOTE(review): seeding only affects calls that run after it; a config cell near
# the top of the notebook is the safer home for this line.
np.random.seed(42)

In [None]:
def is_id_in_test_set(identifier, test_ratio):
    """Decide, from a checksum of ``identifier``, whether it belongs in the test set.

    The identifier is converted to a 64-bit integer and its CRC32 checksum is
    compared against ``test_ratio`` scaled to the 32-bit range. The same
    identifier therefore always gets the same answer for a given ratio.

    Returns ``True`` when the checksum falls below the cutoff, else ``False``.
    """
    checksum = crc32(np.int64(identifier))
    cutoff = test_ratio * 2**32
    return checksum < cutoff

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into (train, test) using a per-row identifier checksum.

    Each value in ``data[id_column]`` is passed to ``is_id_in_test_set``; rows
    for which it returns true go to the test subset, all others to the train
    subset. Both subsets are selected with ``data.loc`` and a boolean mask.
    """
    test_mask = data[id_column].apply(
        lambda row_id: is_id_in_test_set(row_id, test_ratio)
    )
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part



In [None]:
# 1. Hash-based split: derive a row identifier from the coordinates so that the
#    same identifier always lands on the same side of the split.
housing_with_id = housing.reset_index()
# Build the id from housing_with_id's own columns rather than from `housing`,
# so the result does not rely on index alignment between the two frames.
housing_with_id["id"] = (
    housing_with_id["longitude"] * 1000 + housing_with_id["latitude"]
)
train_set_hashed, test_set_hashed = split_data_with_id_hash(housing_with_id, 0.2, "id")
# Sanity check: every row must land in exactly one of the two subsets.
assert len(train_set_hashed) + len(test_set_hashed) == len(housing_with_id)

# 2. Sklearn split, for comparison (train_test_split is imported in the
#    notebook's top import cell).
train_set_sklearn, test_set_sklearn = train_test_split(housing, test_size=0.2, random_state=42)

# 3. Count missing total_bedrooms values in each test set so the two splitting
#    strategies can be compared side by side.
print("Hash-based test set null bedrooms:", 
      test_set_hashed["total_bedrooms"].isnull().sum())
print("Sklearn test set null bedrooms:",
      test_set_sklearn["total_bedrooms"].isnull().sum())