In [15]:
import sys
from pathlib import Path

import pandas as pd

# Make the project root importable so that config.py can be found.
# NOTE: this assumes the kernel's working directory is the notebooks/ folder,
# i.e. the project root is exactly one level above the current directory.
root = Path().resolve().parent
root_str = str(root)
if root_str not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(root_str)

# Must come after the sys.path tweak above, otherwise config is not importable.
from config import RAW_DIR, INTERIM_DIR

print("RAW_DIR:", RAW_DIR)
print("INTERIM_DIR:", INTERIM_DIR)


RAW_DIR: C:\DataProjects\uh-ds-housing-data\data\raw
INTERIM_DIR: C:\DataProjects\uh-ds-housing-data\data\interim


In [12]:
import pandas as pd
from config import RAW_DIR, INTERIM_DIR

# Location of the full raw price-paid dataset.
csv_path = RAW_DIR / "price_paid_records.csv"

# Read just a few rows: enough to list the column names without loading the file.
df_head = pd.read_csv(csv_path, nrows=3)
print(list(df_head.columns))

# Load only the transfer-date column (parsed as datetimes) and count the
# number of records per calendar year, ordered by year.
date_col = "Date of Transfer"
sample = pd.read_csv(csv_path, usecols=[date_col], parse_dates=[date_col])
transfer_years = sample[date_col].dt.year
years = transfer_years.value_counts().sort_index()

# Show the ten most recent years.
print(years.tail(10))


['Transaction unique identifier', 'Price', 'Date of Transfer', 'Property Type', 'Old/New', 'Duration', 'Town/City', 'District', 'County', 'PPDCategory Type', 'Record Status - monthly file only']
Date of Transfer
2008     650492
2009     625662
2010     663342
2011     661055
2012     668295
2013     810111
2014     982943
2015    1007421
2016    1032558
2017     375098
Name: count, dtype: int64


In [13]:
# --- File paths ---
subset_path = INTERIM_DIR / "uk_housing_2010_2017.csv"

# --- Raw column name -> snake_case name used in the saved subset ---
# Renaming by name (instead of assigning a positional list to chunk.columns)
# keeps every label attached to the right data: read_csv returns the selected
# columns in file order and ignores the order of the usecols list.
column_map = {
    'Price': 'price',
    'Date of Transfer': 'date_of_transfer',
    'Property Type': 'property_type',
    'Old/New': 'old_new',
    'Duration': 'duration',
    'Town/City': 'town_city',
    'District': 'district',
    'County': 'county',
    'PPDCategory Type': 'ppdcategory_type',
    'Record Status - monthly file only': 'record_status',
}
usecols = list(column_map)

chunk_size = 500_000
min_year = 2010  # earliest transfer year to keep
chunks = []

# Stream the raw file in pieces so the full dataset never has to be held at once.
for i, chunk in enumerate(
    pd.read_csv(csv_path, chunksize=chunk_size, usecols=usecols), start=1
):
    # Clean column names for consistency
    chunk = chunk.rename(columns=column_map)

    # Convert date and filter rows by year; unparseable dates become NaT
    # and are dropped by the comparison below.
    chunk['date_of_transfer'] = pd.to_datetime(chunk['date_of_transfer'], errors='coerce')
    chunk = chunk[chunk['date_of_transfer'].dt.year >= min_year]

    if not chunk.empty:
        chunks.append(chunk)

    print(f"Processed chunk {i} — Rows kept: {len(chunk)}")

# Combine all filtered chunks. pd.concat raises on an empty list, so fall back
# to an empty frame with the expected columns when nothing passed the filter.
if chunks:
    df_recent = pd.concat(chunks, ignore_index=True)
else:
    df_recent = pd.DataFrame(columns=list(column_map.values()))

# Save smaller subset (create the target folder first if it is missing)
subset_path.parent.mkdir(parents=True, exist_ok=True)
df_recent.to_csv(subset_path, index=False)

print(f"\n Saved filtered subset: {subset_path}")
print(f"Final shape: {df_recent.shape}")


Processed chunk 1 — Rows kept: 0
Processed chunk 2 — Rows kept: 0
Processed chunk 3 — Rows kept: 0
Processed chunk 4 — Rows kept: 0
Processed chunk 5 — Rows kept: 0
Processed chunk 6 — Rows kept: 0
Processed chunk 7 — Rows kept: 0
Processed chunk 8 — Rows kept: 0
Processed chunk 9 — Rows kept: 0
Processed chunk 10 — Rows kept: 0
Processed chunk 11 — Rows kept: 0
Processed chunk 12 — Rows kept: 0
Processed chunk 13 — Rows kept: 0
Processed chunk 14 — Rows kept: 0
Processed chunk 15 — Rows kept: 0
Processed chunk 16 — Rows kept: 0
Processed chunk 17 — Rows kept: 0
Processed chunk 18 — Rows kept: 0
Processed chunk 19 — Rows kept: 0
Processed chunk 20 — Rows kept: 0
Processed chunk 21 — Rows kept: 0
Processed chunk 22 — Rows kept: 0
Processed chunk 23 — Rows kept: 0
Processed chunk 24 — Rows kept: 0
Processed chunk 25 — Rows kept: 0
Processed chunk 26 — Rows kept: 0
Processed chunk 27 — Rows kept: 0
Processed chunk 28 — Rows kept: 0
Processed chunk 29 — Rows kept: 0
Processed chunk 30 — Ro

In [14]:
# Reload the saved subset. CSV stores dates as text, so parse the date column
# explicitly; otherwise it loads as plain strings, min/max become string
# comparisons, and the .dt accessor is unavailable.
df = pd.read_csv(
    INTERIM_DIR / "uk_housing_2010_2017.csv",
    parse_dates=["date_of_transfer"],
)

# Sanity check: covered date range and overall size.
print(df['date_of_transfer'].min(), df['date_of_transfer'].max())
print(df.shape)

2010-01-01 2017-06-29
(6200823, 10)


In [None]:
# === 4. Summary Statistics ===
print("\nDescriptive Statistics (Prices):")
print(df["price"].describe())

# The .dt accessor only works on datetime-typed columns, and a column read back
# from CSV without parse_dates is plain text. Convert first (this leaves an
# already-datetime column unchanged) so the cell does not depend on how df was loaded.
df["date_of_transfer"] = pd.to_datetime(df["date_of_transfer"])

# Derive year and month for temporal analysis
df["year"] = df["date_of_transfer"].dt.year
df["month"] = df["date_of_transfer"].dt.month
