In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta
import re
import pyodbc

In [2]:
# Open the ODBC connection to the BRONZE database. The connection string uses
# Trusted_Connection=yes, so no user name or password appears in the notebook.
conn = pyodbc.connect(
    ";".join(
        [
            "DRIVER={ODBC Driver 17 for SQL Server}",
            "SERVER=prdsql05.westfund.com.au",
            "DATABASE=BRONZE",
            "Trusted_Connection=yes",
        ]
    )
    + ";"
)

In [17]:
# Inclusive snapshot date window, written as YYYYMMDD strings.
start_date, end_date = "20251101", "20251130"

# Table-name prefixes to export. Only "person" is active; earlier runs of this
# cell also listed "person_membership" and "memship".
table_prefix_list = ["person"]

In [18]:
# UNC network folder that receives the exported parquet files.
output_folder = r"\\prdeqs01\QlikData\bronze_snapshots_backup"

In [12]:
# Cursor on the open connection; used for the catalog (INFORMATION_SCHEMA) lookups.
# NOTE(review): this cell's execution count is lower than the cells around it, so it
# was last run earlier in the session — re-run top to bottom after a kernel restart.
cursor = conn.cursor()

In [19]:
# =============================
# Export the snapshots of every table prefix
# =============================
# For each prefix in `table_prefix_list`:
#   1. list the dbo tables named "<prefix>_<YYYYMMDD>",
#   2. keep those whose date lies in [start_date, end_date] (inclusive),
#   3. read each table and tag its rows with a `snapshot_date` column,
#   4. write all rows of the prefix to one parquet file in `output_folder`.
for table_prefix in table_prefix_list:

    print(f"\n=== Processing prefix: {table_prefix} ===")

    # `_`, `%` and `[` are wildcards in a T-SQL LIKE pattern. Escape them (and the
    # escape character itself) so the prefix and the separating underscore are
    # matched literally; the original pattern declared ESCAPE but never used it.
    escaped_prefix = (
        table_prefix
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
        .replace("[", "\\[")
    )
    like_pattern = escaped_prefix + "\\_%"

    # Look up the candidate snapshot tables; the pattern is passed as a bound
    # parameter instead of being formatted into the SQL text.
    query_tables = """
    SELECT TABLE_NAME
    FROM INFORMATION_SCHEMA.TABLES
    WHERE TABLE_SCHEMA = 'dbo'
      AND TABLE_NAME LIKE ? ESCAPE '\\'
    """
    tables = [row[0] for row in cursor.execute(query_tables, like_pattern).fetchall()]

    # Table names must be exactly "<prefix>_<8 digits>". `re.escape` keeps regex
    # metacharacters in the prefix literal, and `fullmatch` rejects names with
    # trailing text such as "<prefix>_20251101_old".
    pattern = re.compile(rf"{re.escape(table_prefix)}_(\d{{8}})")
    valid_tables = []

    # Keep only the tables inside the requested date range. YYYYMMDD strings of
    # equal length order the same way as the dates they represent.
    for t in tables:
        match = pattern.fullmatch(t)
        if match:
            full_date = match.group(1)  # YYYYMMDD
            if start_date <= full_date <= end_date:
                valid_tables.append((t, full_date))

    if not valid_tables:
        print(f"No snapshots found for prefix {table_prefix}.")
        continue

    # The catalog query has no ORDER BY, so sort by date to make the row order of
    # the combined frame deterministic.
    valid_tables.sort(key=lambda item: item[1])

    print(f"Selected snapshot tables: {valid_tables}")

    # =============================
    # Read every snapshot table and add the snapshot_date column
    # =============================
    # NOTE(review): pandas documents SQLAlchemy connectables (and sqlite3) for
    # read_sql; passing a raw pyodbc connection presumably causes the warning seen
    # in this cell's output. Consider switching to a SQLAlchemy engine — confirm.
    df_list = []
    for table_name, snapshot_date in valid_tables:
        print(f"Reading {table_name} ...")
        # Bracket-quote the identifier; a literal "]" is escaped by doubling it.
        quoted_name = table_name.replace("]", "]]")
        df = pd.read_sql(f"SELECT * FROM dbo.[{quoted_name}]", conn)
        df["snapshot_date"] = snapshot_date
        df_list.append(df)

    # =============================
    # Combine all snapshot tables
    # =============================
    df_all = pd.concat(df_list, ignore_index=True)

    # =============================
    # Write the parquet file
    # =============================
    file_name = f"{table_prefix}_snapshot_{start_date}_{end_date}.parquet"
    output_path = os.path.join(output_folder, file_name)
    df_all.to_parquet(output_path, index=False)

    print(f"Saved parquet → {output_path}")


=== Processing prefix: person ===
Selected snapshot tables: [('person_20251101', '20251101'), ('person_20251102', '20251102'), ('person_20251103', '20251103'), ('person_20251104', '20251104'), ('person_20251105', '20251105'), ('person_20251106', '20251106'), ('person_20251107', '20251107'), ('person_20251108', '20251108'), ('person_20251109', '20251109'), ('person_20251110', '20251110'), ('person_20251111', '20251111'), ('person_20251112', '20251112'), ('person_20251113', '20251113'), ('person_20251115', '20251115'), ('person_20251116', '20251116'), ('person_20251117', '20251117'), ('person_20251118', '20251118'), ('person_20251119', '20251119'), ('person_20251120', '20251120'), ('person_20251121', '20251121'), ('person_20251122', '20251122'), ('person_20251123', '20251123'), ('person_20251124', '20251124'), ('person_20251125', '20251125'), ('person_20251126', '20251126'), ('person_20251127', '20251127'), ('person_20251128', '20251128'), ('person_20251129', '20251129'), ('person_20251

  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251102 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251103 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251104 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251105 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251106 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251107 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251108 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251109 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251110 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251111 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251112 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251113 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251115 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251116 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251117 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251118 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251119 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251120 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251121 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251122 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251123 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251124 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251125 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251126 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251127 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251128 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251129 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Reading person_20251130 ...


  df = pd.read_sql(f"SELECT * FROM dbo.{table_name}", conn)


Saved parquet → \\prdeqs01\QlikData\bronze_snapshots_backup\person_snapshot_20251101_20251130.parquet


In [20]:
# parquet 文件路径
file_name = "person_snapshot_20251101_20251130.parquet"
file_path = os.path.join(output_folder, file_name)

# 读取 parquet
df_person = pd.read_parquet(file_path)

# 筛选指定 snapshot_date
snapshot_date = "20251128"
df_snapshot = df_person[df_person["snapshot_date"] == snapshot_date]

In [21]:
# Number of rows in the selected snapshot.
len(df_snapshot)

377472

In [22]:
# Per-column summary of the snapshot: column name, count of non-null values, dtype.
info_list = [
    {
        "column": col,
        "non_null_count": df_snapshot[col].notnull().sum(),
        "dtype": df_snapshot[col].dtype,
    }
    for col in df_snapshot.columns
]

df_info = pd.DataFrame(info_list)

df_info


Unnamed: 0,column,non_null_count,dtype
0,person_id,377472,float64
1,correspondence_level,10497,float64
2,title,377472,float64
3,gender,377471,object
4,hear_about_id,157885,float64
...,...,...,...
60,upper_firstname,377472,object
61,upper_maidenname,153590,object
62,passport_number,0,object
63,passport_country_code,0,object


In [14]:
# Show the snapshot rows that belong to one membership.
# NOTE(review): this cell's execution count (14) predates the cell that builds
# df_snapshot for 20251128, and the saved output shows snapshot_date 20251005 — the
# output appears to come from a different frame. Re-run after the cells above; confirm
# that the current df_snapshot actually has a membership_id column.
df_snapshot[df_snapshot['membership_id'] == 58323]
# df_snapshot[df_snapshot['medicare_number'] == '4242956938']

Unnamed: 0,person_id,membership_id,person_memship_version,relationship,status_flag,termination_code,hicaps_identifier,join_date,termination_date,create_operator,create_datetime,update_operator,update_datetime,timestamp,delta,eligibility_status,newborn_source,newborn_rebate_selected,rebate_form_printed,snapshot_date
1582208,42834.0,58323.0,100.0,4,T,X,3.0,1998-10-21,2004-02-01,Database Created,2004-04-01 21:52:25.927,,NaT,b'\x00\x00\x00\x00\x03h\x9d\xeb',1.0,,,,,20251005
1582209,42835.0,58323.0,100.0,5,T,X,4.0,1998-10-21,2003-07-12,Database Created,2004-04-01 21:52:25.927,,NaT,b'\x00\x00\x00\x00\x03h\x9d\xec',1.0,,,,,20251005
1640670,42832.0,58323.0,100.0,1,A,,1.0,1998-10-21,NaT,Database Created,2004-04-01 21:52:33.317,,NaT,b'\x00\x00\x00\x00\x03h\x9d\xe9',1.0,,,,,20251005
1640671,42833.0,58323.0,100.0,2,A,,2.0,1998-10-21,NaT,Database Created,2004-04-01 21:52:33.317,,NaT,b'\x00\x00\x00\x00\x03h\x9d\xea',1.0,,,,,20251005
