# Oracle Table Diagnostics v2
## Auto-discovers columns first, then checks for issues

Fixes: Discovers actual column names before running checks

In [1]:
# ============================================================
# CONNECTION CONFIGURATION - UPDATE THESE VALUES
# ============================================================


ORACLE_HOST = "10.1.102.53:24625/1625dm"  # Oracle server hostname
ORACLE_PORT = "1521"                                      # Usually 1521
ORACLE_SERVICE = "COCCDM"                                 # Service name or SID
ORACLE_USER = "smith1625"                             # Oracle username
ORACLE_PASSWORD = "v0c4l1z3!"                         # Oracle password

# Connection string format (update if using different format)
# TNS format: ORACLE_DSN = "(DESCRIPTION=(ADDRESS=(PROTOCOL=TCP)(HOST={host})(PORT={port}))(CONNECT_DATA=(SERVICE_NAME={service})))"
ORACLE_DSN = "10.1.102.53:24625/1625dm"

SCHEMA = "COCCDM"

TABLES = ['WH_LOANS', 'WH_ACCTCOMMON', 'WH_ACCT']

In [2]:
import oracledb
import pandas as pd
from datetime import datetime

# Comment out next line for thin mode (no Oracle client)
# oracledb.init_oracle_client()

print(f"Connecting to {ORACLE_DSN}...")
conn = oracledb.connect(user=ORACLE_USER, password=ORACLE_PASSWORD, dsn=ORACLE_DSN)
print(f"Connected! Oracle version: {conn.version}")

Connecting to 10.1.102.53:24625/1625dm...
Connected! Oracle version: 19.26.0.0.0


## 1. DISCOVER SCHEMA - What columns actually exist?

In [3]:
# Get all columns for each table
table_schemas = {}

for table in TABLES:
    print(f"\n{'='*60}")
    print(f"{table} - COLUMN SCHEMA")
    print(f"{'='*60}")
    
    df = pd.read_sql(f"""
        SELECT column_name, data_type, data_precision, data_scale, nullable
        FROM all_tab_columns
        WHERE owner = '{SCHEMA}'
          AND table_name = '{table}'
        ORDER BY column_id
    """, conn)
    
    table_schemas[table] = df
    print(df.to_string())
    
    # Summary
    date_cols = df[df['DATA_TYPE'] == 'DATE']['COLUMN_NAME'].tolist()
    num_cols = df[df['DATA_TYPE'] == 'NUMBER']['COLUMN_NAME'].tolist()
    print(f"\nDATE columns: {date_cols}")
    print(f"NUMBER columns: {num_cols}")


WH_LOANS - COLUMN SCHEMA
           COLUMN_NAME DATA_TYPE  DATA_PRECISION  DATA_SCALE NULLABLE
0              ACCTNBR    NUMBER            22.0         0.0        N
1              RUNDATE      DATE             NaN         NaN        N
2                  OCC  VARCHAR2             NaN         NaN        Y
3               STATUS  VARCHAR2             NaN         NaN        Y
4              ORIGBAL    NUMBER            22.0         0.0        Y
5             CURRTERM    NUMBER            22.0         0.0        Y
6                 INTC    NUMBER            22.0         0.0        Y
7                   PF  VARCHAR2             NaN         NaN        Y
8                DELYR    NUMBER            22.0         0.0        Y
9              DELLIFE    NUMBER            22.0         0.0        Y
10              LCRATE  VARCHAR2             NaN         NaN        Y
11               OLDPI    NUMBER            22.0         2.0        Y
12             ORIGINT    NUMBER            22.0         7.0    

  df = pd.read_sql(f"""
  df = pd.read_sql(f"""


               COLUMN_NAME DATA_TYPE  DATA_PRECISION  DATA_SCALE NULLABLE
0                  ACCTNBR    NUMBER            22.0         0.0        N
1                  EFFDATE      DATE             NaN         NaN        N
2               MONTHENDYN      CHAR             NaN         NaN        N
3              MJACCTTYPCD  VARCHAR2             NaN         NaN        N
4          CURRMIACCTTYPCD  VARCHAR2             NaN         NaN        N
5                  PRODUCT  VARCHAR2             NaN         NaN        N
6           CURRACCTSTATCD  VARCHAR2             NaN         NaN        N
7      CURRACCTSTATEFFDATE      DATE             NaN         NaN        Y
8      ACCTOPENCURRMONTHYN      CHAR             NaN         NaN        N
9     ACCTCLOSECURRMONTHYN      CHAR             NaN         NaN        N
10            BRANCHORGNBR    NUMBER            22.0         0.0        N
11              BRANCHNAME  VARCHAR2             NaN         NaN        Y
12              BANKORGNBR    NUMBER  

  df = pd.read_sql(f"""


## 2. ROW COUNTS

In [4]:
print("TABLE ROW COUNTS")
print("="*60)

for table in TABLES:
    try:
        df = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {SCHEMA}.{table}", conn)
        print(f"{table}: {df['CNT'].iloc[0]:,} rows")
    except Exception as e:
        print(f"{table}: ERROR - {e}")

TABLE ROW COUNTS


  df = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {SCHEMA}.{table}", conn)


WH_LOANS: 119,050,162 rows
WH_ACCTCOMMON: 254,461,030 rows
WH_ACCT: 262,386,697 rows


## 3. CHECK ALL DATE COLUMNS FOR INVALID VALUES

In [None]:
# Check every DATE column in every table for invalid values
invalid_dates_found = []

for table in TABLES:
    print(f"\n{'='*60}")
    print(f"{table} - DATE COLUMN VALIDATION")
    print(f"{'='*60}")
    
    # Get DATE columns for this table
    date_cols = table_schemas[table][table_schemas[table]['DATA_TYPE'] == 'DATE']['COLUMN_NAME'].tolist()
    
    if not date_cols:
        print("No DATE columns found")
        continue
    
    for col in date_cols:
        try:
            # Check for dates outside reasonable range
            df = pd.read_sql(f"""
                SELECT 
                    COUNT(*) as total_rows,
                    COUNT({col}) as non_null,
                    MIN({col}) as min_date,
                    MAX({col}) as max_date,
                    SUM(CASE WHEN EXTRACT(YEAR FROM {col}) < 1900 THEN 1 ELSE 0 END) as before_1900,
                    SUM(CASE WHEN EXTRACT(YEAR FROM {col}) > 2100 THEN 1 ELSE 0 END) as after_2100,
                    SUM(CASE WHEN EXTRACT(YEAR FROM {col}) < 1 OR EXTRACT(YEAR FROM {col}) > 9999 THEN 1 ELSE 0 END) as outside_dotnet
                FROM {SCHEMA}.{table}
            """, conn)
            
            before_1900 = df['BEFORE_1900'].iloc[0] or 0
            after_2100 = df['AFTER_2100'].iloc[0] or 0
            outside_dotnet = df['OUTSIDE_DOTNET'].iloc[0] or 0
            min_dt = df['MIN_DATE'].iloc[0]
            max_dt = df['MAX_DATE'].iloc[0]
            
            status = "OK"
            if outside_dotnet > 0:
                status = f"*** CRITICAL: {outside_dotnet} outside .NET range ***"
                invalid_dates_found.append((table, col, outside_dotnet, min_dt, max_dt))
            elif before_1900 > 0 or after_2100 > 0:
                status = f"WARN: {before_1900} before 1900, {after_2100} after 2100"
                invalid_dates_found.append((table, col, before_1900 + after_2100, min_dt, max_dt))
            
            print(f"{col}: min={min_dt}, max={max_dt} - {status}")
            
        except Exception as e:
            print(f"{col}: ERROR - {str(e)[:100]}")


WH_LOANS - DATE COLUMN VALIDATION


  df = pd.read_sql(f"""


RUNDATE: min=2018-08-06 00:00:00, max=2025-12-01 00:00:00 - OK
NEXTRATECHG: min=1996-08-13 00:00:00, max=2045-08-22 00:00:00 - OK
LASTPMTCHGDATE: min=1979-02-09 00:00:00, max=2030-09-14 00:00:00 - OK
LASTINTCHGDATE: min=1979-02-09 00:00:00, max=2025-12-01 00:00:00 - OK
ORIGDATE: min=1979-02-09 12:01:00, max=2025-12-01 15:55:28 - OK
LASTDISBURSDATE: min=1996-12-31 00:00:00, max=2025-12-02 00:00:00 - OK
DATELASTMAINT: min=2018-08-07 00:03:08, max=2025-12-01 21:26:17 - OK
INTPAIDTODATE: ERROR - Execution failed on sql '
                SELECT 
                    COUNT(*) as total_rows,
      

WH_ACCTCOMMON - DATE COLUMN VALIDATION
EFFDATE: min=2018-08-06 00:00:00, max=2025-12-01 00:00:00 - OK
CURRACCTSTATEFFDATE: min=1977-12-01 12:04:00, max=2025-12-01 20:23:42 - OK
NOTENEXTRATECHANGEDATE: min=1996-08-13 00:00:00, max=2045-08-22 00:00:00 - OK
DATEMAT: min=0209-12-31 00:00:00, max=2113-06-27 00:00:00 - WARN: 1 before 1900, 10862 after 2100


In [None]:
# Summary of problematic date columns
print("\n" + "="*60)
print("PROBLEMATIC DATE COLUMNS SUMMARY")
print("="*60)

if invalid_dates_found:
    for table, col, count, min_dt, max_dt in invalid_dates_found:
        print(f"\n{table}.{col}:")
        print(f"  Invalid rows: {count}")
        print(f"  Min date: {min_dt}")
        print(f"  Max date: {max_dt}")
else:
    print("No problematic date columns found!")

## 4. SAMPLE BAD DATE ROWS (if any found)

In [None]:
# Get sample rows with bad dates
for table, col, count, min_dt, max_dt in invalid_dates_found:
    print(f"\n{'='*60}")
    print(f"{table} - SAMPLE ROWS WITH BAD {col}")
    print(f"{'='*60}")
    
    try:
        # Get first few columns + the bad date column
        df = pd.read_sql(f"""
            SELECT *
            FROM {SCHEMA}.{table}
            WHERE EXTRACT(YEAR FROM {col}) < 1900 
               OR EXTRACT(YEAR FROM {col}) > 2100
            FETCH FIRST 10 ROWS ONLY
        """, conn)
        print(df.to_string())
    except Exception as e:
        print(f"ERROR: {e}")

## 5. NUMBER COLUMN PRECISION CHECK

In [None]:
# Check NUMBER columns for precision issues
for table in TABLES:
    print(f"\n{'='*60}")
    print(f"{table} - NUMBER COLUMNS WITH NULL PRECISION (unlimited)")
    print(f"{'='*60}")
    
    schema_df = table_schemas[table]
    num_cols = schema_df[(schema_df['DATA_TYPE'] == 'NUMBER') & (schema_df['DATA_PRECISION'].isna())]
    
    if len(num_cols) > 0:
        print("These columns have unlimited precision (potential overflow risk):")
        print(num_cols[['COLUMN_NAME', 'DATA_TYPE', 'DATA_PRECISION', 'DATA_SCALE']].to_string())
    else:
        print("All NUMBER columns have defined precision - OK")

In [None]:
# Check actual value ranges for NUMBER columns
for table in TABLES:
    print(f"\n{'='*60}")
    print(f"{table} - NUMBER VALUE RANGES")
    print(f"{'='*60}")
    
    num_cols = table_schemas[table][table_schemas[table]['DATA_TYPE'] == 'NUMBER']['COLUMN_NAME'].tolist()
    
    for col in num_cols[:10]:  # Limit to first 10 to avoid long runtime
        try:
            df = pd.read_sql(f"""
                SELECT MIN({col}) as min_val, MAX({col}) as max_val
                FROM {SCHEMA}.{table}
                WHERE {col} IS NOT NULL
            """, conn)
            min_v = df['MIN_VAL'].iloc[0]
            max_v = df['MAX_VAL'].iloc[0]
            print(f"{col}: min={min_v}, max={max_v}")
        except Exception as e:
            print(f"{col}: ERROR - {str(e)[:50]}")

## 6. WH_LOANS SPECIFIC - DEEP DIVE

In [None]:
# WH_LOANS deep dive - check all columns
print("WH_LOANS - FULL COLUMN LIST WITH SAMPLE VALUES")
print("="*60)

try:
    df = pd.read_sql(f"""
        SELECT * FROM {SCHEMA}.WH_LOANS
        FETCH FIRST 5 ROWS ONLY
    """, conn)
    print(f"\nColumns ({len(df.columns)}):")
    for i, col in enumerate(df.columns):
        print(f"  {i+1}. {col}")
    print("\nSample data:")
    print(df.T)  # Transpose for readability
except Exception as e:
    print(f"ERROR: {e}")

## 7. FINAL SUMMARY

In [None]:
print("\n" + "="*70)
print("DIAGNOSTIC COMPLETE")
print("="*70)
print(f"\nRun at: {datetime.now()}")

print("\n--- PROBLEMATIC DATE COLUMNS ---")
if invalid_dates_found:
    for table, col, count, min_dt, max_dt in invalid_dates_found:
        print(f"  {table}.{col}: {count} bad rows (min={min_dt}, max={max_dt})")
else:
    print("  None found")

print("\n--- NEXT STEPS ---")
print("1. Push this notebook output back to repo")
print("2. Use output to build safe CopyJob queries")
print("3. Apply CASE WHEN to handle bad date columns")

In [None]:
conn.close()
print("Connection closed.")