# Pipeline for Data Preprocessing

## 0. Import and Configuration

In [1]:
import sys
import os
import pandas as pd
import importlib.util

def load_module_from_path(module_name, file_path):
    """Load a Python source file as a module object and execute it.

    The stage scripts have file names that start with a digit
    (e.g. ``01_data_cleaning.py``), so a plain ``import`` statement cannot
    reach them; they are loaded by path instead.

    Args:
        module_name: Name given to the loaded module.
        file_path: Path to the ``.py`` file (resolved against the current
            working directory when relative).

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for ``file_path``.
    """
    module_spec = importlib.util.spec_from_file_location(module_name, file_path)
    if module_spec is None or module_spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {file_path!r}")
    module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(module)
    return module


# Put the cleaning folder first on sys.path before executing the stage script
# (presumably so the script can import helpers that live next to it — confirm).
sys.path.insert(0, os.path.abspath('./01_data_cleaning'))

# Import data cleaning module
dc = load_module_from_path("data_cleaning", "./01_data_cleaning/01_data_cleaning.py")

# Import fraud relabeling module
fr = load_module_from_path("fraud_relabeling", "./02_fraud_relabeling/02_fraud_relabeling.py")

# Configure data cleaning
dc.ENABLE_RENAMING = True
dc.RAW_DIR = '../../data/raw'
dc.CLEANED_DIR = '../../data/cleaned'

# Configure fraud relabeling
# The relabeling stage consumes the cleaning stage's output, so derive its
# input directory from dc.CLEANED_DIR instead of repeating the literal.
fr.INPUT_DIR = dc.CLEANED_DIR
fr.OUTPUT_MEMBER_DIR = '../../data/by_member'
fr.OUTPUT_PROCESSED_DIR = '../../data/processed'
fr.CHUNKSIZE = 50000

## 1. Data Cleaning
This cell performs the following preprocessing tasks:
1. Standardize headers (e.g., "AccountID" → "Account ID")
2. Fix comma issues (remove extra commas in field values)
3. Clean Amount field (remove $ and commas, convert to numeric)
4. Fill missing values (Amount→0, others→"Unknown", "null"→empty)
5. Rename files based on date range (`MM-DD-YYYY_to_MM-DD-YYYY.csv`; files sharing the same date range get a `_v1`, `_v2`, … suffix)

Output: Cleaned and renamed CSV files saved to ../../data/cleaned/


In [11]:
# Run the cleaning module's entry point. Per the log printed below, it reads
# the CSVs found in dc.RAW_DIR and writes cleaned, renamed copies to dc.CLEANED_DIR.
dc.main()

Raw: ../../data/raw
Cleaned: ../../data/cleaned

Found 4 CSV files in ../../data/raw

CSV Files List ⬇️
  1. TransactionData 10-9-25.csv (601.85 MB)
  2. Transaction Data 09232025 (3).csv (63.05 MB)
  3. TransactionData10-3-25.csv (442.34 MB)
  4. Transaction Data (3).csv (66.99 MB)

Processing Files...

[1/4] TransactionData 10-9-25.csv... Amount:94, Missing:250538, Desc_Comma:34342, Fraud_Merged:20, →09-01-2024_to_09-01-2025.csv
[2/4] Transaction Data 09232025 (3).csv... Amount:9, Missing:28068, Desc_Comma:7640, →09-01-2024_to_09-01-2025_v1.csv
[3/4] TransactionData10-3-25.csv... Amount:238, Missing:150928, Desc_Comma:17206, →08-01-2025_to_09-01-2025.csv
[4/4] Transaction Data (3).csv... Amount:604392, Missing:29054, Desc_Comma:8426, →09-01-2024_to_09-01-2025_v2.csv

Processing Complete!

✅ Cleaning complete! Total 4 files
  1. 09-01-2024_to_09-01-2025_v1.csv (63.21 MB)
  2. 09-01-2024_to_09-01-2025_v2.csv (66.48 MB)
  3. 08-01-2025_to_09-01-2025.csv (362.79 MB)
  4. 09-01-2024_to_09

# Cleaned File Analysis

### Size & Rows & Date Span

In [20]:
import pandas as pd
import os

CLEANED_DIR = dc.CLEANED_DIR
csv_files = sorted(f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv'))

# Fixed column order so the stats table keeps its shape even with zero files.
STATS_COLUMNS = ['File', 'Rows', 'Members', 'Date From', 'Date To', 'Days', 'Fraud %']


def count_fraud_rows(df):
    """Count rows whose 'Fraud Adjustment Indicator' is neither missing nor ''."""
    indicator = df['Fraud Adjustment Indicator']
    return int((indicator.notna() & (indicator != '')).sum())


def format_date(ts):
    """Format a timestamp as MM/DD/YYYY, or 'N/A' when it is missing (NaT)."""
    return ts.strftime('%m/%d/%Y') if pd.notna(ts) else 'N/A'


def summarize_cleaned_file(filename, df):
    """Build one row of the stats table for a cleaned file.

    Args:
        filename: Name shown in the 'File' column.
        df: Frame for that file; 'Post Date' must already be datetime.

    Returns:
        A dict keyed by STATS_COLUMNS.
    """
    min_date = df['Post Date'].min()
    max_date = df['Post Date'].max()
    n_rows = len(df)
    return {
        'File': filename,
        'Rows': n_rows,
        'Members': df['Member ID'].nunique(),
        'Date From': format_date(min_date),
        'Date To': format_date(max_date),
        # min/max are both NaT when no date could be parsed
        'Days': (max_date - min_date).days if pd.notna(min_date) else 0,
        'Fraud %': round(count_fraud_rows(df) / n_rows * 100, 4) if n_rows > 0 else 0,
    }


# Load all dataframes; parse 'Post Date' once at load time (unparseable values
# become NaT) so the stats step below does not modify the loaded frames.
dfs = {}
for filename in csv_files:
    frame = pd.read_csv(os.path.join(CLEANED_DIR, filename))
    frame['Post Date'] = pd.to_datetime(frame['Post Date'], errors='coerce')
    dfs[filename] = frame

# Collect stats
stats = [summarize_cleaned_file(name, frame) for name, frame in dfs.items()]

# Display table
df_stats = pd.DataFrame(stats, columns=STATS_COLUMNS)
display(df_stats)

# Summary
# Members are de-duplicated across files via a set; row and fraud totals are
# plain sums over files, so a row present in several files is counted once per file.
all_members = set()
for frame in dfs.values():
    all_members.update(frame['Member ID'].dropna())

total_rows = sum(len(frame) for frame in dfs.values())
total_fraud = sum(count_fraud_rows(frame) for frame in dfs.values())
# Guard against an empty cleaned directory (or only empty files)
overall_fraud_pct = round(total_fraud / total_rows * 100, 4) if total_rows else 0

print(f"\nTotal Rows: {total_rows:,}")
print(f"Total Unique Members: {len(all_members):,}")
print(f"Total Fraud Indicators: {total_fraud:,}")
print(f"Overall Fraud %: {overall_fraud_pct}%")

Unnamed: 0,File,Rows,Members,Date From,Date To,Days,Fraud %
0,08-01-2025_to_09-01-2025.csv,3248295,91432,08/01/2025,09/01/2025,31,0.0045
1,09-01-2024_to_09-01-2025.csv,5423417,18869,09/01/2024,09/01/2025,365,0.0232
2,09-01-2024_to_09-01-2025_v1.csv,579672,1981,09/01/2024,09/01/2025,365,0.0664
3,09-01-2024_to_09-01-2025_v2.csv,604432,1952,09/01/2024,09/01/2025,365,0.0622



Total Rows: 9,855,816
Total Unique Members: 96,826
Total Fraud Indicators: 2,165
Overall Fraud %: 0.022%


### Overlap Detection

In [21]:
import pandas as pd
import os

CLEANED_DIR = dc.CLEANED_DIR
csv_files = sorted(f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv'))

print(f"Loading {len(csv_files)} files...")

# Columns that together identify a row when comparing files.
KEY_COLUMNS = ['Account ID', 'Member ID', 'Post Date', 'Post Time', 'Amount']
RESULT_COLUMNS = ['File 1', 'File 2', 'Overlap Rows', '% of File 1', '% of File 2']


def build_row_keys(df):
    """Return the set of '|'-joined string keys built from KEY_COLUMNS.

    Because the result is a set, rows that share the same key within one file
    collapse into a single entry; missing values take part as their string
    form (e.g. 'nan').
    """
    keys = df[KEY_COLUMNS[0]].astype(str)
    for column in KEY_COLUMNS[1:]:
        keys = keys + '|' + df[column].astype(str)
    return set(keys)


def percent_of(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


# Load files and create row IDs (only the key columns are read to limit memory)
file_rows = {}
for filename in csv_files:
    df = pd.read_csv(os.path.join(CLEANED_DIR, filename), usecols=KEY_COLUMNS)
    file_rows[filename] = build_row_keys(df)

# Calculate pairwise overlaps: each unordered pair of files is compared once.
# Percentages are relative to each file's number of unique keys, not its row count.
results = []
for i, file1 in enumerate(csv_files):
    for file2 in csv_files[i + 1:]:
        keys1 = file_rows[file1]
        keys2 = file_rows[file2]
        overlap = len(keys1 & keys2)

        results.append({
            'File 1': file1,
            'File 2': file2,
            'Overlap Rows': overlap,
            '% of File 1': round(percent_of(overlap, len(keys1)), 1),
            '% of File 2': round(percent_of(overlap, len(keys2)), 1)
        })

df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Display the overlap table
display(df_results)

Loading 4 files...


Unnamed: 0,File 1,File 2,Overlap Rows,% of File 1,% of File 2
0,08-01-2025_to_09-01-2025.csv,09-01-2024_to_09-01-2025.csv,497273,15.6,9.4
1,08-01-2025_to_09-01-2025.csv,09-01-2024_to_09-01-2025_v1.csv,0,0.0,0.0
2,08-01-2025_to_09-01-2025.csv,09-01-2024_to_09-01-2025_v2.csv,0,0.0,0.0
3,09-01-2024_to_09-01-2025.csv,09-01-2024_to_09-01-2025_v1.csv,25234,0.5,4.5
4,09-01-2024_to_09-01-2025.csv,09-01-2024_to_09-01-2025_v2.csv,27465,0.5,4.7
5,09-01-2024_to_09-01-2025_v1.csv,09-01-2024_to_09-01-2025_v2.csv,27745,4.9,4.8


## 2. Fraud Matching and Relabeling
This cell performs fraud detection in two stages:

1. **Reorganize by Member**: Group all transactions by Member ID into individual files
2. **Match Fraud Adjustments**: Find and mark original fraudulent transactions for each refund record
   - Match by amount and date (extract from description or 30-day range)
   - Prevent duplicate matching
   - Categorize as matched/unmatched/no_fraud

Output: Processed member files saved to `../../data/processed/[matched|unmatched|no_fraud]/`

In [3]:
# Run the fraud relabeling module's entry point. Per the log printed below,
# stage 1 reads the cleaned CSVs from fr.INPUT_DIR and writes one file per
# member to fr.OUTPUT_MEMBER_DIR before sorting them.
fr.main()

STAGE 1: DATA REORGANIZATION
Input: ../../data/cleaned
Output: ../../data/by_member

Found 4 files
Processing 1/4: 08-01-2025_to_09-01-2025.csv
Processing 2/4: 09-01-2024_to_09-01-2025.csv
Processing 3/4: 09-01-2024_to_09-01-2025_v1.csv
Processing 4/4: 09-01-2024_to_09-01-2025_v2.csv
Created 96826 member files
Sorting files...
  Sorted 100/96826 files
  Sorted 200/96826 files
  Sorted 300/96826 files
  Sorted 400/96826 files
  Sorted 500/96826 files
  Sorted 600/96826 files
  Sorted 700/96826 files
  Sorted 800/96826 files
  Sorted 900/96826 files
  Sorted 1000/96826 files
  Sorted 1100/96826 files
  Sorted 1200/96826 files
  Sorted 1300/96826 files
  Sorted 1400/96826 files
  Sorted 1500/96826 files
  Sorted 1600/96826 files
  Sorted 1700/96826 files
  Sorted 1800/96826 files
  Sorted 1900/96826 files
  Sorted 2000/96826 files
  Sorted 2100/96826 files
  Sorted 2200/96826 files
  Sorted 2300/96826 files
  Sorted 2400/96826 files
  Sorted 2500/96826 files
  Sorted 2600/96826 files
  So