# Pipeline for Data Preprocessing

## 0. Import and Configuration

In [1]:
import sys
import os
import pandas as pd
import importlib.util

def _load_module(name, path):
    """Load a Python source file as a module object.

    The stage scripts have file names that start with a digit
    (e.g. ``01_data_cleaning.py``), which cannot be named in a regular
    ``import`` statement, so they are loaded from their file path instead.

    Args:
        name: Module name to assign to the loaded module.
        path: Path to the ``.py`` file, relative to the notebook's working
            directory.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``path``.
        FileNotFoundError: If ``path`` does not exist (raised by the loader).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {name!r} from {path!r}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# Make the data cleaning folder importable (presumably so the script can import
# sibling helpers — verify). Guarded so re-running the cell does not keep
# prepending the same entry to sys.path.
_data_cleaning_dir = os.path.abspath('./01_data_cleaning')
if _data_cleaning_dir not in sys.path:
    sys.path.insert(0, _data_cleaning_dir)

# Import data cleaning module
dc = _load_module("data_cleaning", "./01_data_cleaning/01_data_cleaning.py")

# Import fraud relabeling module
fr = _load_module("fraud_relabeling", "./02_fraud_relabeling/02_fraud_relabeling.py")

# Import feature engineering module
fe = _load_module("feature_engineering", "./03_feature_engineering/03_feature_engineering.py")

# Single root for all pipeline data folders. Paths are built with '/' so the
# resulting strings are identical to the previously hard-coded literals.
DATA_DIR = '../../data'

# Configure data cleaning
dc.ENABLE_RENAMING = True
dc.RAW_DIR = f'{DATA_DIR}/raw'
dc.CLEANED_DIR = f'{DATA_DIR}/cleaned'

# Configure fraud relabeling
# The input is derived from the cleaning stage's output so the two stages
# cannot drift apart if one path is edited.
fr.INPUT_DIR = dc.CLEANED_DIR
fr.OUTPUT_MEMBER_DIR = f'{DATA_DIR}/by_member'
fr.OUTPUT_PROCESSED_DIR = f'{DATA_DIR}/processed'
fr.CHUNKSIZE = 50000

# Configure feature engineering
# Reads what the fraud relabeling stage writes.
fe.PROCESSED_DIR = fr.OUTPUT_PROCESSED_DIR
fe.MODEL_NAME = 'prajjwal1/bert-tiny'
fe.TEXT_COLUMN = 'Transaction Description'
fe.BATCH_SIZE = 64
fe.MAX_LENGTH = 64
fe.PCA_DIM = 20
# Cluster-count search range: MIN_K..MAX_K in increments of K_STEP.
# NOTE(review): the recorded Stage 3 run failed with
# "n_samples=25 should be >= n_clusters=60" — confirm how the feature
# engineering module samples data before clustering, or cap MAX_K.
fe.MIN_K = 10
fe.MAX_K = 60
fe.K_STEP = 10
fe.SAMPLE_SIZE = 10000
fe.CLUSTER_BATCH_SIZE = 4096
fe.RANDOM_STATE = 42

# Create directories if they don't exist
directories = [
    dc.RAW_DIR,
    dc.CLEANED_DIR,
    fr.INPUT_DIR,
    fr.OUTPUT_MEMBER_DIR,
    fr.OUTPUT_PROCESSED_DIR
]

# dict.fromkeys() drops repeated paths (e.g. the cleaned folder is both an
# output and an input) while keeping the original order.
for directory in dict.fromkeys(directories):
    already_present = os.path.isdir(directory)
    # exist_ok=True makes creation safe if the folder appears between the check
    # and the call; a path that exists as a regular file raises FileExistsError
    # instead of being silently skipped.
    os.makedirs(directory, exist_ok=True)
    if not already_present:
        print(f"Created directory: {directory}")

Created directory: ../../data/cleaned
Created directory: ../../data/by_member
Created directory: ../../data/processed


## 1. Data Cleaning
This cell performs the following preprocessing tasks:
1. Standardize headers (e.g., "AccountID" → "Account ID")
2. Fix comma issues (remove extra commas in field values)
3. Clean Amount field (remove $ and commas, convert to numeric)
4. Fill missing values (Amount→0, others→"Unknown", "null"→empty)
5. Rename files based on date range (MM-DD-YYYY-MM-DD-YYYY.csv)

Output: Cleaned and renamed CSV files saved to ../../data/cleaned/


In [2]:
# Run the data cleaning stage. RAW_DIR, CLEANED_DIR and ENABLE_RENAMING were
# assigned on the `dc` module in the configuration cell; presumably main()
# reads them from there — verify in 01_data_cleaning.py.
dc.main()

Raw: ../../data/raw
Cleaned: ../../data/cleaned

Found 1 CSV files in ../../data/raw

CSV Files List ⬇️
  1. TransactionData_2025Q1.csv (298.68 MB)

Processing Files...

[1/1] TransactionData_2025Q1.csv... Missing:100165, Newlines:2189448, →01-01-2025_to_03-31-2025.csv

Processing Complete!


### Cleaned File Analysis

#### Size & Rows & Date Span

In [3]:
# Per-file and overall summary of the cleaned CSV files: row count, unique
# members, date span and share of rows carrying a fraud adjustment indicator.
CLEANED_DIR = dc.CLEANED_DIR
csv_files = sorted(f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv'))

# Load all dataframes
dfs = {
    filename: pd.read_csv(os.path.join(CLEANED_DIR, filename))
    for filename in csv_files
}

# Explicit column order so the summary table keeps its headers (and the
# 'Rows' column) even when no CSV file was found.
STATS_COLUMNS = ['File', 'Rows', 'Members', 'Date From', 'Date To', 'Days', 'Fraud %']

# Collect stats
stats = []
fraud_counts = []  # one entry per file, reused for the overall total below
for filename, df in dfs.items():
    # Unparseable dates become NaT and are ignored by min()/max().
    df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')
    min_date = df['Post Date'].min()
    max_date = df['Post Date'].max()
    has_dates = pd.notna(min_date) and pd.notna(max_date)

    # A row counts as fraud when the indicator is neither missing nor empty.
    indicator = df['Fraud Adjustment Indicator']
    fraud_count = (indicator.notna() & (indicator != '')).sum()
    fraud_counts.append(fraud_count)

    row_count = len(df)
    stats.append({
        'File': filename,
        'Rows': row_count,
        'Members': df['Member ID'].nunique(),
        'Date From': min_date.strftime('%m/%d/%Y') if has_dates else 'N/A',
        'Date To': max_date.strftime('%m/%d/%Y') if has_dates else 'N/A',
        'Days': (max_date - min_date).days if has_dates else 0,
        'Fraud %': round(fraud_count / row_count * 100, 4) if row_count > 0 else 0
    })

# Display table
df_stats = pd.DataFrame(stats, columns=STATS_COLUMNS)
display(df_stats)

# Summary
all_members = set()
for df in dfs.values():
    all_members.update(df['Member ID'].dropna())

total_rows = sum(len(df) for df in dfs.values())
total_fraud = sum(fraud_counts)
# Guard against an empty cleaned folder (or only empty files).
overall_fraud_pct = round(total_fraud / total_rows * 100, 4) if total_rows > 0 else 0

print(f"\nTotal Rows: {total_rows:,}")
print(f"Total Unique Members: {len(all_members):,}")
print(f"Total Fraud Indicators: {total_fraud:,}")
print(f"Overall Fraud %: {overall_fraud_pct}%")

Unnamed: 0,File,Rows,Members,Date From,Date To,Days,Fraud %
0,01-01-2025_to_03-31-2025.csv,2189448,25572,01/01/2025,03/31/2025,89,0.0153



Total Rows: 2,189,448
Total Unique Members: 25,572
Total Fraud Indicators: 334
Overall Fraud %: 0.0153%


#### Overlap Detection

In [4]:
# Detect transactions that appear in more than one cleaned file by comparing
# composite row keys between every pair of files.
CLEANED_DIR = dc.CLEANED_DIR
csv_files = sorted(f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv'))

print(f"Loading {len(csv_files)} files...")

# Columns that together identify a transaction row.
KEY_COLUMNS = ['Account ID', 'Member ID', 'Post Date', 'Post Time', 'Amount']
OVERLAP_COLUMNS = ['File 1', 'File 2', 'Overlap Rows', '% of File 1', '% of File 2']

# Load files and create row IDs
file_rows = {}
for filename in csv_files:
    df = pd.read_csv(os.path.join(CLEANED_DIR, filename))
    row_key = df[KEY_COLUMNS[0]].astype(str)
    for column in KEY_COLUMNS[1:]:
        row_key = row_key + '|' + df[column].astype(str)
    # A set keeps only distinct keys, so the percentages below are relative to
    # the number of distinct row keys in each file.
    file_rows[filename] = set(row_key)

# Calculate pairwise overlaps
results = []
for i, file1 in enumerate(csv_files):
    for file2 in csv_files[i + 1:]:
        rows1 = file_rows[file1]
        rows2 = file_rows[file2]
        overlap = len(rows1 & rows2)
        # A file without rows has no overlap share; avoid dividing by zero.
        pct1 = overlap / len(rows1) * 100 if rows1 else 0.0
        pct2 = overlap / len(rows2) * 100 if rows2 else 0.0

        results.append({
            'File 1': file1,
            'File 2': file2,
            'Overlap Rows': overlap,
            '% of File 1': round(pct1, 1),
            '% of File 2': round(pct2, 1)
        })

# Explicit columns keep the table headers visible when there are fewer than
# two files and therefore no pairs to compare.
df_results = pd.DataFrame(results, columns=OVERLAP_COLUMNS)

# Display as styled DataFrame
display(df_results)

Loading 1 files...


## 2. Fraud Matching and Relabeling
This cell performs fraud detection in two stages:

1. **Reorganize by Member**: Group all transactions by Member ID into individual files
2. **Match Fraud Adjustments**: Find and mark original fraudulent transactions for each refund record (>10)
   - Match by amount and date (extract from description or 30-day range)
   - Prevent duplicate matching
   - Categorize as matched/unmatched/no_fraud

Output: Processed member files saved to `../../data/processed/[matched|unmatched|no_fraud]/`

### Stage 1: Reorganize transactions by member

In [7]:
# Stage 1 of fraud relabeling: regroup the cleaned transactions by member.
# The return value is kept as `num_members` (presumably the number of member
# files written — confirm against run_stage1()).
num_members = fr.run_stage1()

STAGE 1: DATA REORGANIZATION
Input: ../../data/cleaned
Output: ../../data/by_member

Found 1 files
Processing 1/1: 01-01-2025_to_03-31-2025.csv
Created 25572 member files
Sorting files...
  Sorted 100/25572 files
  Sorted 200/25572 files
  Sorted 300/25572 files
  Sorted 400/25572 files
  Sorted 500/25572 files
  Sorted 600/25572 files
  Sorted 700/25572 files
  Sorted 800/25572 files
  Sorted 900/25572 files
  Sorted 1000/25572 files
  Sorted 1100/25572 files
  Sorted 1200/25572 files
  Sorted 1300/25572 files
  Sorted 1400/25572 files
  Sorted 1500/25572 files
  Sorted 1600/25572 files
  Sorted 1700/25572 files
  Sorted 1800/25572 files
  Sorted 1900/25572 files
  Sorted 2000/25572 files
  Sorted 2100/25572 files
  Sorted 2200/25572 files
  Sorted 2300/25572 files
  Sorted 2400/25572 files
  Sorted 2500/25572 files
  Sorted 2600/25572 files
  Sorted 2700/25572 files
  Sorted 2800/25572 files
  Sorted 2900/25572 files
  Sorted 3000/25572 files
  Sorted 3100/25572 files
  Sorted 3200/2

In [8]:
from glob import glob

# Configuration
# Reuse the folder Stage 1 was configured to write to instead of repeating the
# literal path, so this check always inspects the files Stage 1 produced.
BY_MEMBER_DIR = fr.OUTPUT_MEMBER_DIR
n = 10  # Threshold

# Get all member files and count transactions
member_files = glob(os.path.join(BY_MEMBER_DIR, 'member_*.csv'))
counts = [len(pd.read_csv(f)) for f in member_files]

# Calculate statistics
total_count = len(counts)
above_n = sum(1 for c in counts if c >= n)
below_n = total_count - above_n
# Guard against an empty folder (Stage 1 not run yet) to avoid dividing by zero.
above_ratio = (above_n / total_count) * 100 if total_count else 0.0
below_ratio = (below_n / total_count) * 100 if total_count else 0.0

# Print results
print(f"Threshold set to: {n}")
print(f"Records >= {n}: {above_n:,} ({above_ratio:.2f}%)")
print(f"Records < {n}: {below_n:,} ({below_ratio:.2f}%)")

Threshold set to: 10
Records >= 10: 16,607 (64.94%)
Records < 10: 8,965 (35.06%)


### Stage 2: Fraud detection and matching
Only members whose transaction history meets the minimum length (`min_history_length`) are processed.

In [9]:
# Stage 2: Fraud detection with minimum history length filter
# Members with fewer transactions than this are skipped; the recorded log
# reports "only processing members with >= 10 transactions".
min_history_length = 10
# NOTE(review): this rebinds `stats`, which the cleaned-file analysis cell
# uses for its list of per-file summary rows.
stats = fr.run_stage2(min_history_length)

STAGE 2: FRAUD DETECTION
Input: ../../data/by_member
Output: ../../data/processed
Min History Length: 10

Found 25572 member files
Filtering: only processing members with >= 10 transactions
  Processed 100/25572 members
  Processed 200/25572 members
  Processed 300/25572 members
  Processed 400/25572 members
  Processed 500/25572 members
  Processed 600/25572 members
  Processed 700/25572 members
  Processed 800/25572 members
  Processed 900/25572 members
  Processed 1000/25572 members
  Processed 1100/25572 members
  Processed 1200/25572 members
  Processed 1300/25572 members
  Processed 1400/25572 members
  Processed 1500/25572 members
  Processed 1600/25572 members
  Processed 1700/25572 members
  Processed 1800/25572 members
  Processed 1900/25572 members
  Processed 2000/25572 members
  Processed 2100/25572 members
  Processed 2200/25572 members
  Processed 2300/25572 members
  Processed 2400/25572 members
  Processed 2500/25572 members
  Processed 2600/25572 members
  Processed 2

## 3. Feature Engineering: Description Encoding and Clustering

This stage performs advanced feature engineering on transaction descriptions:

1. **BERT Encoding**: Use BERT-tiny model to encode "Transaction Description" text into embeddings
2. **Dimensionality Reduction**: Apply PCA to reduce embedding dimensions (default: 20D)
3. **Automatic Clustering**: Find optimal cluster count (k) via heuristic search and cluster with MiniBatchKMeans
4. **Add Cluster ID**: Append `cluster_id` column to each CSV file

**Input**: Processed member files from `../../data/processed/[matched|unmatched|no_fraud]/`  
**Output**: Clustered files saved to `../../data/clustered_out/[matched|unmatched|no_fraud]/`

**Note**: This step requires GPU/CPU compute and may take significant time depending on data size.

In [12]:
# Run Stage 3: Description Encoding and Clustering
# NOTE(review): the recorded run of this cell ended with
# "ValueError: n_samples=25 should be >= n_clusters=60", i.e. clustering was
# attempted with fewer samples than fe.MAX_K. Confirm in the feature
# engineering module how samples are gathered before the k search.
outputs = fe.main()

STAGE 3: DESCRIPTION ENCODING AND CLUSTERING
Input: ../../data/processed
Model: prajjwal1/bert-tiny
Text Column: Transaction Description
PCA Dimensions: 20
Cluster Range: 10-60 (step 10)

[Found] 16607 CSV files total.
[Device] Using mps


Encoding: 1it [00:00, 100.79it/s]

[PCA] Reducing to 20 dimensions...





ValueError: n_samples=25 should be >= n_clusters=60.

### Results Analysis

In [None]:
from glob import glob
import os

# Check output files
# For each category folder, count the clustered member files and inspect one
# sample file to see whether the `cluster_id` column was added.
CLUSTERED_DIR = '../../data/clustered_out'
subfolders = ['matched', 'unmatched', 'no_fraud']

print("=" * 60)
print("STAGE 3 OUTPUT SUMMARY")
print("=" * 60)

total_files = 0
stats_list = []

for subfolder in subfolders:
    subfolder_path = os.path.join(CLUSTERED_DIR, subfolder)

    if not os.path.exists(subfolder_path):
        print(f"\n{subfolder}: Directory not found")
        continue

    # glob() returns files in arbitrary order; sort so the sampled file (and
    # therefore this report) is the same on every run.
    csv_files = sorted(glob(os.path.join(subfolder_path, 'member_*.csv')))

    if not csv_files:
        print(f"\n{subfolder}: No files found")
        continue

    # Sample one file to check cluster_id column
    sample_df = pd.read_csv(csv_files[0])
    has_cluster_id = 'cluster_id' in sample_df.columns
    # Number of distinct (non-null) cluster ids in the sample file.
    num_clusters = sample_df['cluster_id'].nunique() if has_cluster_id else "N/A"

    stats_list.append({
        'Category': subfolder,
        'Files': len(csv_files),
        'Has cluster_id': '✓' if has_cluster_id else '✗',
        'Clusters (sample)': num_clusters
    })

    print(f"\n{subfolder}:")
    print(f"  Files processed: {len(csv_files)}")
    if has_cluster_id:
        # Only categories whose sample carries the column count towards the
        # "with cluster_id" total printed below.
        total_files += len(csv_files)
        print("  Cluster ID column: ✓")
        print(f"  Clusters in sample file: {num_clusters}")
    else:
        print("  Cluster ID column: ✗")

print(f"\n{'=' * 60}")
print(f"Total files with cluster_id: {total_files}")
print(f"{'=' * 60}")

# Display summary table
if stats_list:
    df_summary = pd.DataFrame(stats_list)
    display(df_summary)

---

**Next Steps**: After Stage 3 completes, you can proceed to Stage 4 (Feature Encoding) using `04_encoding/encoding.py` to encode categorical features and prepare the final dataset for model training.