# Pipeline for Data Preprocessing

## 0. Import and Configuration

In [5]:
import sys
import os
import pandas as pd
import importlib.util

# Import data cleaning module
sys.path.insert(0, os.path.abspath('./01_data_cleaning'))
spec = importlib.util.spec_from_file_location(
    "data_cleaning",
    "./01_data_cleaning/01_data_cleaning.py"
)
dc = importlib.util.module_from_spec(spec)
spec.loader.exec_module(dc)

# Import feature engineering module
spec = importlib.util.spec_from_file_location(
    "feature_engineering",
    "02_feature_engineering/02_feature_engineering.py"
)
fe = importlib.util.module_from_spec(spec)
spec.loader.exec_module(fe)

# Import fraud relabeling module
spec = importlib.util.spec_from_file_location(
    "fraud_relabeling",
    "03_fraud_relabeling/03_fraud_relabeling.py"
)
fr = importlib.util.module_from_spec(spec)
spec.loader.exec_module(fr)

# Import encoding module
spec = importlib.util.spec_from_file_location(
    "encoding",
    "04_encoding/04_encoding.py"
)
enc = importlib.util.module_from_spec(spec)
spec.loader.exec_module(enc)

# Configure data cleaning
dc.ENABLE_RENAMING = True
dc.RAW_DIR = '../../data/train/raw'
dc.CLEANED_DIR = '../../data/train/cleaned'

# Configure feature engineering
fe.PROCESSED_DIR = '../../data/train/cleaned'  # Read from cleaned data
fe.MODEL_NAME = 'prajjwal1/bert-tiny'
fe.TEXT_COLUMN = 'Transaction Description'
fe.BATCH_SIZE = 64
fe.MAX_LENGTH = 64
fe.PCA_DIM = 20
fe.MIN_K = 10
fe.MAX_K = 60
fe.K_STEP = 10
fe.SAMPLE_SIZE = 10000
fe.CLUSTER_BATCH_SIZE = 4096
fe.RANDOM_STATE = 42

# Configure fraud relabeling
fr.INPUT_DIR = '../../data/train/clustered_out'  # Read from feature engineering output
fr.OUTPUT_MEMBER_DIR = '../../data/train/by_member/temp'  # Temporary directory
fr.OUTPUT_PROCESSED_DIR = '../../data/train/by_member'  # Output directory with matched/unmatched/no_fraud
fr.CHUNKSIZE = 50000

# Configure encoding
enc.PROCESSED_DIR = '../../data/train/by_member'  # Read from fraud relabeling output
enc.OUTPUT_DIR = '../../data/train/final'
enc.CONFIG_PATH = '../../config/tokenize_dict.json'

# Create directories if they don't exist
directories = [
    dc.RAW_DIR,
    dc.CLEANED_DIR,
    fr.INPUT_DIR,
    fr.OUTPUT_MEMBER_DIR,
    enc.OUTPUT_DIR
]

for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")

Created directory: ../../data/train/cleaned
Created directory: ../../data/train/clustered_out
Created directory: ../../data/train/by_member/temp
Created directory: ../../data/train/final


## 1. Data Cleaning

This cell performs the following preprocessing tasks:
1. Standardize headers (e.g., "AccountID" → "Account ID")
2. Fix comma issues (remove extra commas in field values)
3. Clean Amount field (remove $ and commas, convert to numeric)
4. Fill missing values (Amount→0, others→"Unknown", "null"→empty)
5. Rename files based on date range (MM-DD-YYYY-MM-DD-YYYY.csv)

**Input**: Raw CSV files from `../../data/raw/`  
**Output**: Cleaned and renamed CSV files saved to `../../data/cleaned/`

In [6]:
dc.main()

Raw: ../../data/train/raw
Cleaned: ../../data/train/cleaned

Found 3 CSV files in ../../data/train/raw

CSV Files List ⬇️
  1. part03.csv (3.03 MB)
  2. part02.csv (2.97 MB)
  3. part01.csv (2.99 MB)

Processing Files...

[1/3] part03.csv... Amount:27362, Missing:1299, →10-05-2024_to_10-23-2024.csv
[2/3] part02.csv... Amount:27361, Missing:1624, →09-19-2024_to_10-05-2024.csv
[3/3] part01.csv... Amount:27351, Missing:1277, →09-01-2024_to_09-19-2024.csv

Processing Complete!


### Cleaned File Analysis

#### Size & Rows & Date Span

In [7]:
CLEANED_DIR = dc.CLEANED_DIR
csv_files = sorted([f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv')])

# Load all dataframes
dfs = {}
for filename in csv_files:
    dfs[filename] = pd.read_csv(os.path.join(CLEANED_DIR, filename))

# Collect stats
stats = []
for filename, df in dfs.items():
    df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')
    min_date = df['Post Date'].min()
    max_date = df['Post Date'].max()
    fraud_count = ((df['Fraud Adjustment Indicator'].notna()) &
                   (df['Fraud Adjustment Indicator'] != '')).sum()

    stats.append({
        'File': filename,
        'Rows': len(df),
        'Members': df['Member ID'].nunique(),
        'Date From': min_date.strftime('%m/%d/%Y') if pd.notna(min_date) else 'N/A',
        'Date To': max_date.strftime('%m/%d/%Y') if pd.notna(max_date) else 'N/A',
        'Days': (max_date - min_date).days if pd.notna(min_date) else 0,
        'Fraud %': round(fraud_count / len(df) * 100, 4) if len(df) > 0 else 0
    })

# Display table
df_stats = pd.DataFrame(stats)
display(df_stats)

# Summary
all_members = set()
for df in dfs.values():
    all_members.update(df['Member ID'].dropna())

total_fraud = sum([((dfs[f]['Fraud Adjustment Indicator'].notna()) &
                    (dfs[f]['Fraud Adjustment Indicator'] != '')).sum()
                   for f in dfs.keys()])

print(f"\nTotal Rows: {df_stats['Rows'].sum():,}")
print(f"Total Unique Members: {len(all_members):,}")
print(f"Total Fraud Indicators: {total_fraud:,}")
print(f"Overall Fraud %: {round(total_fraud / df_stats['Rows'].sum() * 100, 4)}%")

Unnamed: 0,File,Rows,Members,Date From,Date To,Days,Fraud %
0,09-01-2024_to_09-19-2024.csv,27363,1116,09/01/2024,09/19/2024,18,0.0365
1,09-19-2024_to_10-05-2024.csv,27363,1446,09/19/2024,10/05/2024,16,0.0073
2,10-05-2024_to_10-23-2024.csv,27363,1129,10/05/2024,10/23/2024,18,0.0731



Total Rows: 82,089
Total Unique Members: 1,490
Total Fraud Indicators: 32
Overall Fraud %: 0.039%


#### Overlapping detection

In [8]:
CLEANED_DIR = dc.CLEANED_DIR
csv_files = sorted([f for f in os.listdir(CLEANED_DIR) if f.endswith('.csv')])

print(f"Loading {len(csv_files)} files...")

# Load files and create row IDs
file_rows = {}
for filename in csv_files:
    df = pd.read_csv(os.path.join(CLEANED_DIR, filename))
    row_ids = set(df['Account ID'].astype(str) + '|' +
                  df['Member ID'].astype(str) + '|' +
                  df['Post Date'].astype(str) + '|' +
                  df['Post Time'].astype(str) + '|' +
                  df['Amount'].astype(str))
    file_rows[filename] = row_ids

# Calculate pairwise overlaps
results = []
for i, file1 in enumerate(csv_files):
    for j, file2 in enumerate(csv_files):
        if i < j:
            overlap = len(file_rows[file1] & file_rows[file2])
            pct1 = overlap / len(file_rows[file1]) * 100
            pct2 = overlap / len(file_rows[file2]) * 100

            results.append({
                'File 1': file1,
                'File 2': file2,
                'Overlap Rows': overlap,
                '% of File 1': round(pct1, 1),
                '% of File 2': round(pct2, 1)
            })

df_results = pd.DataFrame(results)

# Display as styled DataFrame
display(df_results)

Loading 3 files...


Unnamed: 0,File 1,File 2,Overlap Rows,% of File 1,% of File 2
0,09-01-2024_to_09-19-2024.csv,09-19-2024_to_10-05-2024.csv,1,0.0,0.0
1,09-01-2024_to_09-19-2024.csv,10-05-2024_to_10-23-2024.csv,0,0.0,0.0
2,09-19-2024_to_10-05-2024.csv,10-05-2024_to_10-23-2024.csv,0,0.0,0.0


## 2. Feature Engineering: Description Encoding and Clustering

This stage performs advanced feature engineering on transaction descriptions:

1. **BERT Encoding**: Use BERT-tiny model to encode "Transaction Description" text into embeddings
2. **Dimensionality Reduction**: Apply PCA to reduce embedding dimensions (default: 20D)
3. **Automatic Clustering**: Find optimal cluster count (k) via heuristic search and cluster with MiniBatchKMeans
4. **Add Cluster ID**: Append `cluster_id` column to each CSV file

**Input**: Cleaned CSV files from `../../data/cleaned/`  
**Output**: Clustered files saved to `../../data/clustered_out/`

**Note**: This step requires GPU/CPU compute and may take significant time depending on data size.

In [9]:
outputs = fe.main()

STAGE 2: DESCRIPTION ENCODING AND CLUSTERING
Input: ../../data/train/cleaned
Output: /Users/wwy/Documents/CMU/25-Fall/Practicum/Clearshield/data/train/clustered_out
Model: prajjwal1/bert-tiny
Text Column: Transaction Description
PCA Dimensions: 20
Cluster Range: 10-60 (step 10)

[Scan] Found 3 CSV file(s) in ../../data/train/cleaned


                                   

[Done] Saved 3 clustered file(s) to /Users/wwy/Documents/CMU/25-Fall/Practicum/Clearshield/data/train/clustered_out

STAGE 2 COMPLETE
Processed 3 files
Output location: /Users/wwy/Documents/CMU/25-Fall/Practicum/Clearshield/data/train/clustered_out


## 3. Fraud Matching and Re-label
This cell performs fraud detection in two stages:

1. **Reorganize by Member**: Group all transactions by Member ID into individual files (temp directory)
2. **Match Fraud Adjustments**: Find and mark original fraudulent transactions for each refund record (≥10 transactions)
   - Match by amount and date (extract from description or 30-day range)
   - Prevent duplicate matching
   - Categorize as matched/unmatched/no_fraud
   - Automatically delete temp directory after processing

**Input**: Clustered files from `../../data/train/clustered_out/`  
**Output**: Processed member files saved to `../../data/train/by_member/[matched|unmatched|no_fraud]/`

### Stage 3-1: Reorganize transactions by member

**Input**: Clustered files from `../../data/train/clustered_out/`  
**Output**: Member-grouped files saved to `../../data/train/by_member/temp/` (temporary)

In [10]:
num_members = fr.run_stage1()

STAGE 1: DATA REORGANIZATION
Input: ../../data/train/clustered_out
Output: ../../data/train/by_member/temp

Found 3 files
Processing 1/3: 09-01-2024_to_09-19-2024.csv
Processing 2/3: 09-19-2024_to_10-05-2024.csv
Processing 3/3: 10-05-2024_to_10-23-2024.csv
Modified 1490 member files this run
Sorting modified files...
  Sorted 1000/1490 files

1490 member files created



In [11]:
from glob import glob

# Configuration
BY_MEMBER_DIR = '../../data/by_member'
n = 10  # Threshold

# Get all member files and count transactions
member_files = glob(os.path.join(BY_MEMBER_DIR, 'member_*.csv'))
counts = [len(pd.read_csv(f)) for f in member_files]

# Calculate statistics
total_count = len(counts)
above_n = sum(1 for c in counts if c >= n)
below_n = total_count - above_n
above_ratio = (above_n / total_count) * 100
below_ratio = (below_n / total_count) * 100

# Print results
print(f"Threshold set to: {n}")
print(f"Records >= {n}: {above_n:,} ({above_ratio:.2f}%)")
print(f"Records < {n}: {below_n:,} ({below_ratio:.2f}%)")

ZeroDivisionError: division by zero

### Stage 3-2: Fraud detection and matching

Filter members with minimum history length (≥10 transactions), then match fraud adjustments to original transactions.

**Input**: Member-grouped files from `../../data/train/by_member/temp/`  
**Output**: Processed and categorized files saved to `../../data/train/by_member/[matched|unmatched|no_fraud]/`

**Note**: The temp directory will be automatically deleted after processing completes.

In [7]:
# Stage 2: Fraud detection with minimum history length filter
min_history_length = 10
stats = fr.run_stage2(min_history_length)

STAGE 2: FRAUD DETECTION
Input: ../../data/train/by_member/temp
Output: ../../data/train/by_member
Min History Length: 10

Found 0 member files
Filtering: only processing members with >= 10 transactions
Summary saved to: ../../data/train/by_member/member_summary.csv

Processing Summary:
  Total Processed: 0
  Skipped (< 10 txns): 0
  No Fraud: 0
  Matched: 0
  Unmatched: 0

COMPLETE


## 4. Feature Encoding

This stage encodes categorical features and prepares the final dataset for model training:

1. **Remove ID Columns**: Delete Account ID and Member ID
2. **Encode Categorical Features**: Convert categorical columns to numeric using predefined dictionary
   - Account Type, Action Type, Source Type, Product ID
3. **Parse Time Features**: Convert Post Time to decimal hours
4. **Convert Date Features**: Parse Post Date and Account Open Date
5. **Clean Up**: Remove text columns (Transaction Description, Fraud Adjustment Indicator)

**Input**: Processed member files from `../../data/train/by_member/[matched|unmatched|no_fraud]/`  
**Output**: Final encoded files saved to `../../data/train/final/[matched|unmatched|no_fraud]/`

In [20]:
total_processed = enc.encode_features(enc.PROCESSED_DIR, enc.OUTPUT_DIR, enc.CONFIG_PATH)

FEATURE ENCODING
Input Dir: ../../data/train/processed
Output Dir: ../../data/train/final
Config Path: ../../config/tokenize_dict.json

Loaded encoding dictionary with 4 features

matched: Found 96 files
  matched: Encoded 96/96 files

unmatched: Found 17 files
  unmatched: Encoded 17/17 files

no_fraud: Found 1391 files
  Processed 100/1391 files
  Processed 200/1391 files
  Processed 300/1391 files
  Processed 400/1391 files
  Processed 500/1391 files
  Processed 600/1391 files
  Processed 700/1391 files
  Processed 800/1391 files
  Processed 900/1391 files
  Processed 1000/1391 files
  Processed 1100/1391 files
  Processed 1200/1391 files
  Processed 1300/1391 files
  no_fraud: Encoded 1391/1391 files

Encoding Complete!
Total files found: 1504
Total files processed: 1504


TODO : vulnerability scanner

---

## Pipeline Complete!

The complete data preprocessing pipeline consists of 4 stages:

1. **Data Cleaning**: Raw CSV → Cleaned CSV (`data/train/cleaned/`)
2. **Feature Engineering**: Cleaned CSV → Clustered CSV (`data/train/clustered_out/`)
3. **Fraud Matching**: Clustered CSV → Categorized by Member (`data/train/by_member/[matched|unmatched|no_fraud]/`)
4. **Feature Encoding**: Processed CSV → Final Encoded Dataset (`data/train/final/[matched|unmatched|no_fraud]/`)

**Final Output**: `../../data/train/final/[matched|unmatched|no_fraud]/member_*.csv`

These final encoded files are ready for model training!