In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
def generate_transaction_data(num_claims: int = 200, start_date='2020-01-01',
                              end_date='2023-12-31', seed=42) -> pd.DataFrame:
    """Generate a synthetic, transaction-level insurance claims dataset.

    Each claim gets an accident date (evenly spaced between ``start_date`` and
    ``end_date``), a line of business, a lognormal reporting lag, and a
    lognormal ultimate amount. Every claim is then expanded into 2-4
    transactions: one 'Initial Reserve', zero to two 'Partial Payment' rows,
    and one 'Final Payment' that releases the remaining reserve.

    Note: the initial reserve is the ultimate amount scaled by a random factor
    in [0.8, 1.2), and payments are drawn down from that reserve, so the total
    paid on a claim equals its *initial reserve*, not its ultimate amount.

    Parameters
    ----------
    num_claims : int, default 200
        Number of claims to simulate. ``0`` returns an empty frame that still
        carries the full output schema.
    start_date, end_date : str or datetime-like
        Bounds of the accident-date range; anything ``pd.to_datetime`` accepts.
    seed : int or None, default 42
        Seed for a private ``np.random.RandomState``. The default reproduces
        the same draws as the legacy ``np.random.seed(42)`` global stream, but
        without reseeding the process-wide NumPy RNG. ``None`` seeds from OS
        entropy (non-reproducible).

    Returns
    -------
    pandas.DataFrame
        One row per transaction, sorted by ``claim_id`` then
        ``transaction_date``, with columns: claim_id, transaction_date,
        accident_date, reported_date, transaction_type, paid_amount,
        case_reserve, transaction_number, line_of_business, cumulative_paid.

    Raises
    ------
    ValueError
        If ``num_claims`` is negative.
    """
    if num_claims < 0:
        raise ValueError(f"num_claims must be non-negative, got {num_claims}")

    output_columns = [
        'claim_id', 'transaction_date', 'accident_date', 'reported_date',
        'transaction_type', 'paid_amount', 'case_reserve',
        'transaction_number', 'line_of_business',
    ]

    # With no claims there is nothing to sort or accumulate; return the schema
    # only so downstream column access keeps working.
    if num_claims == 0:
        return pd.DataFrame(columns=output_columns + ['cumulative_paid'])

    # Private generator: reproducible without touching the global np.random state.
    rng = np.random.RandomState(seed)

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    # --- Claim-level attributes (draw order is fixed to keep the stream stable) ---
    claims_base = pd.DataFrame({
        'claim_id': range(1, num_claims + 1),
        'accident_date': pd.date_range(start=start, end=end, periods=num_claims),
        'line_of_business': rng.choice(['Auto', 'Property', 'Liability'],
                                       size=num_claims, p=[0.5, 0.3, 0.2]),
    })

    # Reporting lag in whole days, added to the accident date in one vectorised step.
    raw_lag = rng.lognormal(mean=3.5, sigma=1.0, size=num_claims)
    claims_base['reporting_lag'] = np.round(raw_lag).astype('int64')
    claims_base['reported_date'] = (
        claims_base['accident_date']
        + pd.to_timedelta(claims_base['reporting_lag'], unit='D')
    )

    # Ultimate claim amount, rounded to cents.
    claims_base['ultimate_amount'] = np.round(
        rng.lognormal(mean=8.5, sigma=0.5, size=num_claims), 2)

    # --- Transaction-level expansion ---
    max_settlement_lag = 365 * 2  # settlement happens within two years of reporting
    transactions_list = []

    for claim in claims_base.itertuples(index=False):
        # 2-4 transactions per claim (upper bound of randint is exclusive).
        n_transactions = rng.randint(2, 5)

        # Initial case estimate deviates from the ultimate by up to +/-20%.
        initial_reserve = claim.ultimate_amount * rng.uniform(0.8, 1.2)

        # Transactions are evenly spaced from the report date to settlement.
        settlement_lag = rng.randint(30, max_settlement_lag)
        transaction_dates = pd.date_range(
            start=claim.reported_date,
            end=claim.reported_date + pd.Timedelta(days=settlement_lag),
            periods=n_transactions,
        )

        def make_record(number, kind, paid, reserve):
            # One output row; claim-level fields are repeated on every transaction.
            return {
                'claim_id': claim.claim_id,
                'transaction_date': transaction_dates[number - 1],
                'accident_date': claim.accident_date,
                'reported_date': claim.reported_date,
                'transaction_type': kind,
                'paid_amount': paid,
                'case_reserve': reserve,
                'transaction_number': number,
                'line_of_business': claim.line_of_business,
            }

        # Opening transaction: reserve is set, nothing paid yet.
        transactions_list.append(
            make_record(1, 'Initial Reserve', 0.0, initial_reserve))

        # Interim payments each release 30-70% of whatever reserve is left.
        remaining_amount = initial_reserve
        for number in range(2, n_transactions):
            partial_payment = remaining_amount * rng.uniform(0.3, 0.7)
            remaining_amount -= partial_payment
            transactions_list.append(
                make_record(number, 'Partial Payment', partial_payment, remaining_amount))

        # Closing transaction pays out the remaining reserve and zeroes it.
        transactions_list.append(
            make_record(n_transactions, 'Final Payment', remaining_amount, 0))

    # Explicit column list pins the output order independently of dict ordering.
    transactions_df = pd.DataFrame(transactions_list, columns=output_columns)
    transactions_df = transactions_df.sort_values(['claim_id', 'transaction_date'])

    # Running total of payments within each claim.
    transactions_df['cumulative_paid'] = (
        transactions_df.groupby('claim_id')['paid_amount'].cumsum())

    return transactions_df

In [3]:
# Build the synthetic transaction history (the generator is seeded, so reruns give the same data).
transactions_data = generate_transaction_data(num_claims=200)

# Pick the claim on the first row of the sorted output and show all of its transactions.
sample_claim = transactions_data['claim_id'].iloc[0]
sample_claim_rows = transactions_data.loc[transactions_data['claim_id'] == sample_claim]
print(f"\nSample transactions for claim_id {sample_claim}:")
print(sample_claim_rows)

# Persist the full dataset (without the index) in the working directory.
transactions_data.to_csv('synthetic_claims_transactions.csv', index=False)


Sample transactions for claim_id 1:
   claim_id transaction_date accident_date reported_date transaction_type  \
0         1       2020-01-18    2020-01-01    2020-01-18  Initial Reserve   
1         1       2020-07-25    2020-01-01    2020-01-18  Partial Payment   
2         1       2021-01-30    2020-01-01    2020-01-18    Final Payment   

   paid_amount  case_reserve  transaction_number line_of_business  \
0     0.000000   6182.321146                   1             Auto   
1  2200.343432   3981.977714                   2             Auto   
2  3981.977714      0.000000                   3             Auto   

   cumulative_paid  
0         0.000000  
1      2200.343432  
2      6182.321146  
