# In [None]: (Jupyter notebook cell marker left over from the export)
import pandas as pd
import json
from datasets import Dataset, DatasetDict

# Load the processed dataframes from the previous step
# For this script, we assume df_lc, df_hc, and df_ieee are available pandas DataFrames.
# In a real script, you would load them from saved files.

# --- Heuristic Rule-Based Response Generation ---

def generate_lc_response(row):
    """Generate a rule-based JSON risk assessment for a Lending Club loan.

    Args:
        row: Mapping-like record (e.g. a pandas Series or dict) that provides
            the keys ``'target'``, ``'fico_range_low'``, ``'dti'`` and
            ``'int_rate'``.

    Returns:
        A JSON string (indented by 2 spaces) with the keys ``risk_level``
        (``"High"`` when ``row['target'] == 1``, otherwise ``"Low"``),
        ``justification`` (the triggered justification sentences joined by a
        single space) and ``mitigation_strategies`` (a list that is empty for
        low-risk loans).
    """
    risk_level = "High" if row['target'] == 1 else "Low"
    # Fresh lists on every call so no state leaks between rows.
    justification_points = []
    mitigation_strategies = []

    # Justification logic: each threshold that fires contributes one sentence.
    # The thresholds are evaluated independently of the label, so a low-risk
    # loan can still list cautionary indicators.
    if row['fico_range_low'] < 660:
        justification_points.append(f"The applicant's FICO score of {row['fico_range_low']} is in the sub-prime range, indicating a higher credit risk.")
    if row['dti'] > 35:
        justification_points.append(f"The debt-to-income ratio of {row['dti']:.1f}% is elevated, suggesting a strained capacity to handle new debt.")
    if row['int_rate'] > 15:
        justification_points.append(f"The loan's interest rate of {row['int_rate']:.2f}% is high, which often reflects a pre-assessed higher risk by the platform.")

    # Fallback sentence when none of the thresholds fired.
    if not justification_points:
        if risk_level == "High":
            justification_points.append("The loan was marked as high-risk based on historical outcome data, despite primary indicators appearing stable.")
        else:
            justification_points.append("The applicant presents a strong credit profile with a good FICO score, manageable DTI, and a reasonable interest rate.")

    # Mitigation logic: only high-risk loans receive mitigation strategies.
    if risk_level == "High":
        mitigation_strategies.extend([
            "Consider denying the loan application due to the high probability of default.",
            "If approving, require additional collateral to secure the loan.",
            "Apply risk-based pricing by maintaining or increasing the high interest rate.",
            "Insert strict covenants into the loan agreement, such as restrictions on incurring further debt.",
        ])

    response_json = {
        "risk_level": risk_level,
        "justification": " ".join(justification_points),
        "mitigation_strategies": mitigation_strategies
    }
    return json.dumps(response_json, indent=2)

def generate_ieee_response(row):
    """Generate a rule-based JSON risk assessment for an IEEE transaction.

    Args:
        row: Mapping-like record (e.g. a pandas Series or dict) that provides
            the keys ``'target'``, ``'TransactionAmt'``, ``'P_emaildomain'``
            and ``'card6'``.

    Returns:
        A JSON string (indented by 2 spaces) with the keys ``risk_level``
        (``"High"`` when ``row['target'] == 1``, otherwise ``"Low"``),
        ``justification`` (the triggered justification sentences joined by a
        single space) and ``mitigation_strategies`` (a list that is empty for
        low-risk transactions).
    """
    risk_level = "High" if row['target'] == 1 else "Low"
    # Fresh lists on every call so no state leaks between rows.
    justification_points = []
    mitigation_strategies = []

    # Justification logic: each rule that fires contributes one sentence.
    # NOTE(review): the amount is read from 'TransactionAmt', the column name
    # assumed from the IEEE-CIS fraud schema -- confirm it matches df_ieee.
    # Comparing the whole row against 1000 is not a meaningful scalar test.
    amount = row['TransactionAmt']
    if amount > 1000:
        justification_points.append(f"The transaction amount of ${amount:.2f} is significantly large and warrants scrutiny.")
    if row['P_emaildomain'] in ['mail.com', 'protonmail.com', 'anonymous.com']:
        justification_points.append(f"The purchaser's email domain ({row['P_emaildomain']}) is associated with a higher incidence of fraud.")
    if row['card6'] == 'charge card':
        justification_points.append("The use of a charge card can sometimes be linked to fraudulent activities.")

    # Fallback sentence when none of the rules fired.
    if not justification_points:
        if risk_level == "High":
            justification_points.append("The transaction was flagged as high-risk by the system based on complex patterns not immediately apparent from basic features.")
        else:
            justification_points.append("The transaction appears normal, with a standard amount and from a trusted domain.")

    # Mitigation logic: only high-risk transactions receive strategies.
    if risk_level == "High":
        mitigation_strategies.extend([
            "Decline the transaction immediately to prevent financial loss.",
            "Flag the account for manual review by a fraud analyst.",
            "Trigger a multi-factor authentication step (e.g., SMS code) to verify the user's identity before proceeding.",
        ])

    response_json = {
        "risk_level": risk_level,
        "justification": " ".join(justification_points),
        "mitigation_strategies": mitigation_strategies
    }
    return json.dumps(response_json, indent=2)

# --- Formatting Function ---

def create_instruction_dataset(df, response_generator):
    """Create a formatted instruction dataset from a dataframe.

    Every row of ``df`` becomes one training example whose single ``"text"``
    field holds the full prompt: a fixed system instruction, the row's
    features serialized as a bullet list (the ``'target'`` column is
    excluded so the label is not shown in the prompt), and the response
    produced by ``response_generator``.

    Args:
        df: pandas DataFrame; it is iterated with ``iterrows()``.
        response_generator: Callable that takes one row and returns the
            response string for that row.

    Returns:
        The result of ``Dataset.from_list`` applied to a list of
        ``{"text": ...}`` records, one per row of ``df``.
    """

    instruction = "You are an expert AI Risk Analyst. Your task is to evaluate the financial event detailed below. Provide a structured JSON response containing your risk assessment, a clear justification for your conclusion based on the provided data, and a list of actionable mitigation strategies."

    records = []
    for _, row in df.iterrows():
        # Serialize the row data into a human-readable context string.
        context_parts = [f"- {col}: {val}" for col, val in row.items() if col != 'target']
        context = "Financial Event Details:\n" + "\n".join(context_parts)

        # Generate the gold-standard response.
        response = response_generator(row)

        # Format for SFTTrainer: a single text column with the full prompt,
        # written with Llama-3-style chat header/end-of-turn tokens.
        text = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{response}<|eot_id|>"
        records.append({"text": text})

    return Dataset.from_list(records)


# --- Generate and Combine Datasets ---
# Using smaller samples for demonstration purposes.
SAMPLE_SIZE = 5000

# Cap the sample size at the frame length: DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement).
lc_sample = (
    df_lc.sample(n=min(SAMPLE_SIZE, len(df_lc)), random_state=42)
    if not df_lc.empty
    else pd.DataFrame()
)
ieee_sample = (
    df_ieee.sample(n=min(SAMPLE_SIZE, len(df_ieee)), random_state=42)
    if not df_ieee.empty
    else pd.DataFrame()
)

# Create Hugging Face Datasets, one per non-empty source.
instruction_datasets = []
if not lc_sample.empty:
    instruction_datasets.append(create_instruction_dataset(lc_sample, generate_lc_response))
if not ieee_sample.empty:
    instruction_datasets.append(create_instruction_dataset(ieee_sample, generate_ieee_response))
# Note: A generator for Home Credit would be added here following the same pattern.

if instruction_datasets:
    # Concatenate all datasets into one
    from datasets import concatenate_datasets
    full_instruction_dataset = concatenate_datasets(instruction_datasets)

    # Create a train/test split
    final_dataset = full_instruction_dataset.train_test_split(test_size=0.1, seed=42)

    print("Instruction dataset created successfully.")
    print(f"Total examples: {len(full_instruction_dataset)}")
    print(f"Training examples: {len(final_dataset['train'])}")
    print(f"Testing examples: {len(final_dataset['test'])}")
    print("\n--- Sample Training Example ---")
    # Show a single example; indexing by column name alone would print the
    # whole text column.
    print(final_dataset['train'][0]['text'])
else:
    print("Could not generate instruction dataset as no source data was available.")