In [1]:
# Import modules
import google.generativeai as genai
import pandas as pd
import os
import random

In [3]:
# Read the Gemini API key from the environment and fail fast when it is absent or empty
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
if GOOGLE_API_KEY is None or GOOGLE_API_KEY == '':
    raise ValueError("Set GOOGLE_API_KEY in ~/.zshrc")

# Register the key with the google.generativeai client
genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
# Initialize Gemini 2.5 Flash; the model identifier is named so it is easy to spot and change
gemini_model_name = 'gemini-2.5-flash'
model = genai.GenerativeModel(gemini_model_name)

# Account Takeover (ATO) Fraud

In [4]:
# Define prompt for generating 5 unique ATO fraud modus operandi.
# Adjacent string literals are concatenated verbatim, so every sentence that is
# followed by another one must end with its own trailing space.
mo_prompt = (
    "Generate exactly 5 concise (1-2 sentences each) descriptions of realistic fraud modus operandi for Account Takeover (ATO) cases in a UK bank, each with a unique characteristic. "
    "Focus on how the fraudster gains access to the customer's credit card account (e.g., phishing, credential stuffing, social engineering) and their actions (e.g., changing details, unauthorized transactions). "
    "Ensure alignment with UK banking context (e.g., Faster Payments, UK Finance). "
    "List each modus operandi clearly as a numbered item (e.g., 1. ..., 2. ...). "
    "Each modus operandi should be unique and different than others. "
    "Avoid using markdown symbols like asterisks (*) and keep it simple ."
)

# Generate 5 unique ATO modus operandi in one API call
mo_response = model.generate_content(mo_prompt, generation_config={'max_output_tokens': 5000, 'temperature': 0.7})
mo_text = mo_response.text.strip()

# Parse the numbered list: keep lines starting with "1." .. "5." and drop the
# two-character number prefix from each of them
item_prefixes = ('1.', '2.', '3.', '4.', '5.')
mo_list = [
    line.strip()[2:].strip()
    for line in mo_text.split('\n')
    if line.strip().startswith(item_prefixes)
]

# An empty list would only surface later as an empty CSV, so stop here with a clear message
if not mo_list:
    raise ValueError("No numbered modus operandi could be parsed from the model response")

# The prompt asks for exactly 5 items; flag a mismatch instead of silently producing fewer cases
if len(mo_list) != 5:
    print("Warning: expected 5 modus operandi but parsed {}".format(len(mo_list)))

In [14]:
# Generate metadata for each MO

# Max-token range (inclusive) for each conversation duration label.
# Intended conversation lengths behind the labels:
#   short  ~3-5 minutes,  ~300-400 words
#   medium ~7-10 minutes, ~500-800 words
#   long   ~10+ minutes,  ~900+ words
duration_token_ranges = {
    'short': (5000, 7000),
    'medium': (7000, 9000),
    'long': (9000, 11000)
}
duration_labels = list(duration_token_ranges)

mo_data = []
for i, modus_operandi in enumerate(mo_list, 1):
    # Randomize conversation duration, then draw one token budget for that duration only
    duration = random.choice(duration_labels)
    max_tokens = random.randint(*duration_token_ranges[duration])

    # Store metadata
    mo_data.append({
        'Case_ID': f'ATO_Case_{i}',
        'Scenario': 'Account Takeover',
        'Fraud_Modus_Operandi': modus_operandi,
        'Duration': duration,
        'Max_Tokens': max_tokens
    })

# Save modus operandi to CSV, creating the output directory first so the write cannot
# fail just because the folder does not exist yet
mo_csv_path = '/Users/shubhadeepdas/Documents/data_science/projects/genai_transcript/output/ato_modus_operandi.csv'
os.makedirs(os.path.dirname(mo_csv_path), exist_ok=True)
mo_df = pd.DataFrame(mo_data)
mo_df.to_csv(mo_csv_path, index=False)

In [5]:
# Load the saved modus operandi metadata back from disk
mo_csv_path = '/Users/shubhadeepdas/Documents/data_science/projects/genai_transcript/output/ato_modus_operandi.csv'
mo_df = pd.read_csv(mo_csv_path)

# Print a heading, then let the frame render as the cell's output
print("ATO Modus Operandi Metadata:")
mo_df

ATO Modus Operandi Metadata:


Unnamed: 0,Case_ID,Scenario,Fraud_Modus_Operandi,Duration,Max_Tokens
0,ATO_Case_1,Account Takeover,Fraudster sends a sophisticated phishing email...,medium,7141
1,ATO_Case_2,Account Takeover,"Posing as bank fraud prevention, a fraudster c...",medium,7065
2,ATO_Case_3,Account Takeover,Fraudsters obtain a list of breached credentia...,short,5074
3,ATO_Case_4,Account Takeover,"The customer unknowingly downloads malware, su...",long,9582
4,ATO_Case_5,Account Takeover,"Through social engineering, a fraudster convin...",short,5360


In [15]:
# Generic prompt template; {modus_operandi} and {duration} are filled in per case.
# Adjacent string literals are concatenated verbatim, so each sentence carries its own trailing space.
prompt_template = (
    "Generate a realistic UK bank call transcript between an agent and a customer for an actual account takeover fraud case with the following fraud modus operandi: '{modus_operandi}'. "
    "The customer reports unusual activity on their credit card account in line with the modus operandi. "
    "The customer is either anxious or frustrated, mentioning suspicious texts or emails (e.g., fake bank alerts). "
    "The agent asks investigative questions (e.g., 'When did you last log in?', 'Have you shared your PIN or OTP?', 'What device do you use?', 'Any unusual transactions?'). "
    "Include UK banking terms (e.g., sort code, Faster Payments, UK Finance) and fraud indicators (e.g., multiple logins, urgency, vague responses). "
    "Keep dialogue natural, professional, and {duration}. "
    "Avoid using markdown symbols like asterisks (*) or any special character in the transcript, just keep a simple A: B: format. "
    "Have a proper ending of the call. "
)

# The CSV stores only the short duration label, so map each label back to the full
# length guidance; otherwise the prompt would just say "professional, and medium."
duration_guidance = {
    'short': 'short (~3-5 minutes, ~300-400 words)',
    'medium': 'medium (~7-10 minutes, ~500-800 words)',
    'long': 'long (~10+ minutes, ~900+ words)'
}

# Keywords treated as fraud indicators when they appear anywhere in a transcript (case-insensitive).
# NOTE(review): 'unrecognized' is the US spelling; UK transcripts may say 'unrecognised' - confirm whether to add it.
keywords = ['urgent', 'immediately', 'unknown', 'suspicious', 'unrecognized', 'not me', 'pressure', 'fake']

# Make sure the transcript folder exists before the first file is written
transcript_dir = '/Users/shubhadeepdas/Documents/data_science/projects/genai_transcript/output/attempt_20250711'
os.makedirs(transcript_dir, exist_ok=True)

# Generate transcripts using CSV data
transcripts_data = []
for index, row in mo_df.iterrows():
    case_id = row['Case_ID']
    modus_operandi = row['Fraud_Modus_Operandi']
    duration = row['Duration']
    # Cast to a plain int so the generation config does not receive a pandas/numpy scalar
    max_tokens = int(row['Max_Tokens'])

    # Announce the case before the API call so progress is visible while waiting
    print("Generating Case: {}".format(case_id))

    # Generate transcript using generic prompt with dynamic MO and duration;
    # unknown labels fall back to the bare label, as before
    duration_label = duration.lower()
    prompt = prompt_template.format(
        modus_operandi=modus_operandi,
        duration=duration_guidance.get(duration_label, duration_label)
    )
    response = model.generate_content(prompt, generation_config={
        'max_output_tokens': max_tokens,
        'temperature': 0.7
    })
    transcript = response.text.strip()

    # Identify fraud indicators: lowercase the transcript once, then test each keyword
    transcript_lower = transcript.lower()
    fraud_indicators = [keyword for keyword in keywords if keyword.lower() in transcript_lower]

    # Save transcript to file; explicit UTF-8 keeps non-ASCII characters independent of the platform default
    file_path = f'{transcript_dir}/transcript_{case_id.lower()}.txt'
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(transcript)

    # Store metadata
    transcripts_data.append({
        'Transcript_ID': case_id,
        'Scenario': 'Account Takeover',
        'Duration': duration,
        'Word_Count': len(transcript.split()),
        'File_Path': file_path,
        'Fraud_Indicators': ', '.join(fraud_indicators) if fraud_indicators else 'None',
        'Fraud_Modus_Operandi': modus_operandi
    })

Generating Case: ATO_Case_1
Generating Case: ATO_Case_2
Generating Case: ATO_Case_3
Generating Case: ATO_Case_4
Generating Case: ATO_Case_5


In [16]:
# Build the per-transcript metadata table and write it to CSV without the index column
metadata_csv_path = '/Users/shubhadeepdas/Documents/data_science/projects/genai_transcript/output/attempt_20250711/ato_transcripts_metadata.csv'
transcripts_df = pd.DataFrame(data=transcripts_data)
transcripts_df.to_csv(metadata_csv_path, index=False)

In [17]:
# Render the transcript metadata table as this cell's output
transcripts_df

Unnamed: 0,Transcript_ID,Scenario,Duration,Word_Count,File_Path,Fraud_Indicators,Fraud_Modus_Operandi
0,ATO_Case_1,Account Takeover,medium,1305,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, suspicious, fake",Fraudster sends a sophisticated phishing email...
1,ATO_Case_2,Account Takeover,medium,1031,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, suspicious, pressure, fake","Posing as bank fraud prevention, a fraudster c..."
2,ATO_Case_3,Account Takeover,short,882,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, unknown, suspicious, fake",Fraudsters obtain a list of breached credentia...
3,ATO_Case_4,Account Takeover,long,1385,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, suspicious, fake","The customer unknowingly downloads malware, su..."
4,ATO_Case_5,Account Takeover,short,1019,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, suspicious","Through social engineering, a fraudster convin..."


# Fraud Type: Card Not Present (CNP)
### Step 1: Generate 5 Card Not Present (CNP) Fraud Modus Operandi

In [None]:
# Prompt for generating 5 unique CNP fraud modus operandi.
# The sentences are kept as an ordered list of fragments and joined without a separator,
# so each fragment carries its own trailing space.
cnp_prompt_parts = [
    # Task and scope
    "Generate 5 unique modus operandi for Card Not Present (CNP) fraud in a UK banking context. ",
    "Each modus operandi should describe a distinct method of perpetrating CNP fraud (e.g., stolen card details via phishing, skimming, dark web purchases). ",
    "Ensure alignment with UK banking terms (e.g., Faster Payments, sort code) and focus on how fraudsters exploit online or remote transactions. ",
    # Expected fields of every returned dictionary
    "Return a list of dictionaries, each with: ",
    "'Case_ID': unique identifier like 'CNP_Case_1', ",
    "'Scenario': 'Card Not Present', ",
    "'Fraud_Modus_Operandi': 1-2 sentence description of the fraud method. ",
    # Output format constraints
    'Return only the list of dictionaries as a clean JSON string, e.g., [{"Case_ID": "CNP_Case_1", "Scenario": "Card Not Present", "Fraud_Modus_Operandi": "..."}, ...]. ',
    "Avoid special characters or newlines in the output values.",
]
mo_prompt = "".join(cnp_prompt_parts)

In [None]:
    "Generate exactly 5 concise (1-2 sentences each) descriptions of realistic fraud modus operandi for Account Takeover (ATO) cases in a UK bank, each with a unique characteristic. "
    "Focus on how the fraudster gains access to the customer's credit card account (e.g., phishing, credential stuffing, social engineering) and their actions (e.g., changing details, unauthorized transactions). "
    "Ensure alignment with UK banking context (e.g., Faster Payments, UK Finance). "
    "List each modus operandi clearly as a numbered item (e.g., 1. ..., 2. ...)."
    "Each modus operandi should be unique and different than others. "
    "Avoid using markdown symbols like asterisks (*) and keep it simple ."