In [1]:
# Import modules
import google.generativeai as genai
import pandas as pd
import os
import random
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file (if present) into the process environment
load_dotenv()

# Read the Gemini API key from the GEMINI_API_KEY environment variable.
# The key itself is never printed, so it cannot leak into saved cell outputs.
GOOGLE_API_KEY = os.getenv('GEMINI_API_KEY')
if not GOOGLE_API_KEY:
    # Name the variable that is actually read above so the error is actionable
    raise ValueError("Set GEMINI_API_KEY in your environment (e.g. in a .env file or ~/.zshrc)")

print("API Key retrieved successfully")
genai.configure(api_key=GOOGLE_API_KEY)

API Key retrieved successfully


In [3]:
# Create the Gemini 2.5 Flash client; the model identifier is named separately
# so it is easy to spot and change.
GEMINI_MODEL_NAME = 'gemini-2.5-flash'
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

# Account Takeover (ATO) Fraud

In [4]:
# Prompt asking the model for 5 unique ATO fraud modus operandi (MO) as a numbered list.
# The fragments are joined by implicit string concatenation, so each sentence must end
# with a space; otherwise adjacent sentences run together in the prompt.
mo_prompt = (
    "Generate exactly 5 concise (1-2 sentences each) descriptions of realistic fraud modus operandi for Account Takeover (ATO) cases in a UK bank, each with a unique characteristic. "
    "Focus on how the fraudster gains access to the customer's credit card account (e.g., phishing, credential stuffing, social engineering) and their actions (e.g., changing details, unauthorized transactions). "
    "Ensure alignment with UK banking context (e.g., Faster Payments, UK Finance). "
    "List each modus operandi clearly as a numbered item (e.g., 1. ..., 2. ...). "
    "Each modus operandi should be unique and different than others. "
    "Avoid using markdown symbols like asterisks (*) and keep it simple ."
)

# Generate all 5 ATO modus operandi in one API call
mo_response = model.generate_content(mo_prompt, generation_config={'max_output_tokens': 5000, 'temperature': 0.7})
mo_text = mo_response.text.strip()

# Parse the numbered list: keep lines that start with "1." .. "5." and drop the
# two-character numeric prefix from each of them.
mo_prefixes = ('1.', '2.', '3.', '4.', '5.')
mo_list = [
    line.strip()[2:].strip()
    for line in mo_text.split('\n')
    if line.strip().startswith(mo_prefixes)
]
# Discard items that are empty once the prefix is removed (e.g. a bare "3." line)
mo_list = [mo for mo in mo_list if mo]

# Fail early with a clear message instead of silently producing no cases downstream
if not mo_list:
    raise ValueError("No numbered modus operandi could be parsed from the model response")
if len(mo_list) != 5:
    print(f"Warning: expected 5 modus operandi but parsed {len(mo_list)}")

In [6]:
# Build one metadata record per modus operandi and save the records as a CSV.

# Destination of the modus operandi metadata CSV
mo_csv_path = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_modus_operandi.csv'

# Inclusive (min, max) range for the max-output-token budget of each conversation length
token_ranges = {
    'short': (5000, 7000),
    'medium': (7000, 9000),
    'long': (9000, 11000)
}

mo_data = []
for i, modus_operandi in enumerate(mo_list, 1):
    # Randomize conversation duration
    duration = random.choice([
        'short (~3-5 minutes, ~300-400 words)',
        'medium (~7-10 minutes, ~500-800 words)',
        'long (~10+ minutes, ~900+ words)'
    ])
    # The bare label is the text before the parenthesised description
    duration_label = duration.split('(')[0].strip()

    # Draw a single token budget from the range that matches the chosen duration
    max_tokens = random.randint(*token_ranges[duration_label.lower()])

    # Store metadata
    mo_data.append({
        'Case_ID': f'ATO_Case_{i}',
        'Scenario': 'Account Takeover',
        'Fraud_Modus_Operandi': modus_operandi,
        'Duration': duration_label,
        'Max_Tokens': max_tokens
    })

# Make sure the output directory exists before writing
os.makedirs(os.path.dirname(mo_csv_path), exist_ok=True)

# Explicit columns keep the CSV header present even when there are no records,
# so the file can still be read back with pd.read_csv.
mo_df = pd.DataFrame(
    mo_data,
    columns=['Case_ID', 'Scenario', 'Fraud_Modus_Operandi', 'Duration', 'Max_Tokens']
)
mo_df.to_csv(mo_csv_path, index=False)

In [7]:
# Reload the saved modus operandi metadata from disk
mo_csv_file = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_modus_operandi.csv'
mo_df = pd.read_csv(mo_csv_file)

# Print a heading, then leave the frame as the last expression so the notebook renders it
print("ATO Modus Operandi Metadata:")
mo_df

ATO Modus Operandi Metadata:


Unnamed: 0,Case_ID,Scenario,Fraud_Modus_Operandi,Duration,Max_Tokens
0,ATO_Case_1,Account Takeover,A fraudster uses a sophisticated phishing emai...,long,9870
1,ATO_Case_2,Account Takeover,"Through a vishing call, a fraudster poses as b...",short,5397
2,ATO_Case_3,Account Takeover,Leveraging credentials stolen from a third-par...,long,10293
3,ATO_Case_4,Account Takeover,A fraudster executes a SIM swap attack by conv...,medium,7207
4,ATO_Case_5,Account Takeover,"The customer inadvertently downloads malware, ...",short,6875


In [9]:
# Generic prompt template; {modus_operandi} and {duration} are filled in per case.
# The fragments are joined by implicit string concatenation, so each sentence must end
# with a space; otherwise adjacent sentences run together in the prompt.
prompt_template = (
    "Generate a realistic UK bank call transcript between an agent and a customer for an actual account takeover fraud case with the following fraud modus operandi: '{modus_operandi}'. "
    "The customer reports unusual activity on their credit card account in line with the modus operandi. "
    "The customer is either anxious or frustrated, mentioning suspicious texts or emails (e.g., fake bank alerts). "
    "The agent asks investigative questions (e.g., 'When did you last log in?', 'Have you shared your PIN or OTP?', 'What device do you use?', 'Any unusual transactions?'). "
    "Include UK banking terms (e.g., sort code, Faster Payments, UK Finance) and fraud indicators (e.g., multiple logins, urgency, vague responses). "
    "Keep dialogue natural, professional, and {duration}. "
    "Avoid using markdown symbols like asterisks (*) or any special character in the transcript, just keep a simple A: B: format. "
    "Have a proper ending of the call. "
)

# The 'Duration' column only holds the bare label (short/medium/long), so map it back
# to explicit length guidance; a bare "short" gives the model no target length.
duration_guidance = {
    'short': 'short (~3-5 minutes, ~300-400 words)',
    'medium': 'medium (~7-10 minutes, ~500-800 words)',
    'long': 'long (~10+ minutes, ~900+ words)'
}

# Lower-case keywords searched for (as substrings) in each transcript
fraud_keywords = ['urgent', 'immediately', 'unknown', 'suspicious', 'unrecognized', 'not me', 'pressure', 'fake']

# Directory that receives one transcript text file per case
output_dir = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output'
os.makedirs(output_dir, exist_ok=True)

# Generate transcripts using CSV data
transcripts_data = []
for index, row in mo_df.iterrows():
    case_id = row['Case_ID']
    modus_operandi = row['Fraud_Modus_Operandi']
    duration = row['Duration']
    # Cast to a plain int in case pandas hands back a NumPy integer
    max_tokens = int(row['Max_Tokens'])

    # Announce the case before the (slow) API call so progress is visible while waiting
    print("Generating Case: {}".format(case_id))

    # Fill the generic prompt with this case's MO and length guidance;
    # unknown labels fall back to the label itself.
    duration_text = duration_guidance.get(duration.lower(), duration.lower())
    prompt = prompt_template.format(modus_operandi=modus_operandi, duration=duration_text)
    response = model.generate_content(prompt, generation_config={
        'max_output_tokens': max_tokens,
        'temperature': 0.7
    })
    transcript = response.text.strip()

    # Identify fraud indicators: case-insensitive substring match, lower-casing the text once
    transcript_lower = transcript.lower()
    fraud_indicators = [keyword for keyword in fraud_keywords if keyword in transcript_lower]

    # Save transcript to file; explicit UTF-8 so non-ASCII characters are written
    # the same way on every platform.
    file_path = f'{output_dir}/transcript_{case_id.lower()}.txt'
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(transcript)

    # Store metadata
    transcripts_data.append({
        'Transcript_ID': case_id,
        'Scenario': 'Account Takeover',
        'Duration': duration,
        'Word_Count': len(transcript.split()),
        'File_Path': file_path,
        'Fraud_Indicators': ', '.join(fraud_indicators) if fraud_indicators else 'None',
        'Fraud_Modus_Operandi': modus_operandi
    })

Generating Case: ATO_Case_1
Generating Case: ATO_Case_2
Generating Case: ATO_Case_3
Generating Case: ATO_Case_4
Generating Case: ATO_Case_5


In [10]:
# Collect the per-transcript records into a DataFrame and persist it as a CSV
# (the row index is not written).
metadata_csv_path = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_transcripts_metadata.csv'
transcripts_df = pd.DataFrame(transcripts_data)
transcripts_df.to_csv(metadata_csv_path, index=False)

In [11]:
# Leave the transcript metadata frame as the cell's last expression so the notebook renders it
transcripts_df

Unnamed: 0,Transcript_ID,Scenario,Duration,Word_Count,File_Path,Fraud_Indicators,Fraud_Modus_Operandi
0,ATO_Case_1,Account Takeover,long,1521,/Users/shubhadeepdas/Documents/data_science/pr...,"urgent, immediately, suspicious, fake",A fraudster uses a sophisticated phishing emai...
1,ATO_Case_2,Account Takeover,short,810,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, suspicious","Through a vishing call, a fraudster poses as b..."
2,ATO_Case_3,Account Takeover,long,1386,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, suspicious",Leveraging credentials stolen from a third-par...
3,ATO_Case_4,Account Takeover,medium,1236,/Users/shubhadeepdas/Documents/data_science/pr...,"urgent, immediately, suspicious",A fraudster executes a SIM swap attack by conv...
4,ATO_Case_5,Account Takeover,short,817,/Users/shubhadeepdas/Documents/data_science/pr...,"immediately, suspicious","The customer inadvertently downloads malware, ..."


## Can be replicated for other fraud types