In [1]:
# Import modules
import json
import os
import re

import google.generativeai as genai
import pandas as pd

In [2]:
# Read the Phase 1 metadata CSV into a DataFrame. Later cells read the
# Transcript_ID, File_Path and Fraud_Modus_Operandi columns from it.
transcripts_df = pd.read_csv(
    '/Users/shubhadeepdas/Documents/data_science/projects/genai_transcript/'
    'output/attempt_20250711/ato_transcripts_metadata.csv'
)

In [3]:
# Set Gemini API key from environment variable
# The key is read from GOOGLE_API_KEY (requires `import os` in the imports
# cell). Surrounding whitespace is stripped so that a blank or padded value is
# rejected here instead of failing later inside an API call.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '').strip()
if not GOOGLE_API_KEY:
    raise ValueError("Set GOOGLE_API_KEY in ~/.zshrc")
genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
# Create the Gemini 2.5 Flash model handle; the generate_content calls in the
# cells below all go through this object.
model = genai.GenerativeModel(model_name='gemini-2.5-flash')

In [None]:
# Prompt for generating modus operandi
# Placeholder: {transcript} - the full text of one call transcript.
# The pieces are joined by implicit string concatenation, so every sentence
# must carry its own trailing space.
mo_prompt_template = (
    "Analyze the following UK bank call transcript for an Account Takeover (ATO) fraud case: '{transcript}'. "
    "Generate a concise (1-2 sentences) description of the fraud modus operandi, focusing on how the fraudster gained access to the customer's credit card account (e.g., phishing, credential stuffing, social engineering) and their actions (e.g., changing details, unauthorized transactions). "
    "Ensure alignment with UK banking context (e.g., Faster Payments, UK Finance). "
    "Think deeply and generate the modus operandi and avoid using markdown symbols like asterisks (*) and keep it simple."
)

In [None]:
# Prompt that asks the model to rate how semantically close two modus operandi
# descriptions are. Placeholders: {generated_mo} and {provided_mo}. The model
# is instructed to answer with a bare number between 0 and 1.
similarity_prompt_template = " ".join([
    "Compare the following two fraud modus operandi descriptions for similarity:",
    "Generated: '{generated_mo}'",
    "Provided: '{provided_mo}'.",
    "Return a similarity score between 0 and 1, where 1 is identical and 0 is completely different, based on semantic content.",
    "Provide only the numerical score (e.g., 0.85).",
])

In [None]:
# Process transcripts and generate MO
#
# For each row of transcripts_df: read the transcript file, ask the model for a
# modus operandi (MO) description, then ask the model to score that description
# against the provided MO. One result dict per transcript is appended to
# analysis_data.
def _parse_similarity_score(text):
    """Extract a similarity score from the model's reply.

    The prompt asks for a bare number, but replies such as "Score: 0.85" or
    "0.85." can occur, so the first numeric token is used instead of calling
    float() on the whole reply.

    Returns:
        The score as a float, or NaN when the reply contains no number or the
        number falls outside the requested 0-1 range.
    """
    match = re.search(r'-?\d*\.?\d+', text)
    if match is None:
        return float('nan')
    score = float(match.group())
    if not 0.0 <= score <= 1.0:
        return float('nan')
    return score


analysis_data = []
for index, row in transcripts_df.iterrows():
    transcript_id = row['Transcript_ID']
    file_path = row['File_Path']
    provided_mo = row['Fraud_Modus_Operandi']

    # Read transcript (explicit encoding so the result does not depend on the
    # platform's default locale)
    with open(file_path, 'r', encoding='utf-8') as f:
        transcript = f.read()

    # Generate modus operandi
    mo_prompt = mo_prompt_template.format(transcript=transcript)
    mo_response = model.generate_content(mo_prompt, generation_config={'max_output_tokens': 5000, 'temperature': 0.7})
    generated_mo = mo_response.text.strip()

    # Compute similarity score using Gemini
    similarity_prompt = similarity_prompt_template.format(generated_mo=generated_mo, provided_mo=provided_mo)
    similarity_response = model.generate_content(similarity_prompt, generation_config={'max_output_tokens': 2000, 'temperature': 0.7})
    # An unparsable reply becomes NaN rather than aborting the loop and
    # discarding the results already collected.
    similarity_score = _parse_similarity_score(similarity_response.text)

    print("Similarity Score for ID: {} is ".format(transcript_id), similarity_score)

    # Store results
    analysis_data.append({
        'Transcript_ID': transcript_id,
        'Generated_Modus_Operandi': generated_mo,
        'Provided_Modus_Operandi': provided_mo,
        'Similarity_Score': similarity_score
    })

In [None]:
# Turn the collected result dicts into a DataFrame and write it to
# ato_modus_operandi.csv, leaving the index out of the file.
analysis_df = pd.DataFrame(data=analysis_data)
analysis_df.to_csv(
    '/Users/shubhadeepdas/Documents/data_science/projects/genai_transcript/'
    'output/attempt_20250711/ato_modus_operandi.csv',
    index=False,
)

In [None]:
# Display the per-transcript results (generated MO, provided MO, similarity score)
analysis_df

## Feature Recommendation using Generated Modus Operandi

In [5]:
# Read back the modus operandi CSV written in Step 1 and display it.
mo_df = pd.read_csv(
    '/Users/shubhadeepdas/Documents/data_science/projects/genai_transcript/'
    'output/attempt_20250711/ato_modus_operandi.csv'
)
mo_df

Unnamed: 0,Transcript_ID,Generated_Modus_Operandi,Provided_Modus_Operandi,Similarity_Score
0,ATO_Case_1,The fraudster executed a sophisticated phishin...,Fraudster sends a sophisticated phishing email...,0.98
1,ATO_Case_2,"The fraudster used social engineering, posing ...","Posing as bank fraud prevention, a fraudster c...",0.98
2,ATO_Case_3,Leveraging credentials obtained from a third-p...,Fraudsters obtain a list of breached credentia...,0.98
3,ATO_Case_4,The fraudster initiated an account takeover by...,"The customer unknowingly downloads malware, su...",0.85
4,ATO_Case_5,This Account Takeover fraud involved fraudster...,"Through social engineering, a fraudster convin...",0.98


In [6]:
# Names of the raw variables the model may build features from. The list is
# interpolated into the feature-recommendation prompt as {raw_vars}.
raw_variables = (
    'transaction_id transaction_date transaction_time transaction_amt mcc pos '
    'cnp_flag secure_flag merchant_name merchant_id merchant_state_code merchant_cntry_code '
    'digital_code event_date event_time'
).split()

In [22]:
# Phase 1: Prompt for generating advanced feature recommendations as a JSON list of dictionaries
# Placeholders: {mo} (modus operandi text), {raw_vars} (available raw
# variables) and {transcript_id}.
# The key names are written with escaped double quotes. The earlier
# quadruple-quote form was tokenized as a triple-quoted string and rendered
# each key with an unbalanced quote (e.g. "transcript_id: ...).
# The reply is parsed with json.loads in the next cell, so the prompt asks for
# valid JSON explicitly.
feature_prompt_template = (
    "Analyze the following Account Takeover (ATO) fraud modus operandi from a UK bank call transcript: '{mo}'. "
    "Using the raw variables {raw_vars}, recommend 2-3 sophisticated features for a fraud detection model to prevent missed frauds. "
    "Each feature must address why the fraud was missed (e.g., gaps in detecting unusual login patterns or transaction behaviors). "
    "Return a list of dictionaries where each dictionary has: "
    "\"transcript_id\": '{transcript_id}', "
    "\"generated_modus_operandi\": '{mo}', "
    "\"new_feature_name\": unique and descriptive name, "
    "\"description\": explain what the feature does and how it detects fraud, "
    "\"required_raw_variables\": comma-separated list of variables from the provided list, "
    "\"remark\": justify how the feature prevents the missed fraud based on the MO. "
    "Ensure alignment with UK banking context (e.g., Faster Payments, sort code). "
    "Return only the list of dictionaries as valid JSON with double-quoted keys and string values. "
    "Avoid any special character, escape character, new line character in the output, no heading just simple list. "
    "Output should be in a format that can directly be used to convert it into a dataframe without any manual data cleaning"
)

In [23]:
# Generate feature recommendations for all modus operandi
#
# For each row of mo_df the feature prompt is sent to the model and the reply
# is parsed as JSON into a temporary DataFrame. Replies that cannot be parsed
# are reported and skipped; the raw text stays available in text_dict.
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Handles replies of the form ```json ... ``` (or a bare ``` ... ```), which
    json.loads would otherwise reject. Text without a fence is returned
    unchanged.
    """
    match = re.match(r'^```[a-zA-Z]*\s*(.*?)\s*```$', text, flags=re.DOTALL)
    return match.group(1) if match else text


feature_df = []  # list of per-transcript DataFrames; concatenated in a later cell
text_dict = {}   # raw model reply keyed by mo_df row index, for inspecting failures
for index, row in mo_df.iterrows():
    transcript_id = row['Transcript_ID']
    generated_mo = row['Generated_Modus_Operandi']

    print("Now generating: {}".format(transcript_id))

    # Generate feature recommendations
    feature_prompt = feature_prompt_template.format(transcript_id=transcript_id, mo=generated_mo, raw_vars=raw_variables)
    feature_response = model.generate_content(feature_prompt, generation_config={'max_output_tokens': 15000, 'temperature': 0.7})
    feature_text = feature_response.text.strip()

    text_dict[index] = feature_text

    try:
        data = json.loads(_strip_code_fence(feature_text))
        # Convert to a temporary DataFrame
        temp_df = pd.DataFrame(data)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass, and pd.DataFrame
        # raises ValueError for payloads it cannot tabulate (e.g. a bare
        # string or a dict of scalars).
        print(f"Error parsing JSON string for {transcript_id}: {e}")
        continue

    # Append the temporary DataFrame to the list
    feature_df.append(temp_df)

Now generating: ATO_Case_1
Now generating: ATO_Case_2
Now generating: ATO_Case_3
Now generating: ATO_Case_4
Now generating: ATO_Case_5


In [24]:
# Stack the per-transcript DataFrames into one master DataFrame with a fresh
# index; when nothing was parsed, report it and fall back to an empty frame.
if not feature_df:
    print("No valid DataFrames to concatenate.")
    master_df = pd.DataFrame()
else:
    master_df = pd.concat(feature_df, ignore_index=True)

In [25]:
# Clean the master DataFrame if it's not empty
#
# Cleaning is done per cell and only touches str values, so re-running this
# cell is safe: list cells produced by step 4 pass through untouched instead
# of being turned into NaN by the .str accessor and then blanked by fillna.
def _clean_cell(value):
    """Replace backslash-escaped apostrophes (\\') with ' and trim whitespace.

    Only str values are changed; any other value is returned as is.
    """
    if isinstance(value, str):
        # Literal replacement: as a regex, \' matches a bare apostrophe, which
        # would make the substitution a no-op.
        return value.replace("\\'", "'").strip()
    return value


def _split_raw_variables(value):
    """Split a non-empty comma-separated string into a list of trimmed names.

    Empty strings and non-string values (e.g. an existing list) are returned
    unchanged.
    """
    if isinstance(value, str) and value:
        return [name.strip() for name in value.split(',')]
    return value


if not master_df.empty:
    # 1-2. Remove backslash escapes and strip leading/trailing whitespace from string cells
    master_df = master_df.apply(lambda column: column.map(_clean_cell))

    # 3. Ensure no missing values (replace NaN with empty string)
    master_df = master_df.fillna('')

    # 4. Convert required_raw_variables to a list for easier programmatic use
    #    (skipped when the model's reply did not include that key)
    if 'required_raw_variables' in master_df.columns:
        master_df['required_raw_variables'] = master_df['required_raw_variables'].apply(_split_raw_variables)
else:
    print("Master DataFrame is empty.")

master_df

Unnamed: 0,transcript_id,generated_modus_operandi,new_feature_name,description,required_raw_variables,remark
0,ATO_Case_1,The fraudster executed a sophisticated phishin...,Recent_Account_Phone_Update_Flag,This feature is a binary flag that indicates w...,"[event_date, event_time, digital_code, transac...",The fraud was missed because the critical admi...
1,ATO_Case_1,The fraudster executed a sophisticated phishin...,Time_Delta_From_Critical_Event_To_Transaction,This feature calculates the time difference (e...,"[transaction_date, transaction_time, event_dat...","The MO states 'Subsequently, they used this ac..."
2,ATO_Case_1,The fraudster executed a sophisticated phishin...,Txn_Amt_Outlier_After_Phone_Update,This feature identifies if the current 'transa...,"[transaction_amt, mcc, cnp_flag, transaction_d...",The MO mentions 'two significant unauthorised ...
3,ATO_Case_2,"The fraudster used social engineering, posing ...",DigitalWalletProvisioningVelocity,This feature identifies an unusual surge in at...,"[transaction_date, transaction_time, mcc, digi...",The fraud was missed because the initial linki...
4,ATO_Case_2,"The fraudster used social engineering, posing ...",PostDigitalWalletTxnVelocity,This feature measures the velocity and aggrega...,"[transaction_date, transaction_time, transacti...",The fraud was missed because individual unauth...
5,ATO_Case_2,"The fraudster used social engineering, posing ...",FirstPurchaseHighRiskAnomaly,This feature analyzes the characteristics of t...,"[transaction_date, transaction_time, transacti...",The fraud was missed because the model might h...
6,ATO_Case_3,Leveraging credentials obtained from a third-p...,TimeDeltaAccountUpdateToCashAdvance,This feature calculates the time difference (e...,"[event_date, event_time, transaction_date, ...",The fraud was missed because the model likely ...
7,ATO_Case_3,Leveraging credentials obtained from a third-p...,NoveltyOfCashAdvanceAfterEvent,This feature assesses the novelty of a cash ad...,"[transaction_amt, mcc, merchant_name, merch...",The fraud might have been missed if the model ...
8,ATO_Case_3,Leveraging credentials obtained from a third-p...,DigitalChannelHighRiskVelocity,This feature calculates the velocity (count) o...,"[event_date, event_time, transaction_amt, m...",Fraudsters leveraging credential stuffing typi...
9,ATO_Case_4,The fraudster initiated an account takeover by...,MultiChannel_HighValue_Burst_Indicator,This feature identifies a rapid succession of ...,"[transaction_date, transaction_time, transac...",The modus operandi states the fraudster made '...
