In [1]:
# Import modules
import json
import os
import re

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Read the Phase 1 transcript metadata table; later cells use its
# Transcript_ID, File_Path and Fraud_Modus_Operandi columns.
transcripts_metadata_path = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_transcripts_metadata.csv'
transcripts_df = pd.read_csv(transcripts_metadata_path)

In [3]:
# Pull variables from a local .env file (if any) into the process environment
load_dotenv()

# Set Gemini API key from environment variable.
# The variable that is actually read is GEMINI_API_KEY, so the error message
# must name that variable rather than the Python-side name GOOGLE_API_KEY.
GOOGLE_API_KEY = os.getenv('GEMINI_API_KEY')
if not GOOGLE_API_KEY:
    raise ValueError(
        "GEMINI_API_KEY is not set; define it in a .env file or export it in your shell (e.g. ~/.zshrc)"
    )
genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
# Create the Gemini 2.5 Flash model handle shared by every generate_content call below
model = genai.GenerativeModel(model_name='gemini-2.5-flash')

In [5]:
# Prompt for generating modus operandi.
# Filled with str.format(transcript=...). Every fragment except the last ends
# with a space so that implicit string concatenation does not fuse sentences.
mo_prompt_template = (
    "Analyze the following UK bank call transcript for an Account Takeover (ATO) fraud case: '{transcript}'. "
    "Generate a concise (1-2 sentences) description of the fraud modus operandi, focusing on how the fraudster gained access to the customer's credit card account (e.g., phishing, credential stuffing, social engineering) and their actions (e.g., changing details, unauthorized transactions). "
    "Ensure alignment with UK banking context (e.g., Faster Payments, UK Finance). "
    "Think deeply and generate the modus operandi and avoid using markdown symbols like asterisks (*) and keep it simple."
)

In [6]:
# Prompt asking the model to rate how close the generated MO is to the provided one.
# Filled with str.format(generated_mo=..., provided_mo=...); the sentences are
# joined with single spaces.
similarity_prompt_template = " ".join([
    "Compare the following two fraud modus operandi descriptions for similarity:",
    "Generated: '{generated_mo}'",
    "Provided: '{provided_mo}'.",
    "Return a similarity score between 0 and 1, where 1 is identical and 0 is completely different, based on semantic content.",
    "Provide only the numerical score (e.g., 0.85).",
])

In [7]:
# Process transcripts and generate MO

# First decimal or integer number found in a model reply (fallback parser).
_SCORE_PATTERN = re.compile(r"\d*\.\d+|\d+")

# The MO text keeps the original sampling settings. The scoring call uses
# temperature 0 so that re-running the notebook yields repeatable scores.
_MO_GENERATION_CONFIG = {'max_output_tokens': 5000, 'temperature': 0.7}
_SCORE_GENERATION_CONFIG = {'max_output_tokens': 5000, 'temperature': 0.0}


def _parse_similarity_score(raw_text):
    """Extract a similarity score in [0, 1] from the model's reply.

    The reply is first parsed as a bare number. If that fails (for example
    "Score: 0.85"), the first number found in the text is used instead.

    Args:
        raw_text: Text returned by the model for the similarity prompt.

    Returns:
        The score as a float, or NaN when no number can be found or the
        number lies outside the 0-1 range requested by the prompt.
    """
    cleaned = raw_text.strip()
    try:
        score = float(cleaned)
    except ValueError:
        match = _SCORE_PATTERN.search(cleaned)
        score = float(match.group()) if match else float('nan')
    if not 0.0 <= score <= 1.0:
        return float('nan')
    return score


analysis_data = []
for _, row in transcripts_df.iterrows():
    transcript_id = row['Transcript_ID']
    file_path = row['File_Path']
    provided_mo = row['Fraud_Modus_Operandi']

    # Read transcript with an explicit encoding so the result does not
    # depend on the platform's default locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        transcript = f.read()

    # Generate modus operandi
    mo_prompt = mo_prompt_template.format(transcript=transcript)
    mo_response = model.generate_content(mo_prompt, generation_config=_MO_GENERATION_CONFIG)
    generated_mo = mo_response.text.strip()

    # Compute similarity score using Gemini
    similarity_prompt = similarity_prompt_template.format(generated_mo=generated_mo, provided_mo=provided_mo)
    similarity_response = model.generate_content(similarity_prompt, generation_config=_SCORE_GENERATION_CONFIG)
    similarity_score = _parse_similarity_score(similarity_response.text)
    if pd.isna(similarity_score):
        # Keep going, but make the unparsable reply visible instead of crashing the whole run.
        print("Warning: could not parse a 0-1 score for ID: {} from reply: {!r}".format(
            transcript_id, similarity_response.text))

    print("Similarity Score for ID: {} is ".format(transcript_id), similarity_score)

    # Store results
    analysis_data.append({
        'Transcript_ID': transcript_id,
        'Generated_Modus_Operandi': generated_mo,
        'Provided_Modus_Operandi': provided_mo,
        'Similarity_Score': similarity_score
    })

Similarity Score for ID: ATO_Case_1 is  0.98
Similarity Score for ID: ATO_Case_2 is  0.98
Similarity Score for ID: ATO_Case_3 is  0.97
Similarity Score for ID: ATO_Case_4 is  0.95
Similarity Score for ID: ATO_Case_5 is  0.8


In [8]:
# Turn the collected per-transcript records into a DataFrame and write it to
# disk without the row index.
analysis_df = pd.DataFrame(data=analysis_data)
mo_output_path = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_modus_operandi.csv'
analysis_df.to_csv(mo_output_path, index=False)

In [9]:
# Display the per-transcript analysis table (generated vs. provided MO and similarity score)
analysis_df

Unnamed: 0,Transcript_ID,Generated_Modus_Operandi,Provided_Modus_Operandi,Similarity_Score
0,ATO_Case_1,The fraudster executed a sophisticated phishin...,A fraudster uses a sophisticated phishing emai...,0.98
1,ATO_Case_2,Fraudsters used a vishing call to socially eng...,"Through a vishing call, a fraudster poses as b...",0.98
2,ATO_Case_3,The fraudster gained unauthorised access to th...,Leveraging credentials stolen from a third-par...,0.97
3,ATO_Case_4,Fraudsters obtained the customer's online bank...,A fraudster executes a SIM swap attack by conv...,0.95
4,ATO_Case_5,The fraudster compromised the customer's onlin...,"The customer inadvertently downloads malware, ...",0.8


## Feature Recommendation using Generated Modus Operandi

In [10]:
# Re-read the modus operandi table written in Step 1 so this section can run
# from the saved CSV, then display it.
mo_input_path = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_modus_operandi.csv'
mo_df = pd.read_csv(mo_input_path)
mo_df

Unnamed: 0,Transcript_ID,Generated_Modus_Operandi,Provided_Modus_Operandi,Similarity_Score
0,ATO_Case_1,The fraudster executed a sophisticated phishin...,A fraudster uses a sophisticated phishing emai...,0.98
1,ATO_Case_2,Fraudsters used a vishing call to socially eng...,"Through a vishing call, a fraudster poses as b...",0.98
2,ATO_Case_3,The fraudster gained unauthorised access to th...,Leveraging credentials stolen from a third-par...,0.97
3,ATO_Case_4,Fraudsters obtained the customer's online bank...,A fraudster executes a SIM swap attack by conv...,0.95
4,ATO_Case_5,The fraudster compromised the customer's onlin...,"The customer inadvertently downloads malware, ...",0.8


In [11]:
# Names of the raw variables offered to the model in the feature prompt
# (interpolated as `raw_vars`); order is preserved.
raw_variables = [
    'transaction_id',
    'transaction_date',
    'transaction_time',
    'transaction_amt',
    'mcc',
    'pos',
    'cnp_flag',
    'secure_flag',
    'merchant_name',
    'merchant_id',
    'merchant_state_code',
    'merchant_cntry_code',
    'digital_code',
    'event_date',
    'event_time',
]

In [12]:
# Phase 1: Prompt for generating advanced feature recommendations as a JSON array.
# Filled with str.format(transcript_id=..., mo=..., raw_vars=...).
# The reply is parsed with json.loads, so the prompt asks for JSON explicitly
# and shows each key wrapped in balanced double quotes. Fragments that contain
# double quotes are written as single-quoted Python strings to avoid escaping.
feature_prompt_template = (
    "Analyze the following Account Takeover (ATO) fraud modus operandi from a UK bank call transcript: '{mo}'. "
    "Using the raw variables {raw_vars}, recommend 2-3 sophisticated features for a fraud detection model to prevent missed frauds. "
    "Each feature must address why the fraud was missed (e.g., gaps in detecting unusual login patterns or transaction behaviors). "
    "Return a JSON array of objects (a list of dictionaries) where each object has: "
    '"transcript_id": "{transcript_id}", '
    '"generated_modus_operandi": "{mo}", '
    '"new_feature_name": unique and descriptive name, '
    '"description": explain what the feature does and how it detects fraud, '
    '"required_raw_variables": comma-separated list of variables from the provided list, '
    '"remark": justify how the feature prevents the missed fraud based on the MO. '
    "Ensure alignment with UK banking context (e.g., Faster Payments, sort code). "
    "Return only the JSON array, with every key and value as a double-quoted string and no markdown code fences. "
    "Avoid any special character, escape character, new line character in the output, no heading just simple list. "
    "Output should be in a format that can directly be used to convert it into a dataframe without any manual data cleaning."
)

In [13]:
# Generate feature recommendations for all modus operandi
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if present.

    Replies that start with three backticks (optionally followed by a language
    tag such as ``json``) lose their first line and a trailing fence line.
    Any other text is returned stripped of surrounding whitespace.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    lines = stripped.splitlines()[1:]  # drop the opening fence line
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]  # drop the closing fence line
    return "\n".join(lines).strip()


feature_df = []  # one DataFrame per successfully parsed transcript
text_dict = {}   # raw model replies keyed by mo_df row index, kept for debugging
for index, row in mo_df.iterrows():
    transcript_id = row['Transcript_ID']
    generated_mo = row['Generated_Modus_Operandi']

    print("Now generating: {}".format(transcript_id))

    # Generate feature recommendations
    feature_prompt = feature_prompt_template.format(transcript_id=transcript_id, mo=generated_mo, raw_vars=raw_variables)
    feature_response = model.generate_content(feature_prompt, generation_config={'max_output_tokens': 30000, 'temperature': 0.7})
    feature_text = feature_response.text.strip()

    text_dict[index] = feature_text

    try:
        data = json.loads(_strip_code_fence(feature_text))
        # A single object instead of an array is still one usable record.
        if isinstance(data, dict):
            data = [data]
        # Convert to a temporary DataFrame
        temp_df = pd.DataFrame(data)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; pd.DataFrame also raises
        # ValueError for shapes it cannot turn into a frame.
        print(f"Error parsing JSON string for {transcript_id}: {e}")
        continue

    if temp_df.empty:
        print(f"No feature records returned for {transcript_id}")
        continue

    # Use the known values rather than the model's echo, so the later merge
    # on transcript_id cannot miss because the model altered the text.
    temp_df['transcript_id'] = transcript_id
    temp_df['generated_modus_operandi'] = generated_mo

    # Append the temporary DataFrame to the list
    feature_df.append(temp_df)

Now generating: ATO_Case_1
Now generating: ATO_Case_2
Now generating: ATO_Case_3
Now generating: ATO_Case_4
Now generating: ATO_Case_5


In [14]:
# Stack the per-transcript frames into one master DataFrame with a fresh index;
# fall back to an empty frame when nothing was parsed.
if not feature_df:
    print("No valid DataFrames to concatenate.")
    master_df = pd.DataFrame()
else:
    master_df = pd.concat(feature_df, ignore_index=True)

In [15]:
# Clean the master DataFrame if it's not empty
def _clean_text(value):
    """Unescape backslash-escaped single quotes and trim whitespace on strings.

    Non-string values (NaN, lists, numbers) are returned unchanged, so the
    cleaning can be re-run safely after list conversion.
    """
    if isinstance(value, str):
        # Plain substring replace: a regex r"\'" would match a bare quote and change nothing.
        return value.replace("\\'", "'").strip()
    return value


def _split_variables(value):
    """Convert a comma-separated string into a list of trimmed variable names.

    Lists are kept as lists (with string items trimmed); empty strings and any
    other value are returned unchanged.
    """
    if isinstance(value, str):
        if not value:
            return value
        return [name.strip() for name in value.split(',') if name.strip()]
    if isinstance(value, list):
        return [name.strip() if isinstance(name, str) else name for name in value]
    return value


if not master_df.empty:
    # 1 + 2. Remove backslashes before quotes and strip whitespace, element by
    # element, so values that are not strings are never turned into NaN.
    for col in master_df.columns:
        if pd.api.types.is_object_dtype(master_df[col]) or pd.api.types.is_string_dtype(master_df[col]):
            master_df[col] = master_df[col].map(_clean_text)

    # 3. Ensure no missing values (replace NaN with empty string)
    master_df = master_df.fillna('')

    # 4. Convert required_raw_variables to a list for easier programmatic use
    if 'required_raw_variables' in master_df.columns:
        master_df['required_raw_variables'] = master_df['required_raw_variables'].map(_split_variables)
    else:
        print("Warning: 'required_raw_variables' column is missing from the master DataFrame.")
else:
    print("Master DataFrame is empty.")

In [16]:
# Write the cleaned feature recommendations to disk without the row index.
# NOTE(review): required_raw_variables holds Python lists at this point, so the
# CSV presumably stores them as their string form — confirm any reader parses that.
master_df.to_csv('/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_feature_recommendations_2.csv', index=False)

In [17]:
# Display the cleaned feature recommendation table
master_df

Unnamed: 0,transcript_id,generated_modus_operandi,new_feature_name,description,required_raw_variables,remark
0,ATO_Case_1,The fraudster executed a sophisticated phishin...,time_compressed_login_payee_transfer_sequence,Detects unusually rapid sequencing of online b...,"[event_date, event_time, transaction_date, ...",This feature directly targets the 'unauthorise...
1,ATO_Case_1,The fraudster executed a sophisticated phishin...,credit_card_faster_payment_source_anomaly,Identifies instances where a Faster Payment tr...,"[mcc, transaction_amt, digital_code]",The fraud was missed because the system might ...
2,ATO_Case_1,The fraudster executed a sophisticated phishin...,high_value_transfer_to_new_payee,Flags transactions where a high-value Faster P...,"[event_date, event_time, transaction_date, ...",The fraud was missed because adding a new paye...
3,ATO_Case_2,Fraudsters used a vishing call to socially eng...,HighValueCNP_After_RecentContactChange,This feature identifies high-value Card-Not-Pr...,"[transaction_date, transaction_time, transac...",The fraud was missed because the contact infor...
4,ATO_Case_2,Fraudsters used a vishing call to socially eng...,NewMerchantVelocity_HighValue_PostATOEvent,This feature calculates the number of distinct...,"[transaction_date, transaction_time, transac...",The model might have missed this fraud if it o...
5,ATO_Case_2,Fraudsters used a vishing call to socially eng...,UncharacteristicMCC_HighValue_PostATOEvent,This feature flags high-value Card-Not-Present...,"[transaction_date, transaction_time, transac...",Fraudsters often purchase specific high-value ...
6,ATO_Case_3,The fraudster gained unauthorised access to th...,LoginDeviceNoveltyAndInactivityDuration,This feature calculates the time elapsed (in h...,"[event_date, event_time, digital_code]",This feature directly addresses the 'credentia...
7,ATO_Case_3,The fraudster gained unauthorised access to th...,NewCardDeliveryAddressAnomalyScore,This feature generates a risk score for the de...,"[event_date, event_time]",This feature directly targets the 'arranged fo...
8,ATO_Case_3,The fraudster gained unauthorised access to th...,RapidHighRiskEventSequenceAfterNewDigitalCodeL...,This feature measures the time difference (in ...,"[event_date, event_time, digital_code]",This feature addresses the rapid sequence of a...
9,ATO_Case_4,Fraudsters obtained the customer's online bank...,Time_Since_Last_Phone_Number_Change_to_First_Tx,"Measures the time duration (e.g., in minutes o...","[transaction_date, transaction_time, cnp_flag,...",This feature directly addresses the SIM swap c...


# Saving the Final Results

In [18]:
# Inspect the MO table's column names before renaming them for the merge
mo_df.columns

Index(['Transcript_ID', 'Generated_Modus_Operandi', 'Provided_Modus_Operandi',
       'Similarity_Score'],
      dtype='object')

In [19]:
# Inspect the feature table's column names (the merge target naming convention)
master_df.columns

Index(['transcript_id', 'generated_modus_operandi', 'new_feature_name',
       'description', 'required_raw_variables', 'remark'],
      dtype='object')

In [20]:
# Map the MO table's Title_Case column names to the snake_case names used by
# master_df so both frames share the transcript_id key.
mo_column_renames = {
    'Transcript_ID': 'transcript_id',
    'Generated_Modus_Operandi': 'generated_modus_operandi',
    'Provided_Modus_Operandi': 'provided_modus_operandi',
    'Similarity_Score': 'similarity_score',
}
analysis_df = mo_df.rename(columns=mo_column_renames)

In [21]:
# Merge analysis and feature data on transcript_id.
# Each feature row gets the provided MO and similarity score of its transcript.
# validate='many_to_one' makes pandas raise a MergeError if a transcript_id
# appears more than once in analysis_df, which would otherwise silently
# duplicate feature rows.
final_df = master_df.merge(
    analysis_df[['transcript_id', 'provided_modus_operandi', 'similarity_score']],
    on='transcript_id',
    how='left',
    validate='many_to_one',
)

In [22]:
# Columns the final output must contain, in output order; warn about any that
# the merged frame lacks.
required_columns = [
    'transcript_id',
    'generated_modus_operandi',
    'provided_modus_operandi',
    'similarity_score',
    'new_feature_name',
    'description',
    'required_raw_variables',
    'remark',
]
missing_columns = list(filter(lambda name: name not in final_df.columns, required_columns))
if len(missing_columns) > 0:
    print(f"Warning: Missing columns in final DataFrame: {missing_columns}")

In [23]:
# Fill gaps: 0.0 for a missing similarity_score, empty string for every other
# object-typed column; columns of any other dtype are left as they are.
text_like_columns = [
    name for name in final_df.columns
    if name != 'similarity_score' and final_df[name].dtype == 'object'
]
if 'similarity_score' in final_df.columns:
    final_df['similarity_score'] = final_df['similarity_score'].fillna(0.0)
for name in text_like_columns:
    final_df[name] = final_df[name].fillna('')

# Keep only the required columns, in the required order
final_df = final_df.loc[:, required_columns]

In [24]:
# Write the merged results table to disk without the row index
final_results_path = '/Users/shubhadeepdas/Documents/data_science/projects/hackaidea/output/ato_final_results.csv'
final_df.to_csv(final_results_path, index=False)

In [25]:
# Print results: a text heading, then the frame itself as the cell's last
# expression so the notebook renders it as a table.
print("Final Results DataFrame:")
final_df

Final Results DataFrame:


Unnamed: 0,transcript_id,generated_modus_operandi,provided_modus_operandi,similarity_score,new_feature_name,description,required_raw_variables,remark
0,ATO_Case_1,The fraudster executed a sophisticated phishin...,A fraudster uses a sophisticated phishing emai...,0.98,time_compressed_login_payee_transfer_sequence,Detects unusually rapid sequencing of online b...,"[event_date, event_time, transaction_date, ...",This feature directly targets the 'unauthorise...
1,ATO_Case_1,The fraudster executed a sophisticated phishin...,A fraudster uses a sophisticated phishing emai...,0.98,credit_card_faster_payment_source_anomaly,Identifies instances where a Faster Payment tr...,"[mcc, transaction_amt, digital_code]",The fraud was missed because the system might ...
2,ATO_Case_1,The fraudster executed a sophisticated phishin...,A fraudster uses a sophisticated phishing emai...,0.98,high_value_transfer_to_new_payee,Flags transactions where a high-value Faster P...,"[event_date, event_time, transaction_date, ...",The fraud was missed because adding a new paye...
3,ATO_Case_2,Fraudsters used a vishing call to socially eng...,"Through a vishing call, a fraudster poses as b...",0.98,HighValueCNP_After_RecentContactChange,This feature identifies high-value Card-Not-Pr...,"[transaction_date, transaction_time, transac...",The fraud was missed because the contact infor...
4,ATO_Case_2,Fraudsters used a vishing call to socially eng...,"Through a vishing call, a fraudster poses as b...",0.98,NewMerchantVelocity_HighValue_PostATOEvent,This feature calculates the number of distinct...,"[transaction_date, transaction_time, transac...",The model might have missed this fraud if it o...
5,ATO_Case_2,Fraudsters used a vishing call to socially eng...,"Through a vishing call, a fraudster poses as b...",0.98,UncharacteristicMCC_HighValue_PostATOEvent,This feature flags high-value Card-Not-Present...,"[transaction_date, transaction_time, transac...",Fraudsters often purchase specific high-value ...
6,ATO_Case_3,The fraudster gained unauthorised access to th...,Leveraging credentials stolen from a third-par...,0.97,LoginDeviceNoveltyAndInactivityDuration,This feature calculates the time elapsed (in h...,"[event_date, event_time, digital_code]",This feature directly addresses the 'credentia...
7,ATO_Case_3,The fraudster gained unauthorised access to th...,Leveraging credentials stolen from a third-par...,0.97,NewCardDeliveryAddressAnomalyScore,This feature generates a risk score for the de...,"[event_date, event_time]",This feature directly targets the 'arranged fo...
8,ATO_Case_3,The fraudster gained unauthorised access to th...,Leveraging credentials stolen from a third-par...,0.97,RapidHighRiskEventSequenceAfterNewDigitalCodeL...,This feature measures the time difference (in ...,"[event_date, event_time, digital_code]",This feature addresses the rapid sequence of a...
9,ATO_Case_4,Fraudsters obtained the customer's online bank...,A fraudster executes a SIM swap attack by conv...,0.95,Time_Since_Last_Phone_Number_Change_to_First_Tx,"Measures the time duration (e.g., in minutes o...","[transaction_date, transaction_time, cnp_flag,...",This feature directly addresses the SIM swap c...
