In [26]:
# Load libraries
import os
import pandas as pd

In [60]:
# Set file path
# The data directory can be overridden with the DEEPFAKE_DATA_DIR environment
# variable so the notebook also runs on machines other than the author's.
# When the variable is unset, the original location is used unchanged.
in_path = os.environ.get(
    'DEEPFAKE_DATA_DIR',
    '/Users/tylund/Library/CloudStorage/Dropbox/1. Side Projects/2025.1-Deepfake Threat Landscape/1-data/deepfake-incidents',
)

In [134]:
# Load data sets
# All three CSV exports sit directly under `in_path`; they are read in the
# same order as the names on the left-hand side.
aiid_incidents_df, aiid_class_df, aaic_df = (
    pd.read_csv(os.path.join(in_path, csv_name))
    for csv_name in (
        'aiid_incidents_deepfakes.csv',
        'aiid_class_deepfakes.csv',
        'aaic_deepfakes.csv',
    )
)

In [135]:
# Subset columns
# Keep only the identifier, the text fields and the manual labelling columns
# (is_deepfake / comment) from each source.
aaic_df = aaic_df.loc[:, ['Incident_ID', 'Headline', 'is_deepfake', 'comment']]
aiid_class_df = aiid_class_df.loc[:, ['Incident ID', 'Known AI Goal Snippets', 'is_deepfake', 'comment']]
aiid_incidents_df = aiid_incidents_df.loc[:, ['incident_id', 'description', 'title', 'is_deepfake', 'comment']]

In [136]:
# Normalize columns
# Bring every frame onto the shared `incident_id` / `title` naming.
# rename() silently skips keys that are not present, so the 'Date' entry (the
# column is not part of the subset kept for aaic_df) and the 'incident_ID'
# entry (aiid_incidents_df already carries 'incident_id') change nothing.
aaic_df = aaic_df.rename(
    {'Incident_ID': 'incident_id', 'Headline': 'title', 'Date': 'date'},
    axis='columns',
)

aiid_class_df = aiid_class_df.rename(
    {'Incident ID': 'incident_id', 'Known AI Goal Snippets': 'title'},
    axis='columns',
)

aiid_incidents_df = aiid_incidents_df.rename(
    {'incident_ID': 'incident_id'},
    axis='columns',
)

In [137]:
# Combine AIID data frames
# The classification export has no description column; add an empty one so
# both frames share the same schema before they are stacked.
aiid_class_df['description'] = ''

aiid_df = pd.concat([aiid_class_df, aiid_incidents_df], ignore_index=True)

# Create a flag for whether description exists (non-null and non-empty)
aiid_df["has_description"] = aiid_df["description"].notna() & (aiid_df["description"] != "")

# Sort by this flag so rows with description come first.
# The default sort algorithm is not stable, which would leave the order of
# rows sharing the same flag (and therefore the row kept below) unspecified.
# mergesort is stable, so ties keep their concat order and the result is
# reproducible from run to run.
aiid_df = aiid_df.sort_values(by="has_description", ascending=False, kind="mergesort")

# Drop duplicates, keeping the first occurrence (which will have a description if it exists)
aiid_df = aiid_df.drop_duplicates(subset="incident_id", keep="first")

# Drop the helper column
aiid_df = aiid_df.drop(columns=["has_description"])

## Local LLM Implementation

In [38]:
from langchain.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain

In [56]:
# Initialize Ollama model
# NOTE(review): presumably requires a local Ollama server with the "mistral" model available — confirm before running
llm = Ollama(model="mistral")

# Define the prompt template
# The template carries two placeholders, {title} and {description}, and asks
# the model to reply with a single TRUE/FALSE verdict.
prompt_template = """
You are an expert in AI ethics and threat analysis. Your task is to determine whether the following incident aligns with the definition of a deepfake.

Definition: A deepfake is a highly realistic image, video, or audio recording created with deep learning techniques that falsely depicts a real, identifiable person (or the impression of a real, identifiable person) saying or doing something they never did. Deepfakes are distinguished from other forms of synthetic media by their focus on manipulating or impersonating actual individuals, often with harmful or deceptive intent.

Incident:
<title>
{title}
</title>
<description>
{description}
</description>

Question: Does this incident qualify as a deepfake according to the definition above?
Answer only with TRUE or FALSE.
"""

# Declare the two placeholders used by the template text above
prompt = PromptTemplate(
    input_variables=['title', 'description'],
    template = prompt_template
)

# Pipe the prompt into the model; classify_incident calls chain.invoke with a
# dict holding 'title' and 'description'.
chain = prompt | llm

# Function to classify incidents
def classify_incident(title, description):
    """
    Classify one incident as a deepfake using the local LLM chain.

    Args:
        title: Incident headline. None (or any falsy value) is sent as an
            empty string, mirroring gpt_classification_wrapper.
        description: Incident description, handled the same way as title.

    Returns:
        "TRUE" or "FALSE" when the model replies with exactly one of those
        words (case and surrounding whitespace are ignored), otherwise
        "INVALID". The prompt only *asks* for TRUE/FALSE, so any other reply
        is flagged rather than passed through as an unchecked label.
    """
    result = chain.invoke({
        'title': title or "",
        'description': description or "",
    })
    answer = result.strip().upper()
    return answer if answer in ("TRUE", "FALSE") else "INVALID"

In [59]:
# Example
# Smoke test of the local classifier on a single well-known incident
title = (
    "Alleged Deepfake CFO Scam Reportedly Costs "
    "Multinational Engineering Firm Arup $25 Million"
)
description = (
    "A finance employee at the multinational engineering firm Arup was "
    "reportedly deceived into transferring $25 million by fraudsters using "
    "purported deepfake technology to impersonate the firm's CFO in a video "
    "call, according to the Hong Kong police."
)

verdict = classify_incident(title, description)
print(verdict)

TRUE


In [47]:
# Inspect the PromptTemplate object (its input variables and full template text)
print(prompt)

input_variables=['description', 'title'] input_types={} partial_variables={} template='\nYou are an expert in AI ethics and threat analysis. Your task is to determine whether the following incident aligns with the definition of a deepfake.\n\nDefinition: A deepfake is a highly realistic image, video, or audio recording created with deep learning techniques that falsely depicts a real, identifiable person (or the impression of a real, identifiable person) saying or doing something they never did. Deepfakes are distinguished from other forms of synthetic media by their focus on manipulating or impersonating actual individuals, often with harmful or deceptive intent.\n\nIncident:\n<title>\n{title}\n</title>\n<description>\n{description}\n</description>\n\nQuestion: Does this incident qualify as a deepfake according to the definition above?\nAnswer only with TRUE or FALSE.\n'


## OpenAI API Implementation

In [155]:
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from openai import RateLimitError, APIError, Timeout
import time

# Read settings from a local .env file into the process environment
load_dotenv()

# Build the API client from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Enable tqdm for pandas (classify_events relies on progress_apply)
tqdm.pandas(desc="Classifying events")

In [156]:
def _prompt_text(value):
    """Return `value` for use in the prompt, mapping None/NaN/NA (and other falsy values) to ""."""
    # pandas hands missing cells over as float NaN (or pd.NA). NaN is truthy,
    # so a plain `value or ""` would interpolate the text "nan" into the prompt.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return value or ""


def gpt_classification_wrapper(title: str, description: str = "", max_retries: int = 5, model: str = "gpt-4o-mini") -> str:
    """
    Classify whether an incident is a deepfake (TRUE/FALSE) using an OpenAI chat model.

    Args:
        title: Incident headline. None/NaN is treated as an empty string.
        description: Incident description. None/NaN is treated as an empty string.
        max_retries: Maximum number of API attempts before giving up.
        model: Chat model name; defaults to GPT-4o mini.

    Returns:
        "TRUE" or "FALSE" when the model answers with exactly one of those
        words (case and surrounding whitespace ignored), "INVALID" for any
        other reply, and "ERROR" when the request keeps failing or an
        unexpected exception occurs.
    """
    # Handle None / NaN input gracefully
    title = _prompt_text(title)
    description = _prompt_text(description)

    prompt = f"""
You are an expert in AI ethics and threat analysis. Your task is to determine whether the following incident aligns with the definition of a deepfake.  

Definition: A deepfake is a highly realistic image, video, or audio recording created with deep learning techniques that falsely depicts a real, identifiable person, or a synthetic person designed to impersonate one, saying or doing something they never did. In this analysis, deepfakes are distinguished from other forms of synthetic media by their focus on manipulating or impersonating actual individuals, often with harmful or deceptive intent.  

Incident:
<title>
{title}
</title>
<description>
{description}
</description>

Question: Does this incident qualify as a deepfake according to the definition above?  
Answer only with TRUE or FALSE.
    """

    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0  # deterministic
            )
            answer = response.choices[0].message.content.strip().upper()

            # Safety check → enforce only TRUE/FALSE output
            return answer if answer in ("TRUE", "FALSE") else "INVALID"

        # In the v1 SDK, `openai.Timeout` is the httpx timeout *configuration*
        # class, not an exception. Listing it in an except clause makes Python
        # raise TypeError as soon as any error is matched. Timeouts and
        # connection failures already derive from APIError, so catching it
        # (together with RateLimitError) covers every transient failure.
        except (RateLimitError, APIError) as e:
            if attempt == max_retries:
                # No further attempt follows, so do not sleep before giving up
                print(f"API error: {e}. Giving up after {max_retries} attempts.")
                break
            wait_time = 2 ** attempt  # exponential backoff: 2, 4, 8, ...
            print(f"API error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except Exception as e:
            print(f"Unexpected error: {e}")
            return "ERROR"

    # If max retries exceeded
    return "ERROR"

In [157]:
def classify_events(df: pd.DataFrame, title_col="title", desc_col="description") -> pd.DataFrame:
    """
    Label every row with the GPT deepfake verdict and compare it to the manual label.

    The input frame is left untouched; a copy is returned with two extra columns:
    `gpt_classification` (the value returned by gpt_classification_wrapper) and
    `match` (True where the upper-cased string form of `is_deepfake` equals
    that value). The description is passed to the classifier only when
    `desc_col` exists and holds at least one non-null value; otherwise the
    title alone is used. A tqdm progress bar is shown while rows are classified.

    Raises:
        ValueError: if the frame has no `is_deepfake` column.
    """
    # The manual label is required for the comparison step
    if "is_deepfake" not in df.columns:
        raise ValueError("Dataframe must contain an 'is_deepfake' column for manual labels.")

    labelled = df.copy()

    # Use the description only if the column is there and not entirely missing
    use_description = desc_col in labelled.columns and labelled[desc_col].notna().any()

    if not use_description:
        verdicts = labelled[title_col].progress_apply(
            lambda headline: gpt_classification_wrapper(headline, "")
        )
    else:
        verdicts = labelled.progress_apply(
            lambda record: gpt_classification_wrapper(record[title_col], record[desc_col]),
            axis=1
        )
    labelled["gpt_classification"] = verdicts

    # Compare on a common representation: upper-cased text of the manual label
    manual_labels = labelled["is_deepfake"].astype(str).str.upper()
    labelled["match"] = manual_labels == labelled["gpt_classification"]

    return labelled

In [158]:
# Classifying every incident costs several minutes of paid API calls, so reuse
# the saved results when they exist and only call the API otherwise. This also
# keeps `aiid_gpt_results` defined on a fresh kernel (Restart & Run All).
# Delete the CSV to force a re-classification.
_aiid_results_path = os.path.join(in_path, 'aiid_gpt_results.csv')
if os.path.exists(_aiid_results_path):
    # index_col=0 restores the index that to_csv writes as the first column;
    # the dtype pin stops read_csv from parsing the TRUE/FALSE labels as booleans.
    aiid_gpt_results = pd.read_csv(_aiid_results_path, index_col=0, dtype={'gpt_classification': str})
else:
    aiid_gpt_results = classify_events(aiid_df)

Classifying events: 100%|█████████████████████| 401/401 [04:44<00:00,  1.41it/s]


In [159]:
# Persist the classified AIID incidents next to the input data.
# index=False is not passed, so the frame's index is written as the first CSV column.
aiid_gpt_results.to_csv(os.path.join(in_path, 'aiid_gpt_results.csv'))

In [160]:
# Tally the `match` column (the GPT-vs-manual-label agreement flag that classify_events adds)
aiid_gpt_results['match'].value_counts()

match
True     349
False     52
Name: count, dtype: int64

In [162]:
# Classifying every incident costs several minutes of paid API calls, so reuse
# the saved results when they exist and only call the API otherwise. This also
# keeps `aaic_gpt_results` defined on a fresh kernel (Restart & Run All).
# Delete the CSV to force a re-classification.
_aaic_results_path = os.path.join(in_path, 'aaic_gpt_results.csv')
if os.path.exists(_aaic_results_path):
    # index_col=0 restores the index that to_csv writes as the first column;
    # the dtype pin stops read_csv from parsing the TRUE/FALSE labels as booleans.
    aaic_gpt_results = pd.read_csv(_aaic_results_path, index_col=0, dtype={'gpt_classification': str})
else:
    aaic_gpt_results = classify_events(aaic_df)

Classifying events: 100%|█████████████████████| 282/282 [03:38<00:00,  1.29it/s]


In [164]:
# Persist the classified AAIC incidents next to the input data.
# index=False is not passed, so the frame's index is written as the first CSV column.
aaic_gpt_results.to_csv(os.path.join(in_path, 'aaic_gpt_results.csv'))

In [163]:
# Tally the `match` column (the GPT-vs-manual-label agreement flag that classify_events adds)
aaic_gpt_results['match'].value_counts()

match
True     213
False     69
Name: count, dtype: int64