# Prepare Data for Manual Analysis

In [51]:
# Load libraries
import pandas as pd
import os
import numpy as np
import ast

In [104]:
# Set file path
in_path = '/Users/tylund/Library/CloudStorage/Dropbox/1. Side Projects/2025.1-Deepfake Threat Landscape/1-data/'

# Import data
aiid_deepfakes = pd.read_csv(os.path.join(in_path, 'deepfake-incidents/aiid_gpt_results.csv'))
aaic_deepfakes = pd.read_csv(os.path.join(in_path, 'deepfake-incidents/aaic_gpt_results.csv'))

In [105]:
# Drop irrelevant columns
cols_to_drop = ['is_deepfake', 'comment', 'gpt_classification', 'match', 'final']

# Filter on incidents that passed selection
aiid_deepfakes = aiid_deepfakes[aiid_deepfakes['final']].drop(cols_to_drop, axis=1)
aaic_deepfakes = aaic_deepfakes[aaic_deepfakes['final']].drop(cols_to_drop, axis=1)

### AIID Preparation

In [106]:
aiid_incidents = pd.read_csv(os.path.join(in_path, 'incident-repos/AIID-repository-09042025/incidents.csv'))
aiid_cset = pd.read_csv(os.path.join(in_path, 'incident-repos/AIID-repository-09042025/classifications_CSETv1.csv'))
aiid_reports = pd.read_csv(os.path.join(in_path, 'incident-repos/AIID-repository-09042025/reports.csv'))

In [107]:
aiid_incidents = aiid_incidents.drop(columns=['_id', 'description', 'title']).rename(
    columns={'Alleged deployer of AI system': 'deployer',
             'Alleged developer of AI system': 'developer',
             'Alleged harmed or nearly harmed parties': 'target'}
)

In [108]:
aiid_deepfakes = pd.merge(aiid_deepfakes, aiid_incidents, on='incident_id', how='left')

In [109]:
aiid_cset = aiid_cset[['Incident ID', 'Location Country (two letters)', 'Sector of Deployment']].rename(
    columns = {'Incident ID': 'incident_id',
               'Location Country (two letters)': 'location',
               'Sector of Deployment': 'sector'}
)

In [110]:
aiid_deepfakes = pd.merge(aiid_deepfakes, aiid_cset, on='incident_id', how='left')

In [111]:
aiid_deepfakes

Unnamed: 0,incident_id,title,description,justification,date,reports,deployer,developer,target,location,sector
0,943,Nonconsensual Explicit AI-Generated Images of ...,AI-generated explicit images of up to 60 femal...,Agreement,2025-02-20,[4776],"[""two-unidentified-year-11-students-at-gladsto...","[""unknown-deepfake-technology-developer""]","[""up-to-60-unnamed-students-at-gladstone-park-...",,
1,976,AI-Generated OB-GYN Health Influencers on TikT...,AI-generated avatars posing as OB-GYNs on TikT...,Fully synthetic persona,2025-03-08,"[4935,4983,4984,4985,4986]","[""unknown-tiktok-users""]","[""captions""]","[""tiktok-users""]",,
2,974,Deepfake Audio Impersonates U.S. Secretary of ...,A deepfake audio clip impersonating U.S. Secre...,Agreement,2025-03-04,[4933],"[""unknown-disinformation-actors""]","[""unknown-voice-cloning-technology""]","[""truth"",""relations-between-the-united-states-...",,
3,973,Canadian Fraud Ring Allegedly Used AI Voice Cl...,A Canadian fraud ring allegedly used AI-genera...,Agreement,2025-03-05,"[4923,4924,4925,4926,4927,4928]","[""scammers"",""fraudsters"",""canadian-fraud-ring-...","[""unknown-voice-cloning-technology-developer""]","[""grandparents-targeted-by-canadian-fraud-ring...",,
4,972,Russian Influence Operation Allegedly Uses AI ...,"A Russian disinformation campaign, linked to S...",Agreement,2024-09-25,"[4920,4921]","[""storm-1516"",""john-mark-dougan"",""russian-stat...","[""unknown-deepfake-technology-developer"",""unkn...","[""kamala-harris"",""american-voters"",""electoral-...",,
...,...,...,...,...,...,...,...,...,...,...,...
313,737,Amandine Le Pen Deepfake Account Misleads Thou...,"A TikTok account, ""Amandine Le Pen,"" created u...",Agreement,2024-04-16,"[3962,3981]","[""unknown-tiktok-user""]","[""unknown-deepfake-creators""]","[""le-pen-family"",""french-general-public""]",,
314,754,British Female Politicians Victimized by Deepf...,"British female politicians, including Angela R...",Agreement,2024-07-01,"[3989,5592]","[""unknown-deepfake-creators""]","[""unknown-deepfake-creators""]","[""stella-creasy"",""priti-patel"",""penny-mordaunt...",,
315,755,Deepfake Targets Olena Zelenska in Russian Dis...,A deepfake video falsely suggesting that Olena...,Agreement,2024-07-03,"[3990,4014,4881,4897,4899]","[""verite-cachee-france"",""russian-linked-disinf...","[""unknown-deepfake-creators""]","[""olena-zelenska"",""volodymyr-zelenskyy"",""gover...",,
316,147,Snippet Text: AI voice cloning is used in a hu...,,Agreement,2020-01-15,"[1496,1497,5135,5136,5137,5138]","[""unknown-transnational-fraud-ring"",""unknown-s...","[""unknown-voice-cloning-technology-developer"",...","[""unnamed-japanese-firm"",""unnamed-hong-kong-ba...",CN,"financial and insurance activities, other"


In [112]:
aiid_report_numbers = aiid_deepfakes[['incident_id', 'reports']]
aiid_report_numbers['reports'] = aiid_report_numbers['reports'].apply(ast.literal_eval)
aiid_report_numbers = aiid_report_numbers.explode('reports', ignore_index=True).rename(columns={'reports':'report_number'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aiid_report_numbers['reports'] = aiid_report_numbers['reports'].apply(ast.literal_eval)


In [113]:
# Filter only English reports
aiid_reports = aiid_reports[aiid_reports['language'] == 'en']

aiid_reports = aiid_reports[['report_number', 'text', 'title', 'source_domain']]

In [114]:
aiid_report_df = pd.merge(aiid_report_numbers, aiid_reports, on='report_number', how='left').dropna(subset='text')

In [115]:
aiid_report_df['details'] = aiid_report_df['title'].fillna('') + '; ' + aiid_report_df['source_domain'].fillna('') + '\n\n' + aiid_report_df['text'].fillna('')

In [116]:
aiid_report_df = aiid_report_df[['incident_id', 'details']]

aiid_merged_reports = aiid_report_df.groupby('incident_id', as_index=False).agg({'details': '\n\n'.join})

In [117]:
aiid_merged_reports

Unnamed: 0,incident_id,details
0,39,Fake Obama created using AI tool to make phone...
1,147,Bank Robbers Steal $35 Million by Deepfaking B...
2,198,Russian War Report: Hacked news program and de...
3,200,Fraudsters Used AI to Mimic CEO’s Voice in Unu...
4,201,XR Belgium posts deepfake of Belgian premier l...
...,...,...
305,1165,Grok generates fake Taylor Swift nudes without...
306,1168,False claim of UK colonels captured in Ukraine...
307,1170,Chris Cuomo mocked for response after falling ...
308,1175,Jessica Radcliffe Orca Incident A Hoax: The Tr...


In [118]:
aiid_deepfakes = pd.merge(aiid_deepfakes, aiid_merged_reports, on='incident_id', how='left')

In [119]:
aiid_deepfakes

Unnamed: 0,incident_id,title,description,justification,date,reports,deployer,developer,target,location,sector,details
0,943,Nonconsensual Explicit AI-Generated Images of ...,AI-generated explicit images of up to 60 femal...,Agreement,2025-02-20,[4776],"[""two-unidentified-year-11-students-at-gladsto...","[""unknown-deepfake-technology-developer""]","[""up-to-60-unnamed-students-at-gladstone-park-...",,,Police investigate sexually explicit deepfake ...
1,976,AI-Generated OB-GYN Health Influencers on TikT...,AI-generated avatars posing as OB-GYNs on TikT...,Fully synthetic persona,2025-03-08,"[4935,4983,4984,4985,4986]","[""unknown-tiktok-users""]","[""captions""]","[""tiktok-users""]",,,AI-generated ‘doctors’ are duping TikTok users...
2,974,Deepfake Audio Impersonates U.S. Secretary of ...,A deepfake audio clip impersonating U.S. Secre...,Agreement,2025-03-04,[4933],"[""unknown-disinformation-actors""]","[""unknown-voice-cloning-technology""]","[""truth"",""relations-between-the-united-states-...",,,"Fake Marco Rubio audio on Starlink, Ukraine sp..."
3,973,Canadian Fraud Ring Allegedly Used AI Voice Cl...,A Canadian fraud ring allegedly used AI-genera...,Agreement,2025-03-05,"[4923,4924,4925,4926,4927,4928]","[""scammers"",""fraudsters"",""canadian-fraud-ring-...","[""unknown-voice-cloning-technology-developer""]","[""grandparents-targeted-by-canadian-fraud-ring...",,,Dozens of Canadians are charged for scamming A...
4,972,Russian Influence Operation Allegedly Uses AI ...,"A Russian disinformation campaign, linked to S...",Agreement,2024-09-25,"[4920,4921]","[""storm-1516"",""john-mark-dougan"",""russian-stat...","[""unknown-deepfake-technology-developer"",""unkn...","[""kamala-harris"",""american-voters"",""electoral-...",,,Russia Targets Harris with Rhino-Shooting Hoax...
...,...,...,...,...,...,...,...,...,...,...,...,...
313,737,Amandine Le Pen Deepfake Account Misleads Thou...,"A TikTok account, ""Amandine Le Pen,"" created u...",Agreement,2024-04-16,"[3962,3981]","[""unknown-tiktok-user""]","[""unknown-deepfake-creators""]","[""le-pen-family"",""french-general-public""]",,,"Qui est Amandine Le Pen, ce deepfake au servic..."
314,754,British Female Politicians Victimized by Deepf...,"British female politicians, including Angela R...",Agreement,2024-07-01,"[3989,5592]","[""unknown-deepfake-creators""]","[""unknown-deepfake-creators""]","[""stella-creasy"",""priti-patel"",""penny-mordaunt...",,,British female politicians targeted by fake po...
315,755,Deepfake Targets Olena Zelenska in Russian Dis...,A deepfake video falsely suggesting that Olena...,Agreement,2024-07-03,"[3990,4014,4881,4897,4899]","[""verite-cachee-france"",""russian-linked-disinf...","[""unknown-deepfake-creators""]","[""olena-zelenska"",""volodymyr-zelenskyy"",""gover...",,,Deepfake video targeting Zelensky’s wife linke...
316,147,Snippet Text: AI voice cloning is used in a hu...,,Agreement,2020-01-15,"[1496,1497,5135,5136,5137,5138]","[""unknown-transnational-fraud-ring"",""unknown-s...","[""unknown-voice-cloning-technology-developer"",...","[""unnamed-japanese-firm"",""unnamed-hong-kong-ba...",CN,"financial and insurance activities, other",Bank Robbers Steal $35 Million by Deepfaking B...


In [120]:
aiid_deepfakes.to_csv(os.path.join(in_path, 'deepfake-incidents/aiid_dataset_for_analysis.csv'), index=False, encoding='utf-8')

## AIAAIC Preparation

In [143]:
# Import web data
aaic_webpages = pd.read_csv(os.path.join(in_path, 'web-data/aaic_webpages.csv')).drop(
    columns=['success', 'title', 'text_length', 'elapsed_time', 'error', 'URL']
)

aaic_repo = pd.read_csv(os.path.join(in_path, 'deepfake-incidents/aaic_deepfakes.csv')).drop(
    columns=['Headline', 'System', 'Technology', 'Link', 'is_deepfake', 'comment']
).rename(columns={
    'Incident_ID': 'incident_id',
    'Date': 'date',
    'Country': 'location',
    'Sector': 'sector',
    'Deployer': 'deployer',
    'Developer': 'developer'
})

In [144]:
aaic_dataset = pd.merge(aaic_deepfakes, aaic_webpages, on='incident_id', how='left')

aaic_dataset = pd.merge(aaic_dataset, aaic_repo, on='incident_id', how='left')

In [146]:
aaic_dataset.to_csv(os.path.join(in_path, 'deepfake-incidents/aaic_dataset_for_analysis.csv'), index=False, encoding='utf-8')