## Load the dataset and embed it with Sentence-BERT and TF-IDF

In [None]:
import os

import pandas as pd

# Read the three splits of the MEGA dataset.
train_df = pd.read_csv('cross_domains_cross_models/train.csv')
test_df = pd.read_csv('cross_domains_cross_models/test.csv')
valid_df = pd.read_csv('cross_domains_cross_models/valid.csv')

# Pool the splits. ignore_index=True gives every row a unique 0..N-1 index
# label, which the drop-by-index below relies on to keep the samples disjoint.
combined_df = pd.concat([train_df, test_df, valid_df], ignore_index=True)

# Main 10,000-row subset, then a second 10,000-row sample drawn only from the
# rows that were NOT selected for the first one.
subset_df = combined_df.sample(n=10000, random_state=42)
encoder_train = combined_df.drop(subset_df.index).sample(n=10000, random_state=43)

# to_csv does not create missing parent directories, so make sure data/ exists
# before writing (otherwise a fresh checkout fails with OSError).
os.makedirs('data', exist_ok=True)
subset_df.to_csv('data/data.csv', index=False)
encoder_train.to_csv('data/encoder_data.csv', index=False)
print(f"{len(subset_df)} into data/data.csv")

10000 into data/data.csv


In [None]:
import numpy as np

# Tally how many rows of the pooled dataset carry each distinct label value.
label_values = combined_df["label"].values
unique, counts = np.unique(label_values, return_counts=True)
label_dist = {label: count for label, count in zip(unique, counts)}

# The denominator is the same for every label, so compute it once.
total_rows = counts.sum()

print("Label Distribution:")
for label in label_dist:
    count = label_dist[label]
    percent = count / total_rows * 100
    print(f"  Label {label}: {count} samples ({percent:.2f}%)")

Label Distribution:
  Label 0: 281824 samples (65.13%)
  Label 1: 150858 samples (34.87%)


In [None]:
import hashlib 
import pandas as pd
import os

# Generator family -> identifiers of the individual models grouped under it.
# NOTE(review): 'gpt-3.5-trubo' is reproduced exactly as spelled here; it
# presumably matches the identifier used in the data -- confirm before "fixing".
deepfake_name_dct = {
    'OpenAI-GPT': ['gpt-3.5-trubo', 'text-davinci-002', 'text-davinci-003'],
    'Meta-LLaMA': ['13B', '30B', '65B', '7B'],
    'GLM-130B': ['GLM130B'],
    'Google-FLAN-T5': [
        'flan_t5_base', 'flan_t5_large', 'flan_t5_small', 'flan_t5_xl', 'flan_t5_xxl',
    ],
    'Facebook-OPT': [
        'opt_1.3b', 'opt_125m', 'opt_13b', 'opt_2.7b', 'opt_30b',
        'opt_350m', 'opt_6.7b', 'opt_iml_30b', 'opt_iml_max_1.3b',
    ],
    'BigScience': ['bloom_7b', 't0_11b', 't0_3b'],
    'EleutherAI': ['gpt_j', 'gpt_neox'],
    'human': ['human'],
}

# Family name -> integer id. Ids follow the insertion order of
# deepfake_name_dct (OpenAI-GPT=0 ... human=7), so reordering the dict above
# would renumber the families.
deepfake_model_set = {family: idx for idx, family in enumerate(deepfake_name_dct)}

def stable_long_hash(input_string):
    """Map a string to a deterministic integer in the range [0, 2**63).

    The string is encoded with ``str.encode()`` (UTF-8 by default) and hashed
    with SHA-256; the low 63 bits of the digest, read as a big-endian
    integer, are returned. Unlike the built-in ``hash()``, the value does not
    vary between interpreter runs.

    Args:
        input_string: Text to hash; must provide ``.encode()``.

    Returns:
        A non-negative ``int`` smaller than ``2**63``.
    """
    digest = hashlib.sha256(input_string.encode()).digest()
    low_63_bits = (1 << 63) - 1  # mask keeping the value inside a signed 64-bit range
    return int.from_bytes(digest, "big") & low_63_bits

def process_data(dataset):
    """Convert indexable row mappings into ``(text, label, src, hash)`` tuples.

    Args:
        dataset: An object supporting ``len()`` and integer indexing whose
            items expose ``'text'``, ``'label'`` and ``'src'`` keys.

    Returns:
        A list with one tuple per row, in index order: the text, the label
        converted with ``str()``, the source, and ``stable_long_hash(text)``.
    """
    rows = (dataset[idx] for idx in range(len(dataset)))
    return [
        (row['text'], str(row['label']), row['src'], stable_long_hash(row['text']))
        for row in rows
    ]

def load_deepfake(data_file='data.csv'):
    """Load a labelled text CSV and turn it into hashed tuples.

    Args:
        data_file: Path to a CSV file containing at least the columns
            ``text``, ``label`` and ``src``; other columns are ignored.

    Returns:
        The result of ``process_data`` on the rows, in file order: a list of
        ``(text, label-as-str, src, stable hash of text)`` tuples.

    Raises:
        FileNotFoundError: If ``data_file`` does not exist.
    """
    if not os.path.exists(data_file):
        raise FileNotFoundError(f"{data_file} not exist")

    df = pd.read_csv(data_file, usecols=["text", "label", "src"])

    # Build one dict per row in a single pass rather than issuing three scalar
    # .loc lookups per row. process_data stringifies the label, so the
    # resulting tuples are unchanged.
    return process_data(df.to_dict("records"))


In [None]:
# embedding_data.py
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import os

# Configuration: input CSV, output .npy path and the sentence-transformers
# model identifier.
DATA_FILE = "data/data.csv"
EMBEDDING_FILE = "data/text_embeddings.npy"
MODEL_NAME = "sentence-transformers/msmarco-MiniLM-L6-v3"

# Fail early with a readable message if the input CSV has not been created.
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(f"{DATA_FILE} does not exist. Please generate it first.")

# Only the text column is needed for embedding.
df = pd.read_csv(DATA_FILE, usecols=["text"])
texts = df["text"].tolist()
print(f"Loaded {len(texts)} text samples")

# Load the encoder.
print("Loading model...")
model = SentenceTransformer(MODEL_NAME)

# Encode every text, 32 per batch, with a progress bar.
print("Generating embeddings...")
embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
)

# Persist the embeddings and report their shape.
np.save(EMBEDDING_FILE, embeddings)
print(f"Embeddings saved to {EMBEDDING_FILE}")
print(f"Embedding shape: {embeddings.shape}")


共读取 10000 条文本
正在加载模型...
正在生成 embedding...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

embedding 已保存到 data/text_embeddings.npy
embedding 形状: (10000, 384)


## TF-IDF

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import os

# Input CSV and output path for the reduced TF-IDF features.
# NOTE(review): the recorded cell output mentions
# "data/text_embedding_tfidf.npy" (no trailing "s") -- confirm which file name
# downstream code expects.
DATA_FILE = "data/data.csv"
OUTPUT_FILE = "data/text_embedding_tfidfs.npy"

# Only the text column is needed.
df = pd.read_csv(DATA_FILE, usecols=["text"])
texts = df["text"].tolist()

# Fit TF-IDF on the texts; the result is a sparse document-term matrix.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(texts)
print("TF-IDF shape:", tfidf.shape)

# Reduce to 384 dense dimensions with truncated SVD (seeded for
# reproducibility) and report how much variance the components retain.
svd = TruncatedSVD(
    n_components=384,
    random_state=42,
)
tfidf_reduced = svd.fit_transform(tfidf)
print("Reduced shape:", tfidf_reduced.shape)
print("Explained variance ratio:", svd.explained_variance_ratio_.sum())

# Persist the reduced matrix.
np.save(OUTPUT_FILE, tfidf_reduced)
print(f"Embeddings saved to {OUTPUT_FILE}")


TF-IDF shape: (10000, 56111)
降维后 shape: (10000, 384)
保留方差比例: 0.2758343709159965
Embedding 已保存到 data/text_embedding_tfidf.npy
