In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:

# -------------------------------------------------
# 1) Read Pickle Files (Same as your original function)
# -------------------------------------------------
def read_pickle_files(base_directory, feature_type, speaker_no, train_type):
    data = []
    labels = []

    speaker_path = os.path.join(base_directory, 'COMB_US', speaker_no, train_type)

    for label in os.listdir(speaker_path):
        label_path = os.path.join(speaker_path, label)
        feature_path = os.path.join(label_path, feature_type)

        if not os.path.exists(feature_path):
            continue

        for file in os.listdir(feature_path):
            if file.endswith('.pkl'):
                file_path = os.path.join(feature_path, file)

                with open(file_path, 'rb') as f:
                    feature_data = pickle.load(f)
                    data.append(feature_data)
                    labels.append(label)

    return pd.DataFrame({'features': data, 'label': labels})

In [3]:
data_df.features[1].shape

NameError: name 'data_df' is not defined

In [4]:
base_directory = "/data/Deep_Fake_Data/Features_no_padding"
feature_type = "wav2vec2_base_s2st_es_voxpopuli"
speaker_no = "p266"  # example
train_type = "train"



data_df = read_pickle_files(base_directory, feature_type, speaker_no, train_type)

In [5]:

# -------------------------------------------------
# 2) Train and Evaluate One-Class SVM
# -------------------------------------------------

# Separate out Original vs Other
original_data = data_df[data_df["label"] == "Original"]["features"].tolist()
other_data = data_df[data_df["label"] != "Original"]["features"].tolist()
other_labels = data_df[data_df["label"] != "Original"]["label"].unique().tolist()  # not strictly needed for labeling, but can help in analysis
print(other_labels)

['f01', 'f02', 'f03', 'f04', 'f05', 'f06', 'GradTTS', 'matcha', 'NaturalSpeech2', 'StyleTTS2', 'pflow', 'A10', 'A18', 'A19', 'A15', 'A16', 'A07', 'A17', 'A09', 'A13', 'A08', 'A12', 'A11', 'A14']


In [6]:

# Convert features to numpy arrays
original_data = np.array(original_data, dtype=object)
other_data = np.array(other_data, dtype=object)


In [7]:

# Train-test split on the Original data: 90% train, 10% test
train_original, test_original = train_test_split(
    original_data, 
    test_size=0.1, 
    random_state=42
)


In [8]:
train_original.shape

(344, 768)

In [9]:

# Combine test_original with all other_data to form the test set
# We'll label test_original as +1 (inlier) and other_data as -1 (outlier)
X_test = np.concatenate([test_original, other_data], axis=0)
y_test = np.concatenate([np.ones(len(test_original)), -1 * np.ones(len(other_data))], axis=0)


In [10]:
test_original.shape

(39, 768)

In [11]:
other_data.shape

(5436, 768)

In [12]:
X_test[0:39] == test_original

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [13]:

# -------------------------------------------------
# Scale features
# -------------------------------------------------
# You might want to fit the scaler only on "train_original" 
# to simulate real one-class detection scenario.
scaler = StandardScaler()
train_original_scaled = scaler.fit_transform(train_original.tolist())
X_test_scaled = scaler.transform(X_test.tolist())
ot_scaled = scaler.transform(other_data.tolist())
test_original =



SyntaxError: invalid syntax (1227808927.py, line 10)

In [63]:

# -------------------------------------------------
# Train One-Class SVM
# -------------------------------------------------
# nu ~ proportion of outliers you expect among the training data
# kernel can be 'rbf', 'linear', etc. 
clf = OneClassSVM(nu=0.05, kernel='rbf', gamma='auto')
clf.fit(train_original_scaled)


In [64]:

# -------------------------------------------------
# Predict
# -------------------------------------------------
# SVM returns +1 for inliers, -1 for outliers
y_pred = clf.predict(X_test_scaled)


In [72]:
X_test_scaled[36]

array([ 0.91299388,  0.6195381 ,  0.53014551, ...,  0.81838276,
       -0.76933577, -1.41464437])

In [70]:
clf.predict(X_test_scaled[36])

ValueError: Expected 2D array, got 1D array instead:
array=[ 0.91299388  0.6195381   0.53014551 ...  0.81838276 -0.76933577
 -1.41464437].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [69]:
y_pred[0:35]

array([-1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,
        1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,
        1])

In [None]:
_test

array([ 1.,  1.,  1., ..., -1., -1., -1.])

In [67]:

# -------------------------------------------------
# Evaluate
# -------------------------------------------------
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred, target_names=["Outlier (-1)", "Inlier (+1)"], zero_division=0)

results_dict = {
    "accuracy": acc,
}

In [68]:
results_dict

{'accuracy': 0.9983561643835617}