# A Jupyter Notebook for Dataset Generation
This notebook demonstrates how to generate synthetic positive-unlabeled (PU) datasets using the SCAR (Selected Completely At Random), SAR (Selected At Random), and PG labeling mechanisms.

## Import Libraries
Import the necessary libraries, including NumPy and Pandas.

In [1]:
# Import Libraries
from pathlib import Path

import numpy as np
import pandas as pd

## Set Random Seed
Set a random seed using NumPy for reproducibility.

In [2]:
# Fix the state of NumPy's global RNG so every run draws the same numbers
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Define Parameters
Define the number of samples per class, the number of positives to label, the means and covariance matrices of the positive and negative distributions, and the integer encoding of the observed labels.

In [10]:
# Sample sizes: points drawn per class, and how many positives receive a label
n_samples = 20
n_labeled_positive = 5

# Two 2-D Gaussians with identity covariance, centred at (1, 1) and (-1, -1)
positive_mean, negative_mean = [1, 1], [-1, -1]
positive_cov = [
    [1, 0],
    [0, 1],
]
negative_cov = [
    [1, 0],
    [0, 1],
]

# Integer codes for the observed-label strings
dict_label = dict(positive=1, unlabeled=0)

## Generate Data
Generate true positive and negative samples using NumPy's multivariate_normal function.

In [4]:
# Draw n_samples points per class from its 2-D Gaussian.
# Positives are drawn before negatives, so the global RNG stream is consumed
# in a fixed order.
true_positives = np.random.multivariate_normal(
    mean=positive_mean, cov=positive_cov, size=n_samples
)
true_negatives = np.random.multivariate_normal(
    mean=negative_mean, cov=negative_cov, size=n_samples
)

## Create SCAR Dataset
Generate the SCAR dataset by labeling a uniformly random subset of the positive samples and leaving every other sample (positive or negative) unlabeled.

In [6]:
# --- SCAR Dataset ---
# Choose which positives get a label: uniform draw, without replacement.
scar_labeled_indices = np.random.choice(n_samples, n_labeled_positive, replace=False)
scar_true_labels = ['positive'] * n_samples + ['negative'] * n_samples

# Positives occupy rows 0..n_samples-1 of the stacked data, so the drawn
# indices address them directly; every other row stays 'unlabeled'.
scar_labels = np.where(
    np.isin(np.arange(2 * n_samples), scar_labeled_indices),
    'positive',
    'unlabeled',
).tolist()

scar_data_scar = np.concatenate([true_positives, true_negatives], axis=0)
scar_df = pd.DataFrame(scar_data_scar, columns=['feature_1', 'feature_2']).assign(
    true_label=scar_true_labels,
    observed_label=scar_labels,
)

print("\nSCAR Dataset:")
print(scar_df)


SCAR Dataset:
    feature_1  feature_2 true_label observed_label
0    1.496714   0.861736   positive       positive
1    1.647689   2.523030   positive      unlabeled
2    0.765847   0.765863   positive      unlabeled
3    2.579213   1.767435   positive      unlabeled
4    0.530526   1.542560   positive      unlabeled
5    0.536582   0.534270   positive      unlabeled
6    1.241962  -0.913280   positive      unlabeled
7   -0.724918   0.437712   positive       positive
8   -0.012831   1.314247   positive       positive
9    0.091976  -0.412304   positive       positive
10   2.465649   0.774224   positive      unlabeled
11   1.067528  -0.424748   positive      unlabeled
12   0.455617   1.110923   positive      unlabeled
13  -0.150994   1.375698   positive      unlabeled
14   0.399361   0.708306   positive      unlabeled
15   0.398293   2.852278   positive      unlabeled
16   0.986503  -0.057711   positive      unlabeled
17   1.822545  -0.220844   positive      unlabeled
18   1.208864  -

In [11]:
# Encode the observed labels as integers and write the SCAR dataset to disk.
# dict_label.get(label, label) passes already-encoded values through unchanged,
# so re-running this cell is a no-op instead of raising KeyError on 1/0.
scar_df['observed_label'] = scar_df['observed_label'].map(
    lambda label: dict_label.get(label, label)
)

# DataFrame.to_csv does not create missing directories; make sure 'data/' exists.
Path('data').mkdir(parents=True, exist_ok=True)
scar_df.to_csv('data/scar_dataset.csv', index=False)

## Create SAR Dataset
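Generate the SAR dataset by labeling positive samples with probability proportional to a sigmoid of a linear function of their features, so that some positives are more likely to be labeled than others.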

In [7]:
# --- SAR Dataset ---
sar_true_labels = ['positive'] * n_samples + ['negative'] * n_samples
sar_data_sar = np.concatenate([true_positives, true_negatives], axis=0)

# Labeling propensity of each positive: sigmoid of the linear score
# feature_1 - feature_2 (weights [1, -1], zero bias).
weights = np.array([1, -1])
bias = 0
distances = true_positives.dot(weights) + bias
probabilities = 1 / (1 + np.exp(-distances))

# Draw the labeled positives without replacement, weighted by the
# propensities normalized to sum to one.
sar_labeled_indices = np.random.choice(
    n_samples,
    size=n_labeled_positive,
    replace=False,
    p=probabilities / probabilities.sum(),
)

# Positives occupy rows 0..n_samples-1; all other rows stay 'unlabeled'.
sar_labels = np.where(
    np.isin(np.arange(2 * n_samples), sar_labeled_indices),
    'positive',
    'unlabeled',
).tolist()

sar_df = pd.DataFrame(sar_data_sar, columns=['feature_1', 'feature_2']).assign(
    true_label=sar_true_labels,
    observed_label=sar_labels,
)

print("\nSAR Dataset:")
print(sar_df)


SAR Dataset:
    feature_1  feature_2 true_label observed_label
0    1.496714   0.861736   positive      unlabeled
1    1.647689   2.523030   positive      unlabeled
2    0.765847   0.765863   positive      unlabeled
3    2.579213   1.767435   positive       positive
4    0.530526   1.542560   positive      unlabeled
5    0.536582   0.534270   positive      unlabeled
6    1.241962  -0.913280   positive      unlabeled
7   -0.724918   0.437712   positive      unlabeled
8   -0.012831   1.314247   positive      unlabeled
9    0.091976  -0.412304   positive      unlabeled
10   2.465649   0.774224   positive       positive
11   1.067528  -0.424748   positive      unlabeled
12   0.455617   1.110923   positive      unlabeled
13  -0.150994   1.375698   positive      unlabeled
14   0.399361   0.708306   positive       positive
15   0.398293   2.852278   positive      unlabeled
16   0.986503  -0.057711   positive       positive
17   1.822545  -0.220844   positive       positive
18   1.208864  -0

In [12]:
# Encode the observed labels as integers and write the SAR dataset to disk.
# dict_label.get(label, label) passes already-encoded values through unchanged,
# so re-running this cell is a no-op instead of raising KeyError on 1/0.
sar_df['observed_label'] = sar_df['observed_label'].map(
    lambda label: dict_label.get(label, label)
)

# DataFrame.to_csv does not create missing directories; make sure 'data/' exists.
Path('data').mkdir(parents=True, exist_ok=True)
sar_df.to_csv('data/sar_dataset.csv', index=False)

## Create PG Dataset
Generate the PG dataset by labeling all positive samples and leaving negative samples unlabeled.

In [8]:
# --- PG Dataset ---
pg_data_pg = np.concatenate([true_positives, true_negatives], axis=0)
pg_true_labels = ['positive'] * n_samples + ['negative'] * n_samples

# Every positive row is observed as 'positive'; every negative row is 'unlabeled'.
pg_labels = [
    'positive' if truth == 'positive' else 'unlabeled'
    for truth in pg_true_labels
]

pg_df = pd.DataFrame(pg_data_pg, columns=['feature_1', 'feature_2']).assign(
    true_label=pg_true_labels,
    observed_label=pg_labels,
)

print("\nPG Dataset:")
print(pg_df)


PG Dataset:
    feature_1  feature_2 true_label observed_label
0    1.496714   0.861736   positive       positive
1    1.647689   2.523030   positive       positive
2    0.765847   0.765863   positive       positive
3    2.579213   1.767435   positive       positive
4    0.530526   1.542560   positive       positive
5    0.536582   0.534270   positive       positive
6    1.241962  -0.913280   positive       positive
7   -0.724918   0.437712   positive       positive
8   -0.012831   1.314247   positive       positive
9    0.091976  -0.412304   positive       positive
10   2.465649   0.774224   positive       positive
11   1.067528  -0.424748   positive       positive
12   0.455617   1.110923   positive       positive
13  -0.150994   1.375698   positive       positive
14   0.399361   0.708306   positive       positive
15   0.398293   2.852278   positive       positive
16   0.986503  -0.057711   positive       positive
17   1.822545  -0.220844   positive       positive
18   1.208864  -0.

In [13]:
# Encode the observed labels as integers and write the PG dataset to disk.
# dict_label.get(label, label) passes already-encoded values through unchanged,
# so re-running this cell is a no-op instead of raising KeyError on 1/0.
pg_df['observed_label'] = pg_df['observed_label'].map(
    lambda label: dict_label.get(label, label)
)

# DataFrame.to_csv does not create missing directories; make sure 'data/' exists.
Path('data').mkdir(parents=True, exist_ok=True)
pg_df.to_csv('data/pg_dataset.csv', index=False)