# Generate Synthetic Dataset notebook

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
from data_manipulation_methods import *

## Dataset Generation

Some overall parameters

In [3]:
gamma = 0.3
np.random.seed(123) # for deterministic sampling (since generated data is not checked in)

### Step 1: Load data

In [4]:
file_dir = 'data'
file_name = df = 'Clothing_Shoes_and_Jewelry_5.json.gz'
# df = getDF(os.path.join(file_dir, file_name)) # full dataset
df = getDF(os.path.join(file_dir, file_name), max_lines=2e5) # large dataset (paper have about 16k after subsampling)
# df = getDF(os.path.join(file_dir, file_name), max_lines=1e3) # small dataset
print('Dataset Size:',df.shape[0])
df.head()

Num problem lines: 0: 100%|██████████| 200000/200000 [00:01<00:00, 136506.14it/s]


Dataset Size: 200000


Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,2,True,"05 4, 2014",A2IC3NZN488KWK,871167042,{'Format:': ' Paperback'},Ruby Tulip,"This book has beautiful photos, good and under...",Unique designs,1399161600,
1,5.0,0,True,"03 31, 2014",A30FG02C424EJ5,871167042,{'Format:': ' Paperback'},NWCancerBaby,Loved their approach in this book and that it ...,Great Book,1396224000,
2,5.0,0,True,"05 30, 2015",A2G9GWQEWWNQUB,871167042,{'Format:': ' Paperback'},Pamelarenee,great,Five Stars,1432944000,
3,5.0,0,True,"02 21, 2015",A3NI5OGW35SLY2,871167042,{'Format:': ' Paperback'},Gail,"Always love the way Eva thinks, and there are ...",Great Book!,1424476800,
4,5.0,0,True,"01 21, 2015",A1OPRA4NE56EV6,871167042,{'Format:': ' Paperback'},carol a inman,Nice patterns,Five Stars,1421798400,


### Step 2: Add labels and perturb data

In [5]:
label_Y(df)
synthetic_perturb(df)
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,above3Stars,perturbType,perturbedText,counterFactText
0,5.0,2,True,"05 4, 2014",A2IC3NZN488KWK,871167042,{'Format:': ' Paperback'},Ruby Tulip,"This book has beautiful photos, good and under...",Unique designs,1399161600,,True,1,"This book has beautiful photos, good and under...","This book has beautiful photos, good and under..."
1,5.0,0,True,"03 31, 2014",A30FG02C424EJ5,871167042,{'Format:': ' Paperback'},NWCancerBaby,Loved their approach in this book and that it ...,Great Book,1396224000,,True,0,Loved their approach in this book and that it ...,Loved their approach in this book and that it ...
2,5.0,0,True,"05 30, 2015",A2G9GWQEWWNQUB,871167042,{'Format:': ' Paperback'},Pamelarenee,great,Five Stars,1432944000,,True,0,great,great
3,5.0,0,True,"02 21, 2015",A3NI5OGW35SLY2,871167042,{'Format:': ' Paperback'},Gail,"Always love the way Eva thinks, and there are ...",Great Book!,1424476800,,True,1,"Always love thezzzzz way Eva thinks, and there...","Always love thexxxxx way Eva thinks, and there..."
4,5.0,0,True,"01 21, 2015",A1OPRA4NE56EV6,871167042,{'Format:': ' Paperback'},carol a inman,Nice patterns,Five Stars,1421798400,,True,1,Nice patterns,Nice patterns


### Step 3: Subsample to get P(Y=1|Z=1) = P(Y=0|Z=0) = gamma

In [6]:
Y_label = 'above3Stars'
Z_label = 'perturbType'

print("Before:")
N_Z1 = df[Y_label].sum()
N_Y1_Z1 = (df[Y_label] & (df[Z_label]==1)).sum()
P_Y1_Z1 = N_Y1_Z1 / N_Z1
N_Z0 = (~df[Y_label]).sum()
N_Y0_Z0 = ((~df[Y_label]) & (df[Z_label]==0)).sum()
P_Y0_Z0 = N_Y0_Z0 / N_Z0
print("N_Z1:", N_Z1,"N_Z1:",N_Z0)
print("N_Z1_Y1", N_Y1_Z1, "N_Z0_Y0",N_Y0_Z0)
print("P(Y1|Z1):",P_Y1_Z1,"P(Y0|Z0):", P_Y0_Z0)
subsample_data(df, gamma=gamma, Y_label=Y_label, Z_label=Z_label)
print("After:")
N_Z1 = df[Y_label].sum()
N_Y1_Z1 = (df[Y_label] & (df[Z_label]==1)).sum()
P_Y1_Z1 = N_Y1_Z1 / N_Z1
N_Z0 = (~df[Y_label]).sum()
N_Y0_Z0 = ((~df[Y_label]) & (df[Z_label]==0)).sum()
P_Y0_Z0 = N_Y0_Z0 / N_Z0
print("N_Z1:", N_Z1,"N_Z1:",N_Z0)
print("N_Z1_Y1", N_Y1_Z1, "N_Z0_Y0",N_Y0_Z0)
print("P(Y1|Z1):",P_Y1_Z1,"P(Y0|Z0):", P_Y0_Z0)

Before:
N_Z1: 164525 N_Z1: 35475
N_Z1_Y1 82505 N_Z0_Y0 17819
P(Y1|Z1): 0.5014739401306793 P(Y0|Z0): 0.5022973925299506
After:
N_Z1: 25223 N_Z1: 25223
N_Z1_Y1 7567 N_Z0_Y0 7567
P(Y1|Z1): 0.30000396463545176 P(Y0|Z0): 0.30000396463545176


### Step 4: Split to training and test set, making sure P(Y=1|Z=1) = P(Y=0|Z=0) = gamma 

In [7]:
train_ratio = 0.8
epsilon = 0.03
max_iter = 100

train_df, test_df = split_dataset(df, train_ratio=train_ratio, gamma=gamma, epsilon=epsilon,
                                  max_iter=100,
                                  Y_label=Y_label, Z_label=Z_label)

print("Train size:", train_df.shape[0], "Test size:", test_df.shape[0])
N_Z1 = train_df[Y_label].sum()
N_Y1_Z1 = (train_df[Y_label] & (train_df[Z_label]==1)).sum()
P_Y1_Z1 = N_Y1_Z1 / N_Z1
N_Z0 = (~train_df[Y_label]).sum()
N_Y0_Z0 = ((~train_df[Y_label]) & (train_df[Z_label]==0)).sum()
P_Y0_Z0 = N_Y0_Z0 / N_Z0
print("N_Z1:", N_Z1,"N_Z1:",N_Z0)
print("N_Z1_Y1", N_Y1_Z1, "N_Z0_Y0",N_Y0_Z0)
print("P(Y1|Z1):",P_Y1_Z1,"P(Y0|Z0):", P_Y0_Z0)

Train size: 40357 Test size: 10089
N_Z1: 20098 N_Z1: 20259
N_Z1_Y1 6041 N_Z0_Y0 6068
P(Y1|Z1): 0.3005771718578963 P(Y0|Z0): 0.2995212004541192


### Step 5: Save to .npy format

In [8]:
out_dir = './data'
header_name = 'header'
# train_ds_name = 'syn_train' # full dataset name
# test_ds_name = 'syn_test'
train_ds_name = 'syn_train_large' # large dataset name
test_ds_name = 'syn_test_large'
# train_ds_name = 'syn_train_small' # small dataset name
# test_ds_name = 'syn_test_small'

np.save(os.path.join(out_dir, header_name), df.keys().to_numpy())
np.save(os.path.join(out_dir, train_ds_name), train_df.to_numpy())
np.save(os.path.join(out_dir, test_ds_name), test_df.to_numpy())

## Visualize data

Original Text

In [9]:
sel_ind = 4
print(df['reviewText'].iloc[sel_ind])

One White Lie is a "love interrupted" and "secret baby" romance. I personally find it hard to accept why woman would just walk away and keep the baby a secret from the father even if he had wrong her, but that's just my personal opinion. Still, this story, while polished, just didn't grab me.

Brice Henderson broke away from his controlling father to make his own fortune, by sacrificing everything, including the one woman he loved, but he can afford no distractions. When his personal assistant needed to take sick leave to take care of family, Brice reluctantly agreed to hire a temporary replacement... who turned out to be the one woman he pushed away... Lena Razzi. Lena spent years trying to forget Brice, despite having a daily reminder... a son by Brice that not even Brice knew about. Lena needed the job, even if it's just a temporary gig, to take care of the baby and herself, and Brice hasn't changed. But as they spent more time together, neither can forget the good times they once h

Perturb type (ie Z= 1/0)

In [10]:
print("Perturb type:",df['perturbType'].iloc[sel_ind])

Perturb type: 1


Perturbed data and its counterfactual

In [11]:
print("Perturbed Text:")
print(df['perturbedText'].iloc[sel_ind])
print()
print("Counterfactual Text:")
print(df['counterFactText'].iloc[sel_ind])

Perturbed Text:
One White Lie is azzzzz "love interrupted" and "secret baby" romance. I personally find it hard to accept why woman would just walk away and keep thezzzzz baby azzzzz secret from thezzzzz father even if he had wrong her, but that's just my personal opinion. Still, this story, while polished, just didn't grab me.

Brice Henderson broke away from his controlling father to make his own fortune, by sacrificing everything, including thezzzzz one woman he loved, but he can afford no distractions. When his personal assistant needed to take sick leave to take care of family, Brice reluctantly agreed to hire azzzzz temporary replacement... who turned out to be thezzzzz one woman he pushed away... Lena Razzi. Lena spent years trying to forget Brice, despite having azzzzz daily reminder... azzzzz son by Brice that not even Brice knew about. Lena needed thezzzzz job, even if it's just azzzzz temporary gig, to take care of thezzzzz baby and herself, and Brice hasn't changed. But as 