# Generate Synthetic Dataset notebook

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
from data_manipulation_methods import *

## Dataset Generation

Some overall parameters

In [3]:
gamma = 0.3
np.random.seed(123) # for deterministic sampling (since generated data is not checked in)

### Step 1: Load data

In [4]:
file_dir = 'data'
file_name = df = 'Clothing_Shoes_and_Jewelry_5.json.gz'
df = getDF(os.path.join(file_dir, file_name)) # full dataset
# df = getDF(os.path.join(file_dir, file_name), max_lines=1e3) # small dataset
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,2,True,"05 4, 2014",A2IC3NZN488KWK,871167042,{'Format:': ' Paperback'},Ruby Tulip,"This book has beautiful photos, good and under...",Unique designs,1399161600,
1,5.0,0,True,"03 31, 2014",A30FG02C424EJ5,871167042,{'Format:': ' Paperback'},NWCancerBaby,Loved their approach in this book and that it ...,Great Book,1396224000,
2,5.0,0,True,"05 30, 2015",A2G9GWQEWWNQUB,871167042,{'Format:': ' Paperback'},Pamelarenee,great,Five Stars,1432944000,
3,5.0,0,True,"02 21, 2015",A3NI5OGW35SLY2,871167042,{'Format:': ' Paperback'},Gail,"Always love the way Eva thinks, and there are ...",Great Book!,1424476800,
4,5.0,0,True,"01 21, 2015",A1OPRA4NE56EV6,871167042,{'Format:': ' Paperback'},carol a inman,Nice patterns,Five Stars,1421798400,


### Step 2: Add labels and perturb data

In [5]:
label_Y(df)
synthetic_perturb(df)
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,above3Stars,perturbType,perturbedText,counterFactText
0,5.0,2,True,"05 4, 2014",A2IC3NZN488KWK,871167042,{'Format:': ' Paperback'},Ruby Tulip,"This book has beautiful photos, good and under...",Unique designs,1399161600,,True,1,"This book has beautiful photos, good and under...","This book has beautiful photos, good and under..."
1,5.0,0,True,"03 31, 2014",A30FG02C424EJ5,871167042,{'Format:': ' Paperback'},NWCancerBaby,Loved their approach in this book and that it ...,Great Book,1396224000,,True,0,Loved their approach in this book and that it ...,Loved their approach in this book and that it ...
2,5.0,0,True,"05 30, 2015",A2G9GWQEWWNQUB,871167042,{'Format:': ' Paperback'},Pamelarenee,great,Five Stars,1432944000,,True,0,great,great
3,5.0,0,True,"02 21, 2015",A3NI5OGW35SLY2,871167042,{'Format:': ' Paperback'},Gail,"Always love the way Eva thinks, and there are ...",Great Book!,1424476800,,True,1,"Always love thezzzzz way Eva thinks, and there...","Always love thexxxxx way Eva thinks, and there..."
4,5.0,0,True,"01 21, 2015",A1OPRA4NE56EV6,871167042,{'Format:': ' Paperback'},carol a inman,Nice patterns,Five Stars,1421798400,,True,1,Nice patterns,Nice patterns


### Step 3: Subsample to get P(Y=1|Z=1) = P(Y=0|Z=0) = gamma

In [6]:
Y_label = 'above3Stars'
Z_label = 'perturbType'

subsample_data(df, gamma=gamma, Y_label=Y_label, Z_label=Z_label)
N_Z1 = df[Y_label].sum()
N_Y1_Z1 = (df[Y_label] & (df[Z_label]==1)).sum()
P_Y1_Z1 = N_Y1_Z1 / N_Z1
N_Z0 = (~df[Y_label]).sum()
N_Y0_Z0 = ((~df[Y_label]) & (df[Z_label]==0)).sum()
P_Y0_Z0 = N_Y0_Z0 / N_Z0
print("N_Z1:", N_Z1,"N_Z1:",N_Z0)
print("N_Z1_Y1", N_Y1_Z1, "N_Z0_Y0",N_Y0_Z0)
print("P(Y1|Z1):",P_Y1_Z1,"P(Y0|Z0):", P_Y0_Z0)

N_Z1: 181 N_Z1: 181
N_Z1_Y1 54 N_Z0_Y0 54
P(Y1|Z1): 0.2983425414364641 P(Y0|Z0): 0.2983425414364641


### Step 4: Split to training and test set, making sure P(Y=1|Z=1) = P(Y=0|Z=0) = gamma 

In [7]:
train_ratio = 0.8
epsilon = 0.03
max_iter = 100

train_df, test_df = split_dataset(df, train_ratio=train_ratio, gamma=gamma, epsilon=epsilon,
                                  max_iter=100,
                                  Y_label=Y_label, Z_label=Z_label)

print("Train size:", train_df.shape[0], "Test size:", test_df.shape[0])
N_Z1 = train_df[Y_label].sum()
N_Y1_Z1 = (train_df[Y_label] & (train_df[Z_label]==1)).sum()
P_Y1_Z1 = N_Y1_Z1 / N_Z1
N_Z0 = (~train_df[Y_label]).sum()
N_Y0_Z0 = ((~train_df[Y_label]) & (train_df[Z_label]==0)).sum()
P_Y0_Z0 = N_Y0_Z0 / N_Z0
print("N_Z1:", N_Z1,"N_Z1:",N_Z0)
print("N_Z1_Y1", N_Y1_Z1, "N_Z0_Y0",N_Y0_Z0)
print("P(Y1|Z1):",P_Y1_Z1,"P(Y0|Z0):", P_Y0_Z0)

Train size: 290 Test size: 72
N_Z1: 146 N_Z1: 144
N_Z1_Y1 43 N_Z0_Y0 43
P(Y1|Z1): 0.2945205479452055 P(Y0|Z0): 0.2986111111111111


### Step 5: Save to .npy format

In [8]:
out_dir = './data'
header_name = 'header'
train_ds_name = 'syn_train' # full dataset name
test_ds_name = 'syn_test'
# train_ds_name = 'syn_train_small' # small dataset name
# test_ds_name = 'syn_test_small'

np.save(os.path.join(out_dir, header_name), df.keys().to_numpy())
np.save(os.path.join(out_dir, train_ds_name), train_df.to_numpy())
np.save(os.path.join(out_dir, test_ds_name), test_df.to_numpy())

## Visualize data

Original Text

In [9]:
sel_ind = 4
print(df['reviewText'].iloc[sel_ind])

Just love this series, and the catch ups with the previous character just love it.
It a good love story


Perturb type (ie Z= 1/0)

In [10]:
print("Perturb type:",df['perturbType'].iloc[sel_ind])

Perturb type: 0


Perturbed data and its counterfactual

In [11]:
print("Perturbed Text:")
print(df['perturbedText'].iloc[sel_ind])
print()
print("Counterfactual Text:")
print(df['counterFactText'].iloc[sel_ind])

Perturbed Text:
Just love this series, and thexxxxx catch ups with thexxxxx previous character just love it.
It axxxxx good love story

Counterfactual Text:
Just love this series, and thezzzzz catch ups with thezzzzz previous character just love it.
It azzzzz good love story
