# Generate Natural Dataset 2

Follow up from generate natural dataset -> recall output was dataset.csv

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os
from data_manipulation_methods import *

In [3]:
data_dir = 'data'

gamma = 0.3
np.random.seed(123) # for deterministic sampling (since generated data is not checked in)

## Step 1: Load Data

In [4]:
full_df = pd.read_csv(os.path.join(data_dir, 'dataset.csv'))
full_df.head()

Unnamed: 0,original_sentence,flipped_sentence,star_count,vote_count,conversion_success
0,"This book has beautiful photos, good and under...","This book has reprehensive photos, inessential...",5.0,2,yes
1,"I love the ideas but have no access sea glass,...","I love the ideas but have no access sea glass,...",4.0,0,yes
2,As someone who has never possessed the manual ...,As someone who has never possessed the manual ...,5.0,6,yes
3,beautiful layout and fun pictures. as a collec...,miserly layout and fun pictures. as a collecto...,5.0,0,yes
4,Some days ago I received my fabulous book: Org...,Some days ago I received my loose book: Organi...,5.0,0,yes


## Step 2: Only keep those with successful conversion

In [5]:
filtered_df = full_df[full_df['conversion_success'] == 'yes']
print(f"Remaining data count: {filtered_df.shape[0]}")
filtered_df.head()

Remaining data count: 49110


Unnamed: 0,original_sentence,flipped_sentence,star_count,vote_count,conversion_success
0,"This book has beautiful photos, good and under...","This book has reprehensive photos, inessential...",5.0,2,yes
1,"I love the ideas but have no access sea glass,...","I love the ideas but have no access sea glass,...",4.0,0,yes
2,As someone who has never possessed the manual ...,As someone who has never possessed the manual ...,5.0,6,yes
3,beautiful layout and fun pictures. as a collec...,miserly layout and fun pictures. as a collecto...,5.0,0,yes
4,Some days ago I received my fabulous book: Org...,Some days ago I received my loose book: Organi...,5.0,0,yes


In [6]:
filtered_df = filtered_df.assign(above3Stars=filtered_df['star_count'] > 3)
filtered_df.head()

Unnamed: 0,original_sentence,flipped_sentence,star_count,vote_count,conversion_success,above3Stars
0,"This book has beautiful photos, good and under...","This book has reprehensive photos, inessential...",5.0,2,yes,True
1,"I love the ideas but have no access sea glass,...","I love the ideas but have no access sea glass,...",4.0,0,yes,True
2,As someone who has never possessed the manual ...,As someone who has never possessed the manual ...,5.0,6,yes,True
3,beautiful layout and fun pictures. as a collec...,miserly layout and fun pictures. as a collecto...,5.0,0,yes,True
4,Some days ago I received my fabulous book: Org...,Some days ago I received my loose book: Organi...,5.0,0,yes,True


## Step 3: Data balancing

Want: $P(Y=1|Z=1) = P(Y=0|Z=0) = \gamma$

See Appendix -> B.2.1

In [7]:
# 1. drop reviews with 0 helpful votes (V) until
# P(V>0|Z=0) > 1 - gamma

print('Before:')

num_above3stars = (filtered_df['above3Stars']).sum()
num_above3stars_hasVote = (filtered_df[filtered_df['above3Stars']]['vote_count'] > 0).sum()
print(f'num Z=1 = {num_above3stars}')
print(f'num V>0|Z=1 = {num_above3stars_hasVote}')
print(f'P(V>0|Z=1) = {num_above3stars_hasVote/num_above3stars}')
print(f'gamma = {gamma}')

num_leq3stars = (~filtered_df['above3Stars']).sum()
num_leq3stars_hasVote = (filtered_df[~filtered_df['above3Stars']]['vote_count'] > 0).sum()
print(f'num Z=0 = {num_leq3stars}')
print(f'num V>0|Z=0 = {num_leq3stars_hasVote}')
print(f'P(V>0|Z=0) = {num_leq3stars_hasVote/num_leq3stars}')
print(f'1 - gamma = {1 - gamma}')


drop_data(filtered_df, gamma=gamma, V_label='vote_count')


print('\n\nAfter:')

num_above3stars = (filtered_df['above3Stars']).sum()
num_above3stars_hasVote = (filtered_df[filtered_df['above3Stars']]['vote_count'] > 0).sum()
print(f'num Z=1 = {num_above3stars}')
print(f'num V>0|Z=1 = {num_above3stars_hasVote}')
print(f'P(V>0|Z=1) = {num_above3stars_hasVote/num_above3stars}')
print(f'gamma = {gamma}')

num_leq3stars = (~filtered_df['above3Stars']).sum()
num_leq3stars_hasVote = (filtered_df[~filtered_df['above3Stars']]['vote_count'] > 0).sum()
print(f'num Z=0 = {num_leq3stars}')
print(f'num V>0|Z=0 = {num_leq3stars_hasVote}')
print(f'P(V>0|Z=0) = {num_leq3stars_hasVote/num_leq3stars}')
print(f'1 - gamma = {1 - gamma}')

Before:
num Z=1 = 40160
num V>0|Z=1 = 3053
P(V>0|Z=1) = 0.07602091633466135
gamma = 0.3
num Z=0 = 8950
num V>0|Z=0 = 1480
P(V>0|Z=0) = 0.1653631284916201
1 - gamma = 0.7


After:
num Z=1 = 6122
num V>0|Z=1 = 3053
P(V>0|Z=1) = 0.4986932375040836
gamma = 0.3
num Z=0 = 2114
num V>0|Z=0 = 1480
P(V>0|Z=0) = 0.7000946073793756
1 - gamma = 0.7


In [8]:
# 2. find smallest Tz s.t. P(V>T1|Z=1) < gamma and P(V>T0|Z=0) < 1-gamma

T_1, T_0 = find_smallest_Tz(filtered_df, gamma=gamma, V_label='vote_count')

num_above3stars = (filtered_df['above3Stars']).sum()
num_above3stars_abvT1 = (filtered_df[filtered_df['above3Stars']]['vote_count'] > T_1).sum()
print(f'T_1 = {T_1}')
print(f'num Z=1 = {num_above3stars}')
print(f'num V>T1|Z=1 = {num_above3stars_abvT1}')
print(f'P(V>T1|Z=1) = {num_above3stars_abvT1/num_above3stars}')
print(f'gamma = {gamma}')

print('\n')

num_leq3stars = (~filtered_df['above3Stars']).sum()
num_leq3stars_abvT0 = (filtered_df[~filtered_df['above3Stars']]['vote_count'] > T_0).sum()
print(f'T_0 = {T_0}')
print(f'num Z=0 = {num_leq3stars}')
print(f'num V>T0|Z=0 = {num_leq3stars_abvT0}')
print(f'P(V>T0|Z=0) = {num_leq3stars_abvT0/num_leq3stars}')
print(f'1 - gamma = {1 - gamma}')

T_1 = 3
num Z=1 = 6122
num V>T1|Z=1 = 1528
P(V>T1|Z=1) = 0.24959163672002613
gamma = 0.3


T_0 = 2
num Z=0 = 2114
num V>T0|Z=0 = 944
P(V>T0|Z=0) = 0.44654683065279094
1 - gamma = 0.7


In [9]:
# 3. set Y=1[V>T0] for each Z=0, Y=1[>T1] for each Z=1
# 4. random flip Y=0 to Y=1 in examples where (Z=0,V=T0+1) or (Z=1,V=T1+1), until
# P(Y=1|Z=1) > gamma and P(Y=1|Z=0) > 1-gamma

filtered_df = assign_natural_Y(filtered_df, T_1=T_1, T_0=T_0,
                               gamma=gamma,
                               V_label='vote_count')

num_above3stars = (filtered_df['above3Stars']).sum()
num_above3stars_abvTH = (filtered_df[filtered_df['above3Stars']]['aboveVThreshold']).sum()
print(f'num Z=1 = {num_above3stars}')
print(f'num Y=1|Z=1 = {num_above3stars_abvTH}')
print(f'P(Y=1|Z=1) = {num_above3stars_abvTH/num_above3stars}')
print(f'gamma = {gamma}')

print('\n')

num_leq3stars = (~filtered_df['above3Stars']).sum()
num_leq3stars_abvTH = (filtered_df[~filtered_df['above3Stars']]['aboveVThreshold']).sum()
print(f'num Z=0 = {num_leq3stars}')
print(f'num Y=1|Z=0 = {num_leq3stars_abvTH}')
print(f'P(Y=1|Z=0) = {num_leq3stars_abvTH/num_leq3stars}')
print(f'1 - gamma = {1 - gamma}')

num Z=1 = 6122
num Y=1|Z=1 = 1837
P(Y=1|Z=1) = 0.3000653381247958
gamma = 0.3


num Z=0 = 2114
num Y=1|Z=0 = 1480
P(Y=1|Z=0) = 0.7000946073793756
1 - gamma = 0.7


## Step 4: Split data to train, val and test set

In [10]:
train_df, test_df = split_dataset(filtered_df, gamma=gamma,
                                  Y_label='aboveVThreshold', Z_label='above3Stars')

train_df, val_df = split_dataset(train_df, gamma=gamma,
                                 Y_label='aboveVThreshold', Z_label='above3Stars')

print("Train size:", train_df.shape[0], "Val size:", val_df.shape[0], "Test size:", test_df.shape[0])
N_Z1 = train_df['above3Stars'].sum()
N_Y1_Z1 = (train_df['above3Stars'] & (train_df['aboveVThreshold'])).sum()
P_Y1_Z1 = N_Y1_Z1 / N_Z1
N_Z0 = (~train_df['above3Stars']).sum()
N_Y0_Z0 = ((~train_df['above3Stars']) & (train_df['aboveVThreshold'])).sum()
P_Y0_Z0 = N_Y0_Z0 / N_Z0
print("N_Z1:", N_Z1,"N_Z0:",N_Z0)
print("N_Z1_Y1", N_Y1_Z1, "N_Z0_Y0",N_Y0_Z0)
print("P(Y1|Z1):",P_Y1_Z1,"P(Y0|Z0):", P_Y0_Z0)

Train size: 5271 Val size: 1318 Test size: 1647
N_Z1: 3919 N_Z1: 1352
N_Z1_Y1 1178 N_Z0_Y0 937
P(Y1|Z1): 0.3005868844092881 P(Y0|Z0): 0.6930473372781065


## Step 5: Save data to .npz format

In [11]:
train_df.head()

Unnamed: 0,original_sentence,flipped_sentence,star_count,vote_count,conversion_success,above3Stars,aboveVThreshold
64729,I was up in at Mount Shasta for a mushroom fes...,I was up in at Mount Shasta for a mushroom fes...,5.0,2,yes,True,False
72282,Daily use and pretty much goes with everything...,Daily use and pretty much goes with everything...,3.0,4,yes,False,True
4299,my son loved the jet pack but on of the straps...,my son unjustifiable the jet pack but on of th...,2.0,0,yes,False,False
75976,Beautiful watch. I have been looking for a wa...,Beautiful watch. I have been looking for a wat...,5.0,2,yes,True,False
70220,"This watch is attractive, it fits well, and it...","This watch is attractive, it fits well, and it...",4.0,10,yes,True,True


In [12]:
out_dir = './data'
header_name = 'header'
# train_ds_name = 'nat_train' # full dataset name
# val_ds_name = 'nat_val' # full dataset name
# test_ds_name = 'nat_test'
# train_ds_name = 'nat_train_large' # large dataset name
# val_ds_name = 'nat_val_large'
# test_ds_name = 'nat_test_large'
train_ds_name = 'nat_train_small' # small dataset name
val_ds_name = 'nat_val_small' # large dataset name
test_ds_name = 'nat_test_small'

save_df(train_df, os.path.join(out_dir, train_ds_name), text_keys=['original_sentence', 'flipped_sentence'])
save_df(val_df, os.path.join(out_dir, val_ds_name), text_keys=['original_sentence', 'flipped_sentence'])
save_df(test_df, os.path.join(out_dir, test_ds_name), text_keys=['original_sentence', 'flipped_sentence'])