# Dataset preparation - SameSentiment Amazon - Base

In [None]:
import pickle
from pathlib import Path

#import numpy as np
#import pandas as pd
from tqdm import tqdm
from transformers.trainer_utils import set_seed

# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

In [None]:
# Input and output locations used throughout the notebook
# (see readme.txt file for more details).

# Folder with the raw Amazon review dumps, plus the Kindle-only dump inside it.
fn_base = Path("data_raw") / "sentiment" / "amazon_v1"
fn_reviews_kindle = fn_base / "reviews_Kindle_Store_5.json.gz"

# Folder and file names for the pickled pair dataframes.
data_amazon_path = Path("data") / "sentiment" / "amazon_v1"
fn_amazon_kindle_df = data_amazon_path / "kindle_pairs_df.p"
fn_amazon_df = data_amazon_path / "pairs_df.p"

# Target folders for the train/dev/test TSV exports (regular and "rand" variant).
data_amazon_b_tdt_path = Path("data") / "sentiment" / "amazon-pair-b"
data_amazon_b_rand_tdt_path = Path("data") / "sentiment" / "amazon-pair-rand-b"

In [None]:
# Create the folder that holds the pickled pair dataframes (no error if it already exists).
data_amazon_path.mkdir(parents=True, exist_ok=True)

In [None]:
from data_prep import load_amazon_reviews
from data_prep import load_amazon_reviews_all

from data_prep_sentiment_amazon_v1 import make_inv_topic2id
from data_prep_sentiment_amazon_v1 import make_inv_id2topic

from data_prep import filter_min_review_freq
from data_prep import filter_both_good_bad

from data_prep import make_pairs_good_bad
#from data_prep import make_pairs_good_bad_over_business
from data_prep import make_pairs_negative
#from data_prep import make_pairs_negative_over_business

from data_prep import make_or_load_pairs
from data_prep import make_or_load_pairs_over_businesses

from data_prep import split_df
from data_prep import write_pair_tdt_tsv

Load raw data

In [None]:
# Load all reviews, then apply the filters one after another,
# printing the number of rows before and after each step.
print("Load reviews ...")
# Kindle-only alternative:
# df = load_amazon_reviews(fn_reviews_kindle)
# df["topic"] = "Kindle"
df = load_amazon_reviews_all(fn_base)
print(f"  got: {len(df)} reviews")

filter_steps = [
    (
        "Filter min reviews per id (asin) ...",
        lambda frame: filter_min_review_freq(frame, min_ratings=5),
    ),
    ("Filter both good/bad per id (asin) ...", filter_both_good_bad),
]
for message, apply_filter in filter_steps:
    print(message)
    print(f"  before: {len(df)}")
    df = apply_filter(df)
    print(f"  after:  {len(df)}")

In [None]:
# Use a single catch-all topic when the loaded frame has no "topic" column.
if "topic" not in df.columns:
    df["topic"] = "amazon"

# Build the id -> topic lookup (as the helper names suggest) and wrap every
# value in a one-element list.
topic2id = make_inv_topic2id(df)
id2topic = make_inv_id2topic(topic2id)
inv_bid_cats = {item_id: [topic] for item_id, topic in id2topic.items()}

In [None]:
# Display how often each value of the "goodness" column occurs after filtering.
df["goodness"].value_counts()

In [None]:
# inv_bid_cats = dict()

# Pair-sampling sizes: the negative count is twice the per-class count.
num_pairs_per_class = 2
num_pairs_negative = 2 * num_pairs_per_class

# Manual pair construction, kept for reference (disabled):
#pairs_good, pairs_bad = make_pairs_good_bad(df, inv_bid_cats, num_pairs_per_class=num_pairs_per_class)
#pairs_neg = make_pairs_negative(df, inv_bid_cats, num_pairs_negative, repeatable_on_side=False)
#print(f"#good: {len(pairs_good)}, #bad: {len(pairs_bad)}, #neg {len(pairs_neg)}")

In [None]:
# Fix the random seed before pairs are built (presumably so that pair
# sampling is reproducible - confirm in make_or_load_pairs).
set_seed(42)

# Build the pair dataframe, or load it from the given pickle path (going by the
# helper's name). The pair count comes from the `num_pairs_per_class` setting
# defined earlier instead of repeating the literal, so both stay in sync.
#df_traindev = make_or_load_pairs(df, inv_bid_cats, str(fn_amazon_kindle_df), num_pairs_per_class=num_pairs_per_class)
df_traindev = make_or_load_pairs(df, inv_bid_cats, str(fn_amazon_df), num_pairs_per_class=num_pairs_per_class)
# Same object under the name the following cells use.
traindev_df = df_traindev

---

Store test set?

In [None]:
# File that stores the traindev and test frames together.
# NOTE(review): this rebinds fn_amazon_df, which the pair-building cell passes
# to make_or_load_pairs. Re-running that cell after this one would hand it
# this path instead of "pairs_df.p".
fn_amazon_df = data_amazon_path / "df_traindev_test.p"

In [None]:
# Split traindev_df into a "traindev" and a "test" part (ratio=0.1, shuffled
# with a fixed random_state) and store both frames back to back in one file.
traindev_df, test_df = split_df(
    traindev_df,
    ratio=0.1,
    do_shuffle=True,
    random_state=42,
    name_train="traindev",
    name_dev="test",
)

with fn_amazon_df.open("wb") as pickle_file:
    for frame in (traindev_df, test_df):
        pickle.dump(frame, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

---

Write train/dev/test sets

In [None]:
# Read the two frames back in the order they were dumped: traindev first, then test.
with fn_amazon_df.open("rb") as pickle_file:
    traindev_df = pickle.load(pickle_file)
    test_df = pickle.load(pickle_file)

In [None]:
# Output folder for the TSV export; switch to the commented line for the "rand" variant.
root_path = data_amazon_b_tdt_path
#root_path = data_amazon_b_rand_tdt_path

In [None]:
# Write the TSV splits into root_path.
# NOTE(review): traindev_df already had a "test" part split off before it was
# pickled, and test_df is not passed here - confirm that the additional
# split_test=0.1 inside write_pair_tdt_tsv is intended.
write_pair_tdt_tsv(root_path, traindev_df, split_test=0.1, split_dev=0.3)

In [None]:
# symlink pred.tsv
! ln -s test.tsv {root_path}/pred.tsv

In [None]:
# Print the output folder and list its contents with human-readable sizes.
print("@ ", root_path, "\n")
! ls -lh {root_path}

---

In [None]:
! rm data/sentiment/amazon-pair-b/cached_*

In [None]:
# Model to fine-tune: keep exactly one line active.
#model_name = "bert-base-uncased"
#model_name = "bert-base-cased"
#model_name = "distilroberta-base"
model_name = "distilbert-base-cased"
#model_name = "albert-base-v2"

# Dataset folder name below ./data/sentiment/.
data_name = "amazon-pair-b"
#data_name = "amazon-pair-rand-b" ## over businesses

# Settings handed to the trainer command line.
seq_len = 256     # --max_seq_length
batch_size = 32   # per-device train and eval batch size
acc_steps = 64    # --gradient_accumulation_steps
num_epoch = 2     # --num_train_epochs
cuda_devs = "1"   # value for CUDA_VISIBLE_DEVICES

# Run identifier used for output and log folder names; "/" in a model name
# is replaced so the identifier stays a single path component.
model_tag = model_name.replace("/", "-")
run_name = f"{model_tag}-{data_name}_{seq_len}_{batch_size}-acc{acc_steps}_{num_epoch}"

In [None]:
# create folder for logging
! mkdir -p ./output_sent_logs/{run_name}

# Train, evaluate and test with trainer.py on the GPU(s) named in cuda_devs.
# stdout and stderr stay visible in the notebook and are also appended to
# log files through tee.
# NOTE(review): the `>(...)` process substitution needs a shell that supports
# it (e.g. bash) - confirm which shell the `!` magic uses here.
! \
    CUDA_VISIBLE_DEVICES={cuda_devs} \
    python trainer.py \
    --do_train --do_eval --do_test \
    --model_name_or_path {model_name} \
    --task_name same-b \
    --data_dir ./data/sentiment/{data_name} \
    --output_dir ./output_sent/{run_name} \
    --run_name {run_name} \
    --per_device_eval_batch_size {batch_size} \
    --per_device_train_batch_size {batch_size} \
    --gradient_accumulation_steps {acc_steps} \
    --logging_steps 5000 \
    --save_steps 10000 \
    --save_total_limit 3 \
    --num_train_epochs {num_epoch} \
    --max_seq_length {seq_len} \
    --evaluation_strategy epoch \
    > >(tee -a ./output_sent_logs/{run_name}/stdout.log) \
    2> >(tee -a ./output_sent_logs/{run_name}/stderr.log >&2)

# --overwrite_output_dir \
# --overwrite_cache \
# --eval_steps 128 \
# --evaluation_strategy steps \
# --load_best_model_at_end \

In [None]:
if False:
    # Disabled one-off repair for "_csv.Error: line contains NUL":
    # rewrite dev.tsv with every NUL character replaced by a space.
    from pathlib import Path
    dev_tsv = Path("data/sentiment/amazon-pair-b/dev.tsv")
    dev_tsv.write_text(dev_tsv.read_text().replace("\0", " "))

---

In [None]:
# evaluate senti
# Derive the names for evaluating the checkpoint trained above on another
# dataset: where to load the model from, the evaluation run name, and its
# log folder.

task_name = "yelp-pair-b"
eval_suffix = f"-senti-{task_name}"

# This cell overwrites run_name. On a re-run, run_name already carries the
# suffix, so strip it first; otherwise the suffix would stack up and
# load_name would point at a checkpoint folder that does not exist.
# Only a suffix for the current task_name is recognised: after changing
# task_name, re-run the configuration cell to reset run_name.
if run_name.endswith(eval_suffix):
    run_name_ = run_name[: -len(eval_suffix)]
else:
    run_name_ = run_name

load_name = f"./output_sent/{run_name_}"
run_name = f"{run_name_}{eval_suffix}"
log_dir = f"./output_sent_logs/{run_name}"

# create folder for logging
! mkdir -p {log_dir}

# Test-only run: load the checkpoint from load_name and run trainer.py on the
# dataset folder named by task_name; output is mirrored to log files via tee.
# NOTE(review): results go to ./output/ here, whereas the training cell wrote
# to ./output_sent/ - confirm this difference is intended.
! \
    CUDA_VISIBLE_DEVICES={cuda_devs} \
    python trainer.py \
    --do_test \
    --model_name_or_path {load_name} \
    --task_name same-b \
    --data_dir ./data/sentiment/{task_name} \
    --output_dir ./output/{run_name} \
    --overwrite_output_dir \
    --overwrite_cache \
    --run_name {run_name} \
    --max_seq_length {seq_len} \
    --per_device_eval_batch_size {batch_size} \
    --logging_steps 100 \
    > >(tee -a {log_dir}/stdout.log) \
    2> >(tee -a {log_dir}/stderr.log >&2)

# task_name = yelp-pair-b / yelp-pair-rand-b / amazon-pair-b
# --data_dir ./data/sentiment/{task_name} \
# task_name = cross / within / artificial
# --data_dir ./data/argmining_emnlp21/{task_name} \