# SameSentiment Yelp - Pair Evaluation

In [None]:
import pickle
from pathlib import Path

from tqdm import tqdm
from transformers.trainer_utils import set_seed

# Register tqdm's progress-bar integration with pandas for this kernel.
tqdm.pandas()

In [None]:
# Filesystem layout used throughout the notebook.
#
# Root of the raw Yelp dump (review.json, business.json, all_category_list.json):
#   - server (download + scp to server + extract): data/sentiment/yelp/
#   - local (active):                              data_raw/sentiment/yelp/
# Only the active location is assigned; swap the literal when running on the server.
data_yelp_path = Path("data_raw/sentiment/yelp/")

# ------------------------------------

# local? - output path (base) for sentiment review yelp pairs
data_yelp_b_tdt_path = Path("data/sentiment/yelp-pair-b/")
data_yelp_b_rand_tdt_path = Path("data/sentiment/yelp-pair-rand-b/")
# local - output path for simple sentiment reviews yelp
data_yelp_tdt_sentiment_5_path = Path("data/sentiment/yelp-sentiment-5/")
data_yelp_tdt_sentiment_b_path = Path("data/sentiment/yelp-sentiment-b/")

---

In [3]:
# "cached" subdirectory below the raw Yelp data root.
dn_yelp_cached = data_yelp_path.joinpath("cached")

In [4]:
#  #### Load categories & topics
from data_prep import load_reviews, load_topics

# ##### Filter categories
from data_prep import filter_min_cat_combis, make_map_cats, make_cat_combis

# ##### Filter reviews
from data_prep import filter_min_review_freq, filter_both_good_bad


# #### Load category tree
from data_prep import load_category_tree


# #### Cache root category reviews in dataframes
from data_prep import cache_root_category_businesses_df, load_cached_root_category_businesses_df


# #### Dataframe for training etc.
from data_prep import make_or_load_pairs


# #### Make train/dev/test splits
from data_prep import split_df, write_pair_df_tsv, write_pair_tdt_tsv


---

## Run

#### Load reviews

In [5]:
# Read the Yelp review dump into the working dataframe `df`.
fn_yelp_reviews = data_yelp_path.joinpath("review.json")
df = load_reviews(fn_yelp_reviews)

6685900it [00:47, 140619.36it/s]


#### Load categories for businesses

- business (id) with list of topics/categories
- lookups (business -> categories, category -> businesses)
- list of combinations (with amount)

In [None]:
# Business -> categories lookup, read from the business dump.
fn_yelp_topics = data_yelp_path.joinpath("business.json")

# Empty set handed to load_topics; presumably filled with ids of businesses
# that have no categories -- TODO confirm against data_prep.load_topics.
bids_not_cats = set()
inv_bid_cats = load_topics(fn_yelp_topics, bids_not_cats=bids_not_cats)

# Derived structures, both computed from the business -> categories lookup
# (statement order intentionally kept as in the original cell).
inv_cat_bids = make_map_cats(inv_bid_cats)
inv_cat_combis = make_cat_combis(inv_bid_cats)

#### Load category tree

- hierarchy of categories

In [7]:
# Category hierarchy file shipped alongside the raw Yelp dump.
fn_yelp_catgory_tree = data_yelp_path / "all_category_list.json"
# load_category_tree returns three objects; from their names they look like a
# category map, a name -> id lookup and the list of root categories
# (NOTE(review): confirm exact structures in data_prep.load_category_tree).
map_categories, map_cat_name2id, lst_root_categories = load_category_tree(fn_yelp_catgory_tree)

#### Pre-Cache all root category businesses (reviews)

In [8]:
# Called for its side effect only (the return value is discarded); presumably
# persists one dataframe per root category -- TODO confirm in data_prep.
cache_root_category_businesses_df(df, inv_cat_bids, map_categories, map_cat_name2id)

---

#### Make pairs

In [9]:
# Set to True to regenerate the pair dataframes from scratch. The rebuild
# deletes the previously pickled pair dataframe, re-samples pairs and
# overwrites "df_traindev_test.p", so it is disabled by default and the
# pickled splits are loaded instead.
REBUILD_PAIRS = False

if REBUILD_PAIRS:
    # Fixed seed so that pair sampling is reproducible.
    set_seed(42)

    fn_yelp_df = data_yelp_path / "df_traindev4_typed.p"
    # delete if it exists, else it will only be loaded ...
    if fn_yelp_df.exists():
        print(f"Remove prior dataframe: {fn_yelp_df}")
        fn_yelp_df.unlink()

    # Narrow the review dataframe before building pairs
    # (NOTE: this rebinds the notebook-level `df`).
    df = filter_min_review_freq(df, min_ratings=8)
    df = filter_both_good_bad(df)

    traindev_df = make_or_load_pairs(df, inv_cat_bids, str(fn_yelp_df), num_pairs_per_class=4)

    fn_yelp_df = data_yelp_path / "df_traindev_test.p"

    # store
    # Hold out a share (ratio=0.1) of the pairs as the test split.
    traindev_df, test_df = split_df(traindev_df, ratio=0.1, do_shuffle=True, random_state=42, name_train="traindev", name_dev="test")

    # Both frames go into the same file, one pickle after the other; readers
    # must call pickle.load twice in the same order (traindev, then test).
    with open(fn_yelp_df, "wb") as fp:
        pickle.dump(traindev_df, fp, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(test_df, fp, protocol=pickle.HIGHEST_PROTOCOL)

---

#### Load data

In [10]:
# Restore the train/dev and test pair dataframes from the pickle file.
fn_yelp_df = data_yelp_path.joinpath("df_traindev_test.p")

# The file holds two consecutive pickles: train/dev first, then test.
with fn_yelp_df.open("rb") as fp:
    traindev_df, test_df = pickle.load(fp), pickle.load(fp)

In [11]:
# NOTE(review): the label says "total" but the value is the size of the test
# split only -- confirm whether that is intended.
print(f"num samples total: {len(test_df)}")

# check how many pairs are per pairing
# Group by the scalar column name rather than a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples such as ('bad-bad',) as group keys, which would
# change the printed labels.
for split_label, frame in (("train/dev", traindev_df), ("test", test_df)):
    print(f"{split_label}:")
    for pairtype, df_grouped in frame.groupby("type"):
        print(f"- {pairtype}: {len(df_grouped)}")

num samples total: 70376
train/dev:
- bad-bad: 158255
- bad-good: 158356
- good-bad: 158236
- good-good: 158537
test:
- bad-bad: 17685
- bad-good: 17584
- good-bad: 17704
- good-good: 17403


## Run test evaluation

In [14]:
# ---- evaluation run configuration ----
# Model (alternatives kept commented out for quick switching).
model_name = "bert-base-uncased"
#model_name = "bert-base-cased"
#model_name = "distilroberta-base"

# Dataset variant.
data_name = "yelp-pair-b"
#data_name = "yelp-pair-rand-b" ## over businesses

# Numeric settings; all of them are encoded into run_name below.
seq_len, batch_size = 256, 16
acc_steps, num_epoch = 64, 3
# Value exported as CUDA_VISIBLE_DEVICES for the trainer subprocess.
cuda_devs = "0"

# Identifier of the run; also the directory name under ./output_sent/.
run_name = f"{model_name}-{data_name}_{seq_len}_{batch_size}-acc{acc_steps}_{num_epoch}"

#! mkdir ./output_sent/{run_name}

In [15]:
for pairtype, df_grouped in test_df.groupby(["type"]):
    print(f"Eval {pairtype}: {len(df_grouped)}")

    fn_data_path = Path(f"data/sentiment/{run_name}/{pairtype}")
    fn_data_path.mkdir(parents=True, exist_ok=True)

    write_pair_df_tsv(df_grouped, fn_data_path / "test.tsv", "test")
    
    ! CUDA_VISIBLE_DEVICES={cuda_devs} \
        python trainer.py \
        --do_test \
        --model_name_or_path ./output_sent/{run_name} \
        --task_name same-b \
        --data_dir {fn_data_path} \
        --output_dir ./output_sent/{run_name}/pairtype/{pairtype} \
        --run_name {run_name}-{pairtype} \
        --per_device_eval_batch_size {batch_size} \
        --max_seq_length {seq_len}

    print()

test: 100%|██████████| 17685/17685 [00:00<00:00, 303879.04it/s]

Eval bad-bad: 17685





[INFO|trainer.py:204] 02/01/2021 22:07:48 >> Training/evaluation parameters MyTrainingArguments(output_dir='./output_sent/bert-base-uncased-yelp-pair-b_256_16-acc64_3/pairtype/bad-bad', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Feb01_22-07-48_cuda2', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_

test:   0%|          | 0/17584 [00:00<?, ?it/s]


Eval bad-good: 17584


test: 100%|██████████| 17584/17584 [00:00<00:00, 172089.27it/s]


[INFO|trainer.py:204] 02/01/2021 22:09:38 >> Training/evaluation parameters MyTrainingArguments(output_dir='./output_sent/bert-base-uncased-yelp-pair-b_256_16-acc64_3/pairtype/bad-good', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Feb01_22-09-38_cuda2', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader

test:   0%|          | 0/17704 [00:00<?, ?it/s]


Eval good-bad: 17704


test: 100%|██████████| 17704/17704 [00:00<00:00, 163155.44it/s]


[INFO|trainer.py:204] 02/01/2021 22:11:29 >> Training/evaluation parameters MyTrainingArguments(output_dir='./output_sent/bert-base-uncased-yelp-pair-b_256_16-acc64_3/pairtype/good-bad', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Feb01_22-11-29_cuda2', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader

test:   0%|          | 0/17403 [00:00<?, ?it/s]


Eval good-good: 17403


test: 100%|██████████| 17403/17403 [00:00<00:00, 176131.92it/s]


[INFO|trainer.py:204] 02/01/2021 22:13:20 >> Training/evaluation parameters MyTrainingArguments(output_dir='./output_sent/bert-base-uncased-yelp-pair-b_256_16-acc64_3/pairtype/good-good', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Feb01_22-13-20_cuda2', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloade