# Dataset preparation & Experiments - SameSentiment Yelp

In [None]:
import pickle
from pathlib import Path

In [None]:
# Location of the raw Yelp dump (downloaded, copied to the server with scp,
# then extracted).
#
# The original cell assigned `data_yelp_path` twice in a row, so the first
# value ("data/sentiment/yelp/", the path noted next to the server steps) was
# always overwritten. Only the effective assignment is kept; switch the literal
# below if the dump lives under the other directory.
# NOTE(review): the original comment marked this second path as "local?" —
# confirm which environment it belongs to.
data_yelp_path = Path("data_raw/sentiment/yelp/")

# Output directories for the single-review sentiment datasets:
# one for the 5-class variant, one for the binary variant.
data_yelp_tdt_sentiment_5_path = Path("data/sentiment/yelp-sentiment-5/")
data_yelp_tdt_sentiment_b_path = Path("data/sentiment/yelp-sentiment-b/")

---

In [None]:
#  #### Load categories & topics
from data_prep import load_reviews, load_topics

# ##### Filter categories
from data_prep import filter_min_cat_combis, make_map_cats, make_cat_combis

# ##### Filter reviews
from data_prep import filter_min_review_freq, filter_both_good_bad

# ##### Filter businesses
from data_prep import filter_by_businesses, filter_by_businesses_not_same

# #### Load category tree
from data_prep import load_category_tree
from data_prep import get_root_category_items, get_children_category_item_list
from data_prep import get_businesses_in_category, get_businesses_in_category_branch


# #### Cache root category reviews in dataframes
from data_prep import cache_root_category_businesses_df, load_cached_root_category_businesses_df


# #### Make train/dev/test splits
from data_prep import split_df, write_single_2_df_tsv, write_single_5_df_tsv, write_single_tdt_tsv

---

In [None]:
# Settings for the trainer.py run launched by the following cell.
model_name, data_name = "distilroberta-base", "yelp-pair-b"
seq_len, batch_size, num_epoch = 512, 16, 3
# Kept as a string: it is interpolated verbatim into CUDA_VISIBLE_DEVICES.
cuda_devs = "0"

# Identifier shared by the output directory and the run name:
# <model>-<data>_<seq_len>_<batch_size>_<num_epoch>
run_name = "{}-{}_{}_{}_{}".format(
    model_name, data_name, seq_len, batch_size, num_epoch
)

In [None]:
# Launch trainer.py through the shell for the `same-b` task with the
# train/eval/test flags set. The {name} placeholders are filled in by IPython
# from the Python variables defined in the previous cell (model_name,
# data_name, run_name, batch_size, num_epoch, seq_len, cuda_devs).
# Note: --overwrite_output_dir and --overwrite_cache are passed, so earlier
# results under ./output/{run_name} are presumably replaced — confirm against
# trainer.py before re-running.
! CUDA_VISIBLE_DEVICES={cuda_devs} \
    python trainer.py \
    --do_train --do_eval --do_test \
    --model_name_or_path {model_name} \
    --task_name same-b \
    --data_dir ./data/sentiment/{data_name} \
    --output_dir ./output/{run_name} \
    --run_name {run_name} \
    --per_device_eval_batch_size {batch_size} \
    --per_device_train_batch_size {batch_size} \
    --logging_steps 5000 \
    --save_steps 5000 \
    --num_train_epochs {num_epoch} \
    --max_seq_length {seq_len} \
    --evaluation_strategy epoch \
    --overwrite_output_dir \
    --overwrite_cache

---

## Write out single sentiment reviews

In [None]:
# Load the Yelp reviews file from the raw data directory into `df`
# (presumably a DataFrame — verify against data_prep.load_reviews).
fn_yelp_reviews = data_yelp_path.joinpath("review.json")
df = load_reviews(fn_yelp_reviews)

#### 5-class (1-5 stars)

In [None]:
# Write the splits for the 5-class variant into its output directory.
# binary=False presumably keeps the 1-5 star labels (see section heading) —
# confirm in data_prep.write_single_tdt_tsv.
root_path = data_yelp_tdt_sentiment_5_path

write_single_tdt_tsv(
    root_path,
    df,
    split_test=0.1,
    split_dev=0.3,
    do_shuffle=True,
    random_state=42,  # fixed seed so the shuffle is repeatable
    binary=False,
)

#### binary (1 / 0)

In [None]:
# Write the splits for the binary variant into its output directory.
# Same split fractions and seed as the 5-class cell; only `binary` differs.
root_path = data_yelp_tdt_sentiment_b_path

write_single_tdt_tsv(
    root_path,
    df,
    split_test=0.1,
    split_dev=0.3,
    do_shuffle=True,
    random_state=42,  # fixed seed so the shuffle is repeatable
    binary=True,
)

In [None]:
# Show which output directory is being inspected, then list its contents.
# `root_path` is whatever the most recently executed cell assigned; when the
# notebook is run top to bottom that is the binary dataset directory.
print("@ ", root_path, "\n")
! ls -lh {root_path}

#### Run training and evaluation

Task: 5-class

```bash
python trainer.py --do_train --do_eval --model_name_or_path bert-base-uncased --task_name sent-5 --data_dir ./data/sentiment/yelp-sentiment-5 --output_dir ./output/yelp-sentiment-5 --run_name yelp-sentiment-5 --logging_steps 10000 --num_train_epochs 2 --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --max_seq_length 128

# eval
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_eval --model_name_or_path ./output/yelp-sentiment-5/checkpoint-10000/ --task_name sent-5 --data_dir ./data/sentiment/yelp-sentiment-5 --output_dir ./output/yelp-sentiment-5 --run_name yelp-sentiment-5 --logging_steps 10000 --num_train_epochs 2 --per_device_train_batch_size 32 --per_device_eval_batch_size 64 --max_seq_length 128
```

Task: binary (2-class)

```bash
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_train --do_eval --model_name_or_path bert-base-uncased --task_name sent-b --data_dir ./data/sentiment/yelp-sentiment-b --output_dir ./output/yelp-sentiment-b --run_name yelp-sentiment-b --logging_steps 10000 --num_train_epochs 1 --per_device_train_batch_size 128 --per_device_eval_batch_size 128 --max_seq_length 128 --save_steps 2000 --max_steps 10000
```

Task: regression

```bash
# eval
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_eval --model_name_or_path ./output/yelp-sentiment-r/ --task_name sent-r --data_dir ./data/sentiment/yelp-sentiment-b --output_dir ./output/yelp-sentiment-r --run_name yelp-sentiment-r --per_device_eval_batch_size 128 --max_seq_length 128
```

Sequence length: 512

```bash
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_train --do_eval --model_name_or_path bert-base-uncased --task_name sent-b --data_dir ./data/sentiment/yelp-sentiment-b --output_dir ./output/yelp-sentiment-b_512_16_1 --run_name yelp-sentiment-b_512_16_1 --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --max_seq_length 512 --max_steps 10000 --save_steps 2000 --logging_steps 2000 --logging_first_step
```

---

Model: **distilroberta-base**  
Task: binary (2-class)

```bash
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_train --do_eval --model_name_or_path distilroberta-base --task_name sent-b --data_dir ./data/sentiment/yelp-sentiment-b --output_dir ./output/distilroberta-base-yelp-sentiment-b --run_name distilroberta-base-yelp-sentiment-b --logging_steps 10000 --num_train_epochs 1 --per_device_train_batch_size 64 --per_device_eval_batch_size 64 --max_seq_length 128 --save_steps 2000 --max_steps 10000
```

```bash
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_predict --model_name_or_path distilroberta-base --task_name sent-b --data_dir ./data/sentiment/yelp-sentiment-b --output_dir ./output/distilroberta-base-yelp-sentiment-b --run_name distilroberta-base-yelp-sentiment-b --logging_steps 10000 --num_train_epochs 1 --per_device_train_batch_size 64 --per_device_eval_batch_size 64 --max_seq_length 128 --save_steps 2000 --max_steps 10000
```

In [None]:
# Run trainer.py on GPU 0 with only --do_test for the binary sentiment task.
# NOTE(review): --model_name_or_path is the hub name `distilroberta-base`, not
# a checkpoint under --output_dir. Confirm that trainer.py loads the fine-tuned
# weights from the output directory; otherwise this tests the untuned model.
# The training-related flags (--num_train_epochs, --max_steps, --save_steps)
# are carried over from the training command and presumably have no effect
# without --do_train.
! CUDA_VISIBLE_DEVICES=0 python trainer.py --do_test --model_name_or_path distilroberta-base --task_name sent-b --data_dir ./data/sentiment/yelp-sentiment-b --output_dir ./output/distilroberta-base-yelp-sentiment-b --run_name distilroberta-base-yelp-sentiment-b --logging_steps 10000 --num_train_epochs 1 --per_device_train_batch_size 64 --per_device_eval_batch_size 64 --max_seq_length 128 --save_steps 2000 --max_steps 10000