# Dataset preparation - SameSentiment Yelp - Base

In [1]:
import pickle
from pathlib import Path

from tqdm import tqdm
from transformers.trainer_utils import set_seed

# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
# NOTE(review): no cell shown here calls them directly — presumably the data_prep helpers do; confirm.
tqdm.pandas()

In [2]:
# Raw Yelp data: download, scp to the server and extract it before running.
# Exactly one location is active; the other is kept as a commented alternative.
#data_yelp_path = Path("data/sentiment/yelp/")      # server
data_yelp_path = Path("data_raw/sentiment/yelp/")   # local

# All generated datasets live below one shared output root.
_sentiment_out = Path("data/sentiment")

# Output folders (base variant) for the sentiment review pairs.
data_yelp_b_tdt_path = _sentiment_out / "yelp-pair-b"
data_yelp_b_rand_tdt_path = _sentiment_out / "yelp-pair-rand-b"

# Output folders for the plain (non-paired) sentiment reviews.
data_yelp_tdt_sentiment_5_path = _sentiment_out / "yelp-sentiment-5"
data_yelp_tdt_sentiment_b_path = _sentiment_out / "yelp-sentiment-b"

---

In [3]:
# Cache folder inside the raw Yelp data directory.
# NOTE(review): not referenced by any later cell shown here — presumably meant for the
# cached root-category helpers imported from data_prep; confirm.
dn_yelp_cached = data_yelp_path / "cached"

In [4]:
#  #### Load categories & topics
from data_prep import load_reviews, load_topics

# ##### Filter categories
from data_prep import filter_min_cat_combis, make_map_cats, make_cat_combis

# ##### Filter reviews
from data_prep import filter_min_review_freq, filter_both_good_bad

# ##### Filter businesses
from data_prep import filter_by_businesses, filter_by_businesses_not_same

# #### Load category tree
from data_prep import load_category_tree
from data_prep import get_root_category_items, get_children_category_item_list
from data_prep import get_businesses_in_category, get_businesses_in_category_branch


# #### Cache root category reviews in dataframes
from data_prep import cache_root_category_businesses_df, load_cached_root_category_businesses_df


# #### Positive + negative same-sentiment pairs
from data_prep import make_pairs_good_bad
from data_prep import make_pairs_good_bad_over_business

# #### Not same-sentiment pairs (combinations positive + negative)
from data_prep import make_pairs_negative
from data_prep import make_pairs_negative_over_business

# #### Dataframe for training etc.
from data_prep import make_or_load_pairs
from data_prep import make_or_load_pairs_over_businesses


# #### Make train/dev/test splits
from data_prep import split_df, write_pair_df_tsv, write_pair_tdt_tsv


---

In [5]:
# Pairs drawn per same-sentiment class: N positive pairs + N negative pairs.
# The mixed class (positive + negative, i.e. not same-sentiment) then gets 2N pairs.
num_pairs_per_class = 2

#: number of not same-sentiment pairs: 2N, matching the N + N same-sentiment pairs
num_pairs_negative = num_pairs_per_class * 2

#: whether the same review may occur more than once on a single side (good or bad);
#: if enabled, it may be necessary to check afterwards that the same pairing
#: did not happen twice by chance
repeatable_on_side = False

---

## Write out training data

### Split Test-Data

In [6]:
# Source frame for the split: the typed traindev pickle inside the raw Yelp folder.
# NOTE: unpickling executes code stored in the file — only load pickles produced by this project.
fn_yelp_df = data_yelp_path / "df_traindev4_typed.p"

with fn_yelp_df.open("rb") as stream:
    traindev_df = pickle.load(stream)

In [7]:
# From here on fn_yelp_df names the pickle that stores the traindev/test split
# (until now it pointed at the typed source frame).
fn_yelp_df = data_yelp_path / "df_traindev_test.p"
# Inactive alternative: the split built over businesses.
#fn_yelp_df = data_yelp_path / "df_traindev_test_over_business.p"
#traindev_df = df_traindev

In [8]:
# Hold out 10% as the test frame (shuffled with a fixed seed) and persist both parts.
# Careful: traindev_df is rebound to the reduced frame, so running this cell a second
# time splits the already reduced frame again and overwrites the stored split.
traindev_df, test_df = split_df(
    traindev_df,
    ratio=0.1,
    do_shuffle=True,
    random_state=42,
    name_train="traindev",
    name_dev="test",
)

# Both frames go into one file, one pickle after the other (traindev first, then test).
with fn_yelp_df.open("wb") as stream:
    for frame in (traindev_df, test_df):
        pickle.dump(frame, stream, protocol=pickle.HIGHEST_PROTOCOL)

traindev: [0:673099], test: [673099:747888], ratio: 0.1


---

In [9]:
# Restore the stored split; the read order must mirror the write order
# (traindev frame first, test frame second).
with fn_yelp_df.open("rb") as stream:
    traindev_df = pickle.load(stream)
    test_df = pickle.load(stream)

In [10]:
# Output folder for the train/dev/test TSV files written below; switch the
# active line to target the alternative pair folder instead.
root_path = data_yelp_b_tdt_path
#root_path = data_yelp_b_rand_tdt_path

In [11]:
# Write the TSV splits into root_path. Per the recorded run, split_test=0.1 first carves a
# test part off traindev_df and split_dev=0.3 then divides the remainder into train and dev.
# test_df from the earlier split is not passed in, so it is not part of these files.
write_pair_tdt_tsv(root_path, traindev_df, split_test=0.1, split_dev=0.3)

traindev: [0:605789], test: [605789:673099], ratio: 0.1
train: [0:424052], dev: [424052:605789], ratio: 0.3


train: 100%|██████████| 424052/424052 [00:01<00:00, 215631.68it/s]
dev: 100%|██████████| 181737/181737 [00:00<00:00, 216654.97it/s]
test: 100%|██████████| 67310/67310 [00:00<00:00, 210823.98it/s]


In [12]:
# symlink pred.tsv
! ln -s test.tsv {root_path}/pred.tsv

ln: failed to create symbolic link 'data/sentiment/yelp-pair-b/pred.tsv': File exists


In [13]:
print("@ ", root_path, "\n")
! ls -lh {root_path}

@  data/sentiment/yelp-pair-b 

total 774M
-rw-r--r-- 1 jupyter-dunphya20 jupyter-dunphya20 209M Apr  3 16:05 dev.tsv
lrwxrwxrwx 1 jupyter-dunphya20 jupyter-dunphya20    8 Apr  3 16:04 pred.tsv -> test.tsv
-rw-r--r-- 1 jupyter-dunphya20 jupyter-dunphya20  78M Apr  3 16:05 test.tsv
-rw-r--r-- 1 jupyter-dunphya20 jupyter-dunphya20 487M Apr  3 16:05 train.tsv


In [14]:
# Pretrained checkpoint to fine-tune — keep exactly one line active.
#model_name = "bert-base-uncased"
#model_name = "bert-base-cased"
model_name = "distilroberta-base"
#model_name = "distilbert-base-cased"
#model_name = "albert-base-v2"

# Pair dataset folder (used below ./data/sentiment/ by the training command).
data_name = "yelp-pair-b"
#data_name = "yelp-pair-rand-b" ## over businesses

# Training settings handed to trainer.py.
seq_len = 256     # --max_seq_length
batch_size = 16   # per-device train and eval batch size
acc_steps = 64    # --gradient_accumulation_steps
num_epoch = 3     # --num_train_epochs
cuda_devs = "1"   # value for CUDA_VISIBLE_DEVICES

# Run identifier shared by the output and log folders; "/" in a model name is
# replaced so the identifier stays a single path component.
model_slug = model_name.replace("/", "-")
run_name = "_".join(
    [
        f"{model_slug}-{data_name}",
        str(seq_len),
        f"{batch_size}-acc{acc_steps}",
        str(num_epoch),
    ]
)

In [None]:
# create folder for logging
! mkdir -p ./output_sent_logs/{run_name}

# Fine-tune, evaluate and test in a single trainer.py invocation. IPython fills the
# curly-brace placeholders from the notebook variables before the shell runs the command.
# stdout and stderr are each appended (tee -a) to their own log file in the folder
# created above while still appearing in the cell output.
# NOTE(review): the `>(...)` process substitution needs a bash-compatible shell — confirm
# which shell the kernel uses for `!` commands.
! \
    MLFLOW_EXPERIMENT_NAME=same-sentiment \
    CUDA_VISIBLE_DEVICES={cuda_devs} \
    python trainer.py \
    --do_train --do_eval --do_test \
    --model_name_or_path {model_name} \
    --task_name same-b \
    --data_dir ./data/sentiment/{data_name} \
    --output_dir ./output_sent/{run_name} \
    --run_name {run_name} \
    --per_device_eval_batch_size {batch_size} \
    --per_device_train_batch_size {batch_size} \
    --gradient_accumulation_steps {acc_steps} \
    --logging_steps 5000 \
    --save_steps 10000 \
    --save_total_limit 3 \
    --num_train_epochs {num_epoch} \
    --max_seq_length {seq_len} \
    --evaluation_strategy epoch \
    > >(tee -a ./output_sent_logs/{run_name}/stdout.log) \
    2> >(tee -a ./output_sent_logs/{run_name}/stderr.log >&2)

# Currently disabled options (command-line flags plus one environment variable);
# splice a line into the command above to enable it.
# --overwrite_output_dir \
# --overwrite_cache \
# --eval_steps 128 \
# --evaluation_strategy steps \
# --load_best_model_at_end \
# HF_MLFLOW_LOG_ARTIFACTS=TRUE \

[INFO|trainer.py:185] 04/03/2022 16:06:06 >> Training/evaluation parameters MyTrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_test=True,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=64,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,

- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[INFO|datasets.py:162] 04/03/2022 16:06:10 >> Creating features from dataset file at ./data/sentiment/yelp-pair-b


Train (bert-base-uncased, sequence length 128, batch size 32, 3 epochs):

```bash
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_train --do_eval --model_name_or_path bert-base-uncased --task_name same-b --data_dir ./data/sentiment/yelp-pair-b --output_dir ./output/yelp-pair-b_128_32_3 --run_name yelp-pair-b_128_32_3 --per_device_eval_batch_size 32 --per_device_train_batch_size 32 --logging_steps 10000 --save_steps 2000 --num_train_epochs 3 --max_seq_length 128
```

Test (same run directory, `--do_test` only):

```bash
CUDA_VISIBLE_DEVICES=1 python trainer.py --do_test --model_name_or_path bert-base-uncased --task_name same-b --data_dir ./data/sentiment/yelp-pair-b --output_dir ./output/yelp-pair-b_128_32_3 --run_name yelp-pair-b_128_32_3 --per_device_eval_batch_size 32 --logging_steps 10000 --max_seq_length 128
```

In [None]:
# Evaluate and test only (no --do_train) for the bert-base-uncased run directory, on GPU 0.
# NOTE(review): --model_name_or_path names the stock bert-base-uncased weights, not a checkpoint
# under --output_dir; whether trainer.py loads the fine-tuned weights from there is not visible here — confirm.
! CUDA_VISIBLE_DEVICES=0 python trainer.py --do_eval --do_test --model_name_or_path bert-base-uncased --task_name same-b --data_dir ./data/sentiment/yelp-pair-b --output_dir ./output/yelp-pair-b_128_32_3 --run_name yelp-pair-b_128_32_3 --per_device_eval_batch_size 32 --logging_steps 10000 --max_seq_length 128

Model: **distilroberta-base**

```bash
CUDA_VISIBLE_DEVICES=0 python trainer.py --do_train --do_eval --do_test --model_name_or_path distilroberta-base --task_name same-b --data_dir ./data/sentiment/yelp-pair-b --output_dir ./output/distilroberta-base-yelp-pair-b_128_32_3 --run_name distilroberta-base-yelp-pair-b_128_32_3 --per_device_eval_batch_size 32 --per_device_train_batch_size 32 --logging_steps 10000 --save_steps 2000 --num_train_epochs 3 --max_seq_length 128
```