# SameSentiment Yelp - Cross Evaluation

In [None]:
import pickle
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from utils import Timer

# Enable tqdm's pandas integration (registers the `progress_apply` family of methods).
tqdm.pandas()

In [None]:
# Raw Yelp dataset location (download + scp to server + extract).
_sentiment_dir = Path("data") / "sentiment"
data_yelp_path = _sentiment_dir / "yelp"

# ------------------------------------

# Local setup (presumably): this assignment runs last and therefore wins;
# comment it out to fall back to the location above.
data_yelp_path = Path("data_raw") / "sentiment" / "yelp"

# Base output directories for the same-sentiment Yelp review pairs.
data_yelp_b_tdt_path = _sentiment_dir / "yelp-pair-b"
data_yelp_b_rand_tdt_path = _sentiment_dir / "yelp-pair-rand-b"
# Output directories for the plain (non-paired) Yelp sentiment reviews.
data_yelp_tdt_sentiment_5_path = _sentiment_dir / "yelp-sentiment-5"
data_yelp_tdt_sentiment_b_path = _sentiment_dir / "yelp-sentiment-b"

---

In [None]:
# "cached" sub-directory below the Yelp data directory.
dn_yelp_cached = data_yelp_path.joinpath("cached")

In [None]:
#  #### Load categories & topics
from data_prep import load_reviews, load_topics

# ##### Filter categories
from data_prep import filter_min_cat_combis, make_map_cats, make_cat_combis

# ##### Filter reviews
from data_prep import filter_min_review_freq, filter_both_good_bad

# ##### Filter businesses
from data_prep import filter_by_businesses, filter_by_businesses_not_same

# #### Load category tree
from data_prep import load_category_tree
from data_prep import get_root_category_items, get_children_category_item_list
from data_prep import get_businesses_in_category, get_businesses_in_category_branch


# #### Cache root category reviews in dataframes
from data_prep import cache_root_category_businesses_df, load_cached_root_category_businesses_df


# #### Positive + negative same-sentiment pairs
from data_prep import make_pairs_good_bad

# #### Not same-sentiment pairs (combinations positive + negative)
from data_prep import make_pairs_negative

# #### Dataframe for training etc.
from data_prep import make_or_load_pairs


# #### Make train/dev/test splits
from data_prep import split_df, write_pair_df_tsv, write_pair_tdt_tsv


# ### Make cross eval splits
from data_prep import build_category_business_lookup
from data_prep import filter_category_business_lookup_no_overlap

# #### Filter non-overlapping from pairs
from data_prep import df_add_business_id
from data_prep import filter_overlapping_businesses

# #### Manually split into N shards for cross-validation
from data_prep import make_group_split
from data_prep import make_cross_eval_dfs

---

In [None]:
# N positive + N negative
# --> 2N pos+neg (not same-sentiment)
#: N, the number of same-sentiment pairs per class (positive / negative)
#: NOTE: the pair-making cells further down pass literal values (2 and 4) instead of this name
num_pairs_per_class: int = 2

#: number of not-same-sentiment pairs (2N), i.e. as many as the positive plus
#: negative same-sentiment pairs together
num_pairs_negative: int = 2 * num_pairs_per_class

#: whether for a single side (good or bad) the same review may occur multiple times;
#: may need to check afterwards that the same pairing does not happen by chance ...
repeatable_on_side: bool = False

---

## Run

#### Load reviews

In [None]:
# Read the Yelp review dump into the working object `df`.
fn_yelp_reviews = data_yelp_path.joinpath("review.json")
df = load_reviews(fn_yelp_reviews)

#### Load categories for businesses

- business (id) with list of topics/categories
- lookups (business -> categories, category -> businesses)
- list of category combinations (with their counts)

In [None]:
# Business metadata: business -> categories mapping plus the derived lookups.
fn_yelp_topics = data_yelp_path.joinpath("business.json")
# Handed to load_topics as a keyword argument; presumably filled with the ids
# of businesses that have no categories -- TODO confirm against data_prep.
bids_not_cats = set()
inv_bid_cats = load_topics(fn_yelp_topics, bids_not_cats=bids_not_cats)

# Inverse lookup: category -> businesses.
inv_cat_bids = make_map_cats(inv_bid_cats)

# Category combinations derived from the business -> categories mapping.
inv_cat_combis = make_cat_combis(inv_bid_cats)

#### Load category tree

- hierarchy of categories

In [None]:
# Load the category hierarchy; the loader yields three objects that are
# unpacked into the category map, the name -> id map and the root categories.
fn_yelp_catgory_tree = data_yelp_path.joinpath("all_category_list.json")
(
    map_categories,
    map_cat_name2id,
    lst_root_categories,
) = load_category_tree(fn_yelp_catgory_tree)

#### Pre-Cache all root category businesses (reviews)

In [None]:
# Pre-compute the per-root-category review data so later steps can load it
# from cache (cache location is decided inside data_prep -- TODO confirm).
cache_root_category_businesses_df(df, inv_cat_bids, map_categories, map_cat_name2id)

---

## Stats

In [None]:
# Dataset size overview.
# businesses: number of keys in the business -> categories mapping
n_businesses_total = len(inv_bid_cats.keys())
print(f"Number of businesses total: {n_businesses_total}")
# reviews: count of the `rid` column
n_reviews_total = df.rid.count()
print(f"Number of reviews total: {n_reviews_total}")

---

## Make train pairs (normal)

In [None]:
# Standard train/dev pairs: 2 pairs per class, stored at / loaded from `fn_yelp_df`.
fn_yelp_df = data_yelp_path.joinpath("df_traindev.p")

# NOTE: `df` itself is overwritten with the filtered reviews, so every later
# cell that reads `df` sees the filtered data.
df = filter_both_good_bad(filter_min_review_freq(df, min_ratings=5))

df_traindev = make_or_load_pairs(
    df,
    str(fn_yelp_df),
    num_pairs_per_class=2,
)

### Make train pairs (double, typed)

In [None]:
# Larger variant: 4 pairs per class, stricter minimum of 8 ratings.
fn_yelp_df = data_yelp_path.joinpath("df_traindev4_typed.p")

# NOTE: operates on the current `df`, which earlier filter cells may already
# have narrowed down; `df` is overwritten again here.
df = filter_both_good_bad(filter_min_review_freq(df, min_ratings=8))

df_traindev = make_or_load_pairs(
    df,
    str(fn_yelp_df),
    num_pairs_per_class=4,
)

---

## Write out training data

### Split Test-Data

In [None]:
# Load the pickled pair dataframe (one object in the file).
# pickle is only safe for files produced by this notebook itself.
fn_yelp_df = data_yelp_path.joinpath("df_traindev4_typed.p")

with fn_yelp_df.open("rb") as fp:
    traindev_df = pickle.load(fp)

In [None]:
# Target file holding both the traindev and the test split.
fn_yelp_df = data_yelp_path.joinpath("df_traindev_test.p")

In [None]:
# Split off 10 % as test data and store both parts in one pickle file.
# NOTE: `traindev_df` is replaced by the reduced split, so re-running this
# cell without reloading splits the already reduced data again.
traindev_df, test_df = split_df(
    traindev_df,
    ratio=0.1,
    do_shuffle=True,
    random_state=42,
    name_train="traindev",
    name_dev="test",
)

# Objects are written back to back; read them with successive pickle.load calls.
with fn_yelp_df.open("wb") as fp:
    for frame in (traindev_df, test_df):
        pickle.dump(frame, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read both splits back, in the order they were dumped (traindev first, then test).
with fn_yelp_df.open("rb") as fp:
    traindev_df, test_df = pickle.load(fp), pickle.load(fp)

---

In [None]:
# Disabled helper cell: change the condition to True to reload a previously
# pickled pair dataframe into `traindev_df`.
if False:
    fn_yelp_df = data_yelp_path / "df_traindev.p"
    # the second assignment overrides the first; keep only the file that is wanted
    fn_yelp_df = data_yelp_path / "df_traindev4_typed.p"

    assert fn_yelp_df.exists()

    # see make_or_load_pairs
    with open(fn_yelp_df, "rb") as fp:
        traindev_df = pickle.load(fp)

    # bare expression: evaluated but not displayed, because it is nested inside the `if`
    len(traindev_df)

In [None]:
# Write the pair data as TSV files below the pair output directory.
# NOTE(review): a further 10 % test share is requested here via `split_test`;
# if `traindev_df` comes from the split cell above, a test set was already
# taken out of it -- confirm this is intended.
root_path = data_yelp_b_tdt_path

write_pair_tdt_tsv(
    root_path,
    traindev_df,
    split_dev=0.3,
    split_test=0.1,
)

### Make cross eval splits

In [None]:
# Reload the traindev/test splits for the cross evaluation
# (two pickled objects in one file: traindev first, then test).
fn_yelp_df = data_yelp_path.joinpath("df_traindev_test.p")

with fn_yelp_df.open("rb") as fp:
    traindev_df, test_df = pickle.load(fp), pickle.load(fp)

In [None]:
# Root category -> businesses lookup; its keys are (title, alias) tuples, as
# unpacked by the statistics cell that follows.
lookup_rootcat_bid = build_category_business_lookup(map_categories, inv_cat_bids, map_cat_name2id)
# Same lookup after removing overlaps (presumably businesses that appear under
# more than one root category -- TODO confirm against data_prep).
lookup_rootcat_bid_no_overlap = filter_category_business_lookup_no_overlap(lookup_rootcat_bid)

In [None]:
# Totals over all root categories, before and after removing overlaps.
n_bids_overlapping = sum(map(len, lookup_rootcat_bid.values()))
print(f"Number businesses all (overlapping): {n_bids_overlapping}")
n_bids_no_overlap = sum(map(len, lookup_rootcat_bid_no_overlap.values()))
print(f"Number businesses all (no overlap): {n_bids_no_overlap}")

# One table row per root category. Both lookups are paired by position, so
# this relies on them sharing the same key order.
category_rows = [
    {
        "category": title,
        "alias": alias,
        "businesses": len(businesses),
        "business (no overlap)": len(business_no_overlap),
    }
    for ((title, alias), businesses), business_no_overlap in zip(
        lookup_rootcat_bid.items(),
        lookup_rootcat_bid_no_overlap.values(),
    )
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(category_rows)

---

#### Filter non-overlapping from pairs

In [None]:
# Attach business ids to the pairs, then filter them against the
# non-overlapping category lookup.
traindev_df = filter_overlapping_businesses(
    df_add_business_id(traindev_df),
    lookup_rootcat_bid_no_overlap,
)

#### Manually split into N shards for cross-validation

In [None]:
# Number of groups (shards) for the manual cross-validation split.
n_split: int = 7

In [None]:
# Split the (non-overlapping) category lookup into `n_split` groups.
groups = make_group_split(lookup_rootcat_bid_no_overlap, n=n_split)
# Mapping consumed by the training loop: its items unpack as
# group ids -> (train_df, dev_df, dev_dfs).
map_cg_train_dev_groups = make_cross_eval_dfs(traindev_df, groups, lookup_rootcat_bid_no_overlap)

In [None]:
# Persist the group split and the per-group dataframes (two objects, one file).
fn_group = data_yelp_path.joinpath(f"group_data_save_k={n_split}.p")

with fn_group.open("wb") as fp:
    for payload in (groups, map_cg_train_dev_groups):
        pickle.dump(payload, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore the group split and the per-group dataframes, in dump order.
fn_group = data_yelp_path.joinpath(f"group_data_save_k={n_split}.p")

with fn_group.open("rb") as fp:
    groups, map_cg_train_dev_groups = pickle.load(fp), pickle.load(fp)

---

#### Run cross eval

In [None]:
# Settings shared by every run of the cross-evaluation loop.
# base name for the per-fold run names and the data/output directories
run_name_base: str = "manual-cross-eval-2"
# passed to the trainer as --num_train_epochs
num_epochs: int = 3
# passed to the trainer as --max_seq_length
seq_len: int = 256
# passed as --per_device_train_batch_size / --per_device_eval_batch_size
batch_size: int = 16
# value for CUDA_VISIBLE_DEVICES; a string because it is interpolated into the shell command
gpu_num: str = "1"

In [None]:
for i, (cg_ids, (train_df, dev_df, dev_dfs)) in enumerate(map_cg_train_dev_groups.items()):
    print(f"Train model on group fold {cg_ids} #{len(train_df)}...")

    run_name = f"{run_name_base}_g{i}"
    run_ext = f"_{seq_len}_{batch_size}_{num_epochs}"

    fn_data_path = Path(f"data/sentiment/{run_name_base}/{run_name}")
    if not fn_data_path.exists():
        fn_data_path.mkdir(parents=True, exist_ok=True)

    fn_run_path = Path(f"output/{run_name_base}/{run_name}{run_ext}")
    if not fn_run_path.exists():
        fn_run_path.mkdir(parents=True, exist_ok=True)

    with Timer(f"write data"):
        fn_group = fn_data_path / "group_data.p"
        with open(fn_group, "wb") as fp:
            pickle.dump(cg_ids, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(train_df, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(dev_df, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(dev_dfs, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(groups, fp, protocol=pickle.HIGHEST_PROTOCOL)

        write_pair_tdt_tsv(fn_data_path, traindev_df, split_test=0.1, split_dev=0.3)

    with Timer(f"train model - {num_epochs}"):
        ! CUDA_VISIBLE_DEVICES={gpu_num} \
            python trainer.py \
            --do_train \
            --model_name_or_path distilroberta-base \
            --task_name same-b \
            --data_dir {fn_data_path} \
            --output_dir {fn_run_path} \
            --run_name {run_name}{run_ext} \
            --per_device_train_batch_size $batch_size \
            --logging_steps 10000 \
            --save_steps 2000 \
            --num_train_epochs $num_epochs \
            --max_seq_length $seq_len \
            --evaluation_strategy epoch \
            --overwrite_output_dir

    with Timer(f"evaluate model"):
        ! CUDA_VISIBLE_DEVICES={gpu_num} \
            python trainer.py \
            --do_eval \
            --model_name_or_path distilroberta-base \
            --task_name same-b \
            --data_dir {fn_data_path} \
            --output_dir {fn_run_path} \
            --run_name {run_name}{run_ext} \
            --per_device_eval_batch_size $batch_size \
            --max_seq_length $seq_len \
            --overwrite_cache