# SameSentiment Yelp - Cross-Eval - Siamese

In [38]:
import json
import os
import pickle
from itertools import groupby
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import Timer

In [5]:
# local - input path (base); the pickles, business.json and the category list
# used by the cells below are all read from this directory
data_yelp_path = Path("data_raw/sentiment/yelp/")

# local - output path (base) for sentiment review yelp pairs
data_yelp_b_tdt_path = Path("data/sentiment/yelp-pair-b/")

---

In [6]:
# "cached" sub-directory of the Yelp input path; not referenced again in the
# visible cells (presumably the target of the root-category dataframe cache
# helpers imported from data_prep — TODO confirm)
dn_yelp_cached = data_yelp_path / "cached"

In [None]:
#  #### Load categories & topics
from data_prep import load_reviews, load_topics

# ##### Filter categories
from data_prep import filter_min_cat_combis, make_map_cats, make_cat_combis

# ##### Filter reviews
from data_prep import filter_min_review_freq, filter_both_good_bad

# ##### Filter businesses
from data_prep import filter_by_businesses, filter_by_businesses_not_same

# #### Load category tree
from data_prep import load_category_tree
from data_prep import get_root_category_items, get_children_category_item_list
from data_prep import get_businesses_in_category, get_businesses_in_category_branch


# #### Cache root category reviews in dataframes
from data_prep import cache_root_category_businesses_df, load_cached_root_category_businesses_df


# #### Positive + negative same-sentiment pairs
from data_prep import make_pairs_good_bad

# #### Not same-sentiment pairs (combinations positive + negative)
from data_prep import make_pairs_negative

# #### Dataframe for training etc.
from data_prep import make_or_load_pairs


# #### Make train/dev/test splits
from data_prep import split_df, write_pair_df_tsv, write_pair_tdt_tsv


# ### Make cross eval splits
from data_prep import build_category_business_lookup
from data_prep import filter_category_business_lookup_no_overlap

# #### Filter non-overlapping from pairs
from data_prep import df_add_business_id
from data_prep import filter_overlapping_businesses

# #### Manually split into N shards for cross-validation
from data_prep import make_group_split
from data_prep import make_cross_eval_dfs


# ### load predictions for evaluation
from utils_siamese import load_predictions, load_truths, Split, compute_metrics, NumpyEncoder

In [8]:
# Best-effort seeding: each helper is optional, a missing module/function is
# silently ignored so the notebook still runs without it.
# NOTE(review): because the call sits inside the `try`, an ImportError raised
# *inside* init_random() / set_seed() would be swallowed as well.
try:
    from utils import init_random

    init_random()
except ImportError:
    pass

try:
    from utils_siamese import set_seed

    # fixed seed value handed to the project helper
    set_seed(42)
except ImportError:
    pass

---

## Training and evaluation data

In [9]:
# number of category groups (folds); selects the cached group pickle
# (group_data_save_k=<n_split>.p) and is part of the run/data directory names
n_split = 4

In [10]:
# The file holds two objects pickled back to back; they must be read in the
# same order they were written: first the train/dev frame, then the test frame.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
fn_yelp_df = data_yelp_path / "df_traindev_test.p"

with open(fn_yelp_df, "rb") as fp:
    traindev_df = pickle.load(fp)
    test_df = pickle.load(fp)

In [None]:
# Yelp business metadata file handed to load_topics
fn_yelp_topics = data_yelp_path / "business.json"
# NOTE(review): passed in empty; presumably load_topics fills it with the ids of
# businesses that have no categories — confirm in data_prep
bids_not_cats = set()
inv_bid_cats = load_topics(fn_yelp_topics, bids_not_cats=bids_not_cats)

# presumably category -> business ids (it is fed to
# build_category_business_lookup further down) — TODO confirm
inv_cat_bids = make_map_cats(inv_bid_cats)

# not referenced again in the visible cells
inv_cat_combis = make_cat_combis(inv_bid_cats)

In [12]:
# Category hierarchy file; load_category_tree returns three values that are
# unpacked here: the category map, a name -> id map and the root categories.
fn_yelp_catgory_tree = data_yelp_path / "all_category_list.json"
map_categories, map_cat_name2id, lst_root_categories = load_category_tree(fn_yelp_catgory_tree)

In [13]:
# Lookup keyed by root category (the keys are compared against the group
# members in the cross-category cells); values are the businesses of that category.
lookup_rootcat_bid = build_category_business_lookup(map_categories, inv_cat_bids, map_cat_name2id)
# Filtered variant used for the per-category evaluation; entries may be empty
# afterwards (the evaluation loops skip empty ones).
lookup_rootcat_bid_no_overlap = filter_category_business_lookup_no_overlap(lookup_rootcat_bid)

In [14]:
# Cached group split for the configured number of folds. Two objects were
# pickled back to back: the list of category groups and the mapping
# group -> (train_df, dev_df, dev_dfs) that the evaluation loops iterate over.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
fn_group = data_yelp_path / f"group_data_save_k={n_split}.p"

with open(fn_group, "rb") as fp:
    groups = pickle.load(fp)
    map_cg_train_dev_groups = pickle.load(fp)

In [15]:
groups
# map_cg_train_dev_groups.keys()

[(('Active Life', 'active'),
  ('Arts & Entertainment', 'arts'),
  ('Automotive', 'auto'),
  ('Beauty & Spas', 'beautysvc'),
  ('Education', 'education')),
 (('Event Planning & Services', 'eventservices'),
  ('Financial Services', 'financialservices'),
  ('Food', 'food'),
  ('Health & Medical', 'health'),
  ('Home Services', 'homeservices')),
 (('Hotels & Travel', 'hotelstravel'),
  ('Local Flavor', 'localflavor'),
  ('Local Services', 'localservices'),
  ('Mass Media', 'massmedia'),
  ('Nightlife', 'nightlife')),
 (('Pets', 'pets'),
  ('Professional Services', 'professional'),
  ('Public Services & Government', 'publicservicesgovt'),
  ('Religious Organizations', 'religiousorgs'),
  ('Restaurants', 'restaurants'),
  ('Shopping', 'shopping'))]

---

#### Run cross eval

In [16]:
# name of the data directory below fn_data_base_path (shared by all models)
run_name_base_data = f"manual-cross-eval-{n_split}"
# name of the output directory below "output/" for the siamese runs
run_name_base = f"manual-cross-eval-{n_split}-siamese"
fn_data_base_path = "data/sentiment"

# hyper-parameters; they are forwarded to trainer_siamese.py and are also
# encoded in the run directory suffix "_{seq_len}_{batch_size}_{num_epochs}"
num_epochs = 15
seq_len = 256
batch_size = 512
# value exported as CUDA_VISIBLE_DEVICES for the trainer subprocess
gpu_num = "1"

---

In [None]:
# eval only (group)
# For every fold: run trainer_siamese.py in evaluation mode on the fold's own
# data directory. The model is loaded from, and the results are written to,
# the fold's run directory.
for i, (cg_ids, (train_df, dev_df, dev_dfs)) in enumerate(map_cg_train_dev_groups.items()):
    print(f"Eval model on group fold {cg_ids} #{len(train_df)}...")

    # run directory name = base name + fold index + hyper-parameter suffix
    run_name = f"{run_name_base}_g{i}"
    run_ext = f"_{seq_len}_{batch_size}_{num_epochs}"

    # the prepared data for this fold must already exist on disk
    fn_data_path = Path(f"{fn_data_base_path}/{run_name_base_data}/{run_name_base_data}_g{i}")
    print(fn_data_path)
    assert fn_data_path.exists()

    # NOTE(review): the run directory is created if missing, so a missing
    # trained model is not detected here but only inside the trainer
    fn_run_path = Path(f"output/{run_name_base}/{run_name}{run_ext}")
    fn_run_path.mkdir(parents=True, exist_ok=True)

    # IPython shell escape: {name} and $name are expanded from the notebook
    # namespace before the command is handed to the shell
    with Timer(f"evaluate model"):
        ! CUDA_VISIBLE_DEVICES={gpu_num} \
            python trainer_siamese.py \
            --do_eval \
            --data_dir {fn_data_path} \
            --output_dir {fn_run_path} \
            --model_name_or_path {fn_run_path} \
            --fn_vectors data_raw/glove.6B.50d.txt \
            --embedding_dim 50 \
            --num_epochs $num_epochs \
            --per_device_train_batch_size $batch_size \
            --per_device_eval_batch_size $batch_size \
            --max_seq_length $seq_len \
            --overwrite_cache
        
    print("\n" + ("#" * 60) + "\n")

In [None]:
# eval cross single groups
# For every fold: evaluate the fold's model separately on each of the other
# category groups. Data is read from "<fold data dir>/group-<n>", results go
# to "<fold run dir>/group-<n>".
for i, (cg_ids, (train_df, dev_df, dev_dfs)) in enumerate(map_cg_train_dev_groups.items()):
    print(f"Eval cross single groups, fold {cg_ids} ...")

    run_name = f"{run_name_base}_g{i}"
    run_ext = f"_{seq_len}_{batch_size}_{num_epochs}"
    fn_run_path = Path(f"output/{run_name_base}/{run_name}{run_ext}")
    fn_data_path = Path(f"{fn_data_base_path}/{run_name_base_data}/{run_name_base_data}_g{i}")
    # both the trained run and the prepared data have to exist already
    assert fn_run_path.exists()
    assert fn_data_path.exists()
        
    # all groups except the one this fold was trained on
    cg_groups = [cg_ids_o for cg_ids_o in groups if cg_ids_o != cg_ids]
    # assumes dev_dfs is ordered like cg_groups — TODO confirm against the code
    # that built map_cg_train_dev_groups.
    # NOTE: the inner `dev_df` shadows the outer loop variable; it is only used
    # to bound the zip, the data itself is read from disk by the trainer.
    for cgi, (cg_ids_o, dev_df) in enumerate(zip(cg_groups, dev_dfs)):
        name = f"group-{cgi}"
        dn_dev = fn_data_path / name
        fn_run_group_path = fn_run_path / name
        fn_run_group_path.mkdir(parents=True, exist_ok=True)
        
        # IPython shell escape: {name} and $name are expanded from the notebook
        # namespace; the model is always loaded from the fold's run directory
        with Timer(f"evaluate model {cgi}: {cg_ids_o}"):
            ! CUDA_VISIBLE_DEVICES={gpu_num} \
                python trainer_siamese.py \
                --do_eval \
                --data_dir {dn_dev} \
                --output_dir {fn_run_group_path} \
                --model_name_or_path {fn_run_path} \
                --fn_vectors data_raw/glove.6B.50d.txt \
                --embedding_dim 50 \
                --num_epochs $num_epochs \
                --per_device_train_batch_size $batch_size \
                --per_device_eval_batch_size $batch_size \
                --max_seq_length $seq_len \
                --overwrite_cache

        print("\n" + ("#" * 60) + "\n")

In [None]:
# eval cross single categories
# For every fold: evaluate the fold's model separately on each root category
# that was not part of the fold's training group. Data is read from
# "<fold data dir>/categ-<n>", results go to "<fold run dir>/categ-<n>".
for i, (cg_ids, (train_df, dev_df, dev_dfs)) in enumerate(map_cg_train_dev_groups.items()):
    print(f"Eval cross single categories, fold {cg_ids} ...")

    run_name = f"{run_name_base}_g{i}"
    run_ext = f"_{seq_len}_{batch_size}_{num_epochs}"
    fn_run_path = Path(f"output/{run_name_base}/{run_name}{run_ext}")
    fn_data_path = Path(f"{fn_data_base_path}/{run_name_base_data}/{run_name_base_data}_g{i}")
    # both the trained run and the prepared data have to exist already
    assert fn_run_path.exists()
    assert fn_data_path.exists()
    
    # NOTE: `cgi` counts every category, including the skipped ones, so the
    # "categ-<n>" directory numbers are not contiguous within a fold
    for cgi, (cg_id, businesses) in enumerate(lookup_rootcat_bid_no_overlap.items()):
        # category belongs to this fold's training group: no eval split was
        # created for it, so skip
        if cg_id in cg_ids:
            continue
        # nothing left for this category after the no-overlap filtering
        if not businesses:
            print(f"Skip empty df: {cg_id}")
            continue

        name = f"categ-{cgi}"
        dn_dev = fn_data_path / name
        fn_run_group_path = fn_run_path / name
        fn_run_group_path.mkdir(parents=True, exist_ok=True)
        
        # IPython shell escape: {name} and $name are expanded from the notebook
        # namespace; the model is always loaded from the fold's run directory
        with Timer(f"evaluate model {cgi}: {cg_id}"):
            ! CUDA_VISIBLE_DEVICES={gpu_num} \
                python trainer_siamese.py \
                --do_eval \
                --data_dir {dn_dev} \
                --output_dir {fn_run_group_path} \
                --model_name_or_path {fn_run_path} \
                --fn_vectors data_raw/glove.6B.50d.txt \
                --embedding_dim 50 \
                --num_epochs $num_epochs \
                --per_device_train_batch_size $batch_size \
                --per_device_eval_batch_size $batch_size \
                --max_seq_length $seq_len \
                --overwrite_cache

        print("\n" + ("#" * 60) + "\n")

---

- based on: https://github.com/Querela/argmining19-same-side-classification/blob/same-sentiment/SentimentDatasets_Yelp_Categories-CrossEval.ipynb

In [24]:
def do_metrics(fn_data_path, fn_run_path, mode, precision=8, store=True):
    """Score the stored predictions of a run against the gold labels.

    Args:
        fn_data_path: Location handed to ``load_truths`` (together with ``mode``).
        fn_run_path: Location handed to ``load_predictions``; also the directory
            that receives the metrics JSON file when ``store`` is true.
        mode: Split to evaluate; its ``.value`` is used as the file name prefix.
        precision: Forwarded unchanged to ``compute_metrics``.
        store: When true, write the metrics to
            ``<fn_run_path>/<mode.value>_metrics.json``.

    Returns:
        The object returned by ``compute_metrics``.
    """
    # gold labels are loaded first, predictions second
    metrics = compute_metrics(
        load_truths(fn_data_path, mode),
        load_predictions(fn_run_path, mode),
        precision=precision,
    )

    if not store:
        return metrics

    # NumpyEncoder is supplied as the JSON encoder class for the metric values
    metrics_path = os.path.join(fn_run_path, f"{mode.value!s}_metrics.json")
    with open(metrics_path, "w") as fp:
        json.dump(metrics, fp, indent=4, cls=NumpyEncoder)

    return metrics

In [None]:
# Split whose predictions are scored in this cell
mode = Split.dev  # "dev"

# fold -> metrics on the fold's own data
cg_results = dict()
# (fold, other group, fold index, group index) -> metrics
cg_results_single_group = dict()
# (fold, category, fold index, category index) -> metrics
cg_results_single_cat = dict()

# The directory layout below must mirror the three evaluation cells above,
# which wrote the predictions that do_metrics now reads.
for i, (cg_ids, (train_df, dev_df, dev_dfs)) in enumerate(map_cg_train_dev_groups.items()):
    print(f"Load predictions {cg_ids}...")

    run_name = f"{run_name_base}_g{i}"
    run_ext = f"_{seq_len}_{batch_size}_{num_epochs}"
    fn_run_path = Path(f"output/{run_name_base}/{run_name}{run_ext}")
    fn_data_path = Path(f"{fn_data_base_path}/{run_name_base_data}/{run_name_base_data}_g{i}")
    assert fn_data_path.exists()
    assert fn_run_path.exists()

    # metrics of the fold's model on the fold's own data directory
    m = do_metrics(fn_data_path, fn_run_path, mode)
    cg_results[cg_ids] = m
        
    # ------------------
    # metrics on each of the other groups ("group-<n>" sub-directories)

    cg_groups = [cg_ids_o for cg_ids_o in groups if cg_ids_o != cg_ids]
    # NOTE: the inner `dev_df` shadows the outer loop variable; it is unused here
    for cgi, (cg_ids_o, dev_df) in enumerate(zip(cg_groups, dev_dfs)):
        name = f"group-{cgi}"
        dn_dev = fn_data_path / name
        fn_run_group_path = fn_run_path / name

        m = do_metrics(dn_dev, fn_run_group_path, mode)
        cg_results_single_group[(cg_ids, cg_ids_o, i, cgi)]  = m
        
    # ------------------
    # metrics on each single category ("categ-<n>" sub-directories); `cgi`
    # counts skipped categories too, matching the numbering of the eval cell

    for cgi, (cg_id, businesses) in enumerate(lookup_rootcat_bid_no_overlap.items()):
        # category belongs to this fold's training group: no eval split was
        # created for it, so skip
        if cg_id in cg_ids:
            continue
        if not businesses:
            print(f"Skip empty df: {cg_id}")
            continue

        name = f"categ-{cgi}"
        dn_dev = fn_data_path / name
        fn_run_group_path = fn_run_path / name
        
        m = do_metrics(dn_dev, fn_run_group_path, mode)
        cg_results_single_cat[(cg_ids, cg_id, i, cgi)] = m

    # old:
    # cg_results[cg_ids] = compute_metrics(y_true, y_pred, precision=8, averaging="macro", dump=False)

---

In [26]:
# single pickle file that stores all three result dictionaries of this run
fn_cg_results = Path(f"output/{run_name_base}/cg_results.p")

In [27]:
# Write the three result dictionaries back to back into one file; the reader
# has to load them in exactly this order.
with open(fn_cg_results, "wb") as fp:
    pickle.dump(cg_results, fp, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(cg_results_single_group, fp, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(cg_results_single_cat, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
# Reload the stored results (same order as written) so the reporting cells
# below can be run without repeating the evaluation.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open(fn_cg_results, "rb") as fp:
    cg_results = pickle.load(fp)
    cg_results_single_group = pickle.load(fp)
    cg_results_single_cat = pickle.load(fp)

In [29]:
# Report precision / recall / F-score / accuracy (in percent) for every fold
# evaluated on its own data; the fold is printed as a header above its line.
cur_cat = None
for cg_ids, m in cg_results.items():
    if cur_cat is None or cur_cat != cg_ids:
        # first line of a new fold: emit the header and remember the fold
        print(f"\n\n{cg_ids}\n")
        cur_cat = cg_ids
    prec = m["precision"]
    rec = m["recall"]
    f1 = m["fscore"]
    acc = m["accuracy"]
    print(f"P:{prec * 100:5.2f}%  R:{rec * 100:5.2f}%  F:{f1 * 100:5.2f}%  A:{acc * 100:5.2f}%")



(('Active Life', 'active'), ('Arts & Entertainment', 'arts'), ('Automotive', 'auto'), ('Beauty & Spas', 'beautysvc'), ('Education', 'education'))

P:80.89%  R:82.85%  F:81.86%  A:81.61%


(('Event Planning & Services', 'eventservices'), ('Financial Services', 'financialservices'), ('Food', 'food'), ('Health & Medical', 'health'), ('Home Services', 'homeservices'))

P:81.28%  R:83.77%  F:82.51%  A:82.20%


(('Hotels & Travel', 'hotelstravel'), ('Local Flavor', 'localflavor'), ('Local Services', 'localservices'), ('Mass Media', 'massmedia'), ('Nightlife', 'nightlife'))

P:81.95%  R:84.35%  F:83.13%  A:82.85%


(('Pets', 'pets'), ('Professional Services', 'professional'), ('Public Services & Government', 'publicservicesgovt'), ('Religious Organizations', 'religiousorgs'), ('Restaurants', 'restaurants'), ('Shopping', 'shopping'))

P:83.05%  R:82.49%  F:82.77%  A:82.80%


In [30]:
# Report the metrics (in percent) of every fold's model on each of the other
# category groups; lines are grouped under a header naming the training fold.
cur_cat = None
for (cg_ids, cg_ids_o, i, cgi), m in cg_results_single_group.items():
    if cur_cat is None or cur_cat != cg_ids:
        # first line of a new training fold: emit the header and remember it
        print(f"\n\n{cg_ids}\n")
        cur_cat = cg_ids
    prec = m["precision"]
    rec = m["recall"]
    f1 = m["fscore"]
    acc = m["accuracy"]
    print(f"P:{prec * 100:5.2f}%  R:{rec * 100:5.2f}%  F:{f1 * 100:5.2f}%  A:{acc * 100:5.2f}% -- {cg_ids_o}")



(('Active Life', 'active'), ('Arts & Entertainment', 'arts'), ('Automotive', 'auto'), ('Beauty & Spas', 'beautysvc'), ('Education', 'education'))

P:84.06%  R:85.99%  F:85.01%  A:84.83% -- (('Event Planning & Services', 'eventservices'), ('Financial Services', 'financialservices'), ('Food', 'food'), ('Health & Medical', 'health'), ('Home Services', 'homeservices'))
P:79.62%  R:82.64%  F:81.10%  A:80.72% -- (('Hotels & Travel', 'hotelstravel'), ('Local Flavor', 'localflavor'), ('Local Services', 'localservices'), ('Mass Media', 'massmedia'), ('Nightlife', 'nightlife'))
P:80.92%  R:83.13%  F:82.01%  A:81.78% -- (('Pets', 'pets'), ('Professional Services', 'professional'), ('Public Services & Government', 'publicservicesgovt'), ('Religious Organizations', 'religiousorgs'), ('Restaurants', 'restaurants'), ('Shopping', 'shopping'))


(('Event Planning & Services', 'eventservices'), ('Financial Services', 'financialservices'), ('Food', 'food'), ('Health & Medical', 'health'), ('Home Servic

In [31]:
# Report the metrics (in percent) of every fold's model on each single
# category; lines are grouped under a header naming the training fold.
cur_cat = None
for (cg_ids, cg_id, i, cgi), m in cg_results_single_cat.items():
    if cur_cat is None or cur_cat != cg_ids:
        # first line of a new training fold: emit the header and remember it
        print(f"\n\n{cg_ids}\n")
        cur_cat = cg_ids
    prec = m["precision"]
    rec = m["recall"]
    f1 = m["fscore"]
    acc = m["accuracy"]
    print(f"P:{prec * 100:5.2f}%  R:{rec * 100:5.2f}%  F:{f1 * 100:5.2f}%  A:{acc * 100:5.2f}% -- {cg_id}")



(('Active Life', 'active'), ('Arts & Entertainment', 'arts'), ('Automotive', 'auto'), ('Beauty & Spas', 'beautysvc'), ('Education', 'education'))

P:86.05%  R:86.25%  F:86.15%  A:86.21% -- ('Event Planning & Services', 'eventservices')
P:86.17%  R:89.01%  F:87.57%  A:87.48% -- ('Financial Services', 'financialservices')
P:77.78%  R:80.80%  F:79.26%  A:78.87% -- ('Food', 'food')
P:90.71%  R:91.47%  F:91.09%  A:91.03% -- ('Health & Medical', 'health')
P:91.76%  R:91.99%  F:91.87%  A:91.85% -- ('Home Services', 'homeservices')
P:82.25%  R:84.78%  F:83.49%  A:83.31% -- ('Hotels & Travel', 'hotelstravel')
P:73.33%  R:78.57%  F:75.86%  A:75.44% -- ('Local Flavor', 'localflavor')
P:89.64%  R:89.73%  F:89.68%  A:89.70% -- ('Local Services', 'localservices')
P:77.78%  R:75.00%  F:76.36%  A:77.19% -- ('Mass Media', 'massmedia')
P:74.47%  R:78.73%  F:76.54%  A:75.74% -- ('Nightlife', 'nightlife')
P:90.34%  R:91.05%  F:90.69%  A:90.68% -- ('Pets', 'pets')
P:92.44%  R:89.83%  F:91.12%  A:91.22% -

In [44]:
# min/max for cross-groups
# groupby only merges *adjacent* items; this relies on the result dict keeping
# all entries of one training fold next to each other (insertion order).
for k, g in groupby(cg_results_single_group.items(), key=lambda x: x[0][0]):
    print(f"\n{k}\n")
    # collect the accuracies of this fold, then take the extremes in one go
    all_acc = list()
    for (cg_ids, cg_ids_o, i, cgi), m in g:
        acc = m["accuracy"]
        all_acc.append(acc)
    cur_min, cur_max = min(all_acc), max(all_acc)
    print(f"A:{cur_min * 100:5.2f}% - {cur_max * 100:5.2f}%")
    # NOTE: "var" is the variance of the raw fractions scaled by 100
    print(f"avg: {np.mean(all_acc) * 100:.2f}%, var: {np.var(all_acc) * 100:.2f}")


(('Active Life', 'active'), ('Arts & Entertainment', 'arts'), ('Automotive', 'auto'), ('Beauty & Spas', 'beautysvc'), ('Education', 'education'))

A:80.72% - 84.83%
avg: 82.44%, var: 0.03

(('Event Planning & Services', 'eventservices'), ('Financial Services', 'financialservices'), ('Food', 'food'), ('Health & Medical', 'health'), ('Home Services', 'homeservices'))

A:81.42% - 88.84%
avg: 84.23%, var: 0.11

(('Hotels & Travel', 'hotelstravel'), ('Local Flavor', 'localflavor'), ('Local Services', 'localservices'), ('Mass Media', 'massmedia'), ('Nightlife', 'nightlife'))

A:83.11% - 89.37%
avg: 86.16%, var: 0.07

(('Pets', 'pets'), ('Professional Services', 'professional'), ('Public Services & Government', 'publicservicesgovt'), ('Religious Organizations', 'religiousorgs'), ('Restaurants', 'restaurants'), ('Shopping', 'shopping'))

A:84.00% - 90.53%
avg: 87.39%, var: 0.07


In [45]:
# min/max for cross-category
# groupby only merges *adjacent* items; this relies on the result dict keeping
# all entries of one training fold next to each other (insertion order).
for k, g in groupby(cg_results_single_cat.items(), key=lambda x: x[0][0]):
    print(f"\n{k}\n")
    # collect the accuracies of this fold, then take the extremes in one go
    all_acc = list()
    for (cg_ids, cg_id, i, cgi), m in g:
        acc = m["accuracy"]
        all_acc.append(acc)
    cur_min, cur_max = min(all_acc), max(all_acc)
    print(f"acc: {cur_min * 100:5.2f}% - {cur_max * 100:5.2f}%")
    # NOTE: "var" is the variance of the raw fractions scaled by 100
    print(f"avg: {np.mean(all_acc) * 100:.2f}%, var: {np.var(all_acc) * 100:.2f}")


(('Active Life', 'active'), ('Arts & Entertainment', 'arts'), ('Automotive', 'auto'), ('Beauty & Spas', 'beautysvc'), ('Education', 'education'))

acc: 73.64% - 91.85%
avg: 83.28%, var: 0.40

(('Event Planning & Services', 'eventservices'), ('Financial Services', 'financialservices'), ('Food', 'food'), ('Health & Medical', 'health'), ('Home Services', 'homeservices'))

acc: 75.84% - 91.42%
avg: 83.60%, var: 0.32

(('Hotels & Travel', 'hotelstravel'), ('Local Flavor', 'localflavor'), ('Local Services', 'localservices'), ('Mass Media', 'massmedia'), ('Nightlife', 'nightlife'))

acc: 76.78% - 92.78%
avg: 86.06%, var: 0.32

(('Pets', 'pets'), ('Professional Services', 'professional'), ('Public Services & Government', 'publicservicesgovt'), ('Religious Organizations', 'religiousorgs'), ('Restaurants', 'restaurants'), ('Shopping', 'shopping'))

acc: 77.19% - 93.51%
avg: 86.62%, var: 0.34


In [34]:
# compute average accuracy over single categories
# Accuracies are gathered per training fold; a fresh list is started each time
# the fold in the key changes while walking the results in stored order.
map_cg_cat_avg = dict()
cur_cat = None
for (cg_ids, cg_id, i, cgi), m in cg_results_single_cat.items():
    if cur_cat is None or cur_cat != cg_ids:
        map_cg_cat_avg[cg_ids] = list()
        cur_cat = cg_ids
    prec = m["precision"]
    rec = m["recall"]
    f1 = m["fscore"]
    acc = m["accuracy"]
    # only the accuracy is aggregated here
    map_cg_cat_avg[cg_ids].append(acc)

# one summary line per training fold
# NOTE: "var" is the variance of the raw fractions scaled by 100
for cg_ids, vals in map_cg_cat_avg.items():
    print(f"avg: {np.mean(vals) * 100:.2f}%, var: {np.var(vals) * 100:.2f}     {cg_ids}")

avg: 83.28%, var: 0.40     (('Active Life', 'active'), ('Arts & Entertainment', 'arts'), ('Automotive', 'auto'), ('Beauty & Spas', 'beautysvc'), ('Education', 'education'))
avg: 83.60%, var: 0.32     (('Event Planning & Services', 'eventservices'), ('Financial Services', 'financialservices'), ('Food', 'food'), ('Health & Medical', 'health'), ('Home Services', 'homeservices'))
avg: 86.06%, var: 0.32     (('Hotels & Travel', 'hotelstravel'), ('Local Flavor', 'localflavor'), ('Local Services', 'localservices'), ('Mass Media', 'massmedia'), ('Nightlife', 'nightlife'))
avg: 86.62%, var: 0.34     (('Pets', 'pets'), ('Professional Services', 'professional'), ('Public Services & Government', 'publicservicesgovt'), ('Religious Organizations', 'religiousorgs'), ('Restaurants', 'restaurants'), ('Shopping', 'shopping'))


---