# SameSentiment Yelp - CrossEval - Data Setup

In [1]:
import pickle
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from utils import Timer

tqdm.pandas()

In [2]:
# local - input path with the raw yelp files read by this notebook
# (review.json, business.json, all_category_list.json and the pickled dataframes)
data_yelp_path = Path("data_raw/sentiment/yelp/")

# local - output path (base) for sentiment review yelp pairs
data_yelp_b_tdt_path = Path("data/sentiment/yelp-pair-b/")
# local - output paths for simple sentiment reviews yelp
# NOTE(review): the "5" / "b" suffixes presumably mean 5-class vs. binary labels -- confirm
data_yelp_tdt_sentiment_5_path = Path("data/sentiment/yelp-sentiment-5/")
data_yelp_tdt_sentiment_b_path = Path("data/sentiment/yelp-sentiment-b/")

---

In [3]:
# "cached" sub-directory below the raw yelp data path
# NOTE(review): not referenced by the other cells shown here; presumably used
# by the caching helpers from data_prep -- confirm
dn_yelp_cached = data_yelp_path / "cached"

In [4]:
#  #### Load categories & topics
from data_prep import load_reviews, load_topics

# ##### Filter categories
from data_prep import filter_min_cat_combis, make_map_cats, make_cat_combis

# ##### Filter reviews
from data_prep import filter_min_review_freq, filter_both_good_bad

# ##### Filter businesses
from data_prep import filter_by_businesses, filter_by_businesses_not_same

# #### Load category tree
from data_prep import load_category_tree
from data_prep import get_root_category_items, get_children_category_item_list
from data_prep import get_businesses_in_category, get_businesses_in_category_branch


# #### Cache root category reviews in dataframes
from data_prep import cache_root_category_businesses_df, load_cached_root_category_businesses_df


# #### Positive + negative same-sentiment pairs
from data_prep import make_pairs_good_bad

# #### Not same-sentiment pairs (combinations positive + negative)
from data_prep import make_pairs_negative

# #### Dataframe for training etc.
from data_prep import make_or_load_pairs


# #### Make train/dev/test splits
from data_prep import split_df, write_pair_df_tsv, write_pair_tdt_tsv


# ### Make cross eval splits
from data_prep import build_category_business_lookup
from data_prep import filter_category_business_lookup_no_overlap

# #### Filter non-overlapping from pairs
from data_prep import df_add_business_id
from data_prep import filter_overlapping_businesses

# #### Manually split into N shards for cross-validation
from data_prep import make_group_split
from data_prep import make_cross_eval_dfs

---

In [5]:
# N positive + N negative same-sentiment pairs
# --> 2N pos+neg (not same-sentiment) pairs
num_pairs_per_class = 2

#: number of not-same-sentiment pairs: 2N, i.e. as many as the positive and
#: the negative same-sentiment pairs together
num_pairs_negative = 2 * num_pairs_per_class

#: whether for a single side (good or bad) there can be multiple occurrences of the same review
#: may need to check afterwards that the same pairing does not happen by chance ...
repeatable_on_side = False

In [6]:
# Best-effort initialisation of the random state (presumably seeding the RNGs
# so that the sampled pairs are reproducible -- confirm in the helpers).
# An ImportError in either step is swallowed and the step is skipped.
# NOTE: `utils` itself is imported unconditionally at the top of the notebook,
# so the first guard only matters if `init_random` is missing from it (or if
# the call itself raises ImportError).
try:
    from utils import init_random

    init_random()
except ImportError:
    pass

try:
    from utils_siamese import set_seed

    # fixed seed value
    set_seed(42)
except ImportError:
    pass

---

## Run

#### Load reviews

In [7]:
# raw yelp review file
fn_yelp_reviews = data_yelp_path / "review.json"
# `df` holds all loaded reviews (presumably a pandas DataFrame -- confirm in data_prep.load_reviews)
df = load_reviews(fn_yelp_reviews)

3001it [00:00, 91140.91it/s]


#### Load categories for businesses

- business (id) with its list of topics/categories
- lookups (business -> categories, category -> businesses)
- list of category combinations (with counts)

In [8]:
# raw yelp business file (business id with its categories)
fn_yelp_topics = data_yelp_path / "business.json"
# handed to load_topics as `bids_not_cats`
# presumably filled in-place with the ids of businesses without categories -- TODO confirm
bids_not_cats = set()
# lookup: business -> categories
inv_bid_cats = load_topics(fn_yelp_topics, bids_not_cats=bids_not_cats)

# lookup: category -> businesses
inv_cat_bids = make_map_cats(inv_bid_cats)

# category combinations, derived from the business -> categories lookup
inv_cat_combis = make_cat_combis(inv_bid_cats)

150346it [00:02, 67646.90it/s]
100%|██████████| 150243/150243 [00:00<00:00, 517704.83it/s]
100%|██████████| 150243/150243 [00:00<00:00, 422970.00it/s]


#### Load category tree

- hierarchy of categories

In [9]:
# file with the yelp category hierarchy
# NOTE: `catgory` in the variable name is a typo of `category`; the name is
# kept unchanged because other cells may reference it
fn_yelp_catgory_tree = data_yelp_path / "all_category_list.json"
# returns three values, judging by their names: the category items, a
# category name -> id lookup and the list of root categories -- confirm in
# data_prep.load_category_tree
map_categories, map_cat_name2id, lst_root_categories = load_category_tree(fn_yelp_catgory_tree)

#### Pre-Cache all root category businesses (reviews)

In [10]:
# pre-cache the reviews of all root category businesses
# NOTE(review): the cache location is decided inside the helper; presumably
# below the raw yelp data path -- confirm
cache_root_category_businesses_df(df, inv_cat_bids, map_categories, map_cat_name2id)

---

## Write out training data

### Split Test-Data

In [11]:
# Pickled pair dataframe (train/dev part).
# NOTE: `traindev_df` is assigned again from "df_traindev_test.p" in the
# following cells, so this value only survives if those cells are skipped.
# NOTE: only unpickle files from a trusted source.
fn_yelp_df = data_yelp_path / "df_traindev4_typed.p"

with fn_yelp_df.open("rb") as fp:
    traindev_df = pickle.load(fp)

In [12]:
# pickle file holding two objects in sequence: the train/dev dataframe and the test dataframe
fn_yelp_df = data_yelp_path / "df_traindev_test.p"

In [13]:
# The file contains two consecutive pickles; they have to be read in the
# same order: train/dev first, test second.
# NOTE: only unpickle files from a trusted source.
with fn_yelp_df.open("rb") as fp:
    traindev_df = pickle.load(fp)
    test_df = pickle.load(fp)

---

### Make cross eval splits

In [14]:
# Re-entry point for the cross-eval part: (re-)load the train/dev and test
# dataframes, stored as two consecutive pickles in one file.
# NOTE: only unpickle files from a trusted source.
fn_yelp_df = data_yelp_path / "df_traindev_test.p"

with fn_yelp_df.open("rb") as fp:
    traindev_df = pickle.load(fp)
    test_df = pickle.load(fp)

In [15]:
# lookup: root category -> businesses; the keys are (title, alias) tuples
# (they are unpacked that way in the overview cell below)
lookup_rootcat_bid = build_category_business_lookup(map_categories, inv_cat_bids, map_cat_name2id)
# same lookup after removing overlapping businesses
# presumably: only businesses that belong to a single root category remain -- confirm
lookup_rootcat_bid_no_overlap = filter_category_business_lookup_no_overlap(lookup_rootcat_bid)

In [16]:
# Totals over all root categories, before and after the overlap filtering.
n_bids_overlapping = sum(map(len, lookup_rootcat_bid.values()))
n_bids_no_overlap = sum(map(len, lookup_rootcat_bid_no_overlap.values()))
print(f"Number businesses all (overlapping): {n_bids_overlapping}")
print(f"Number businesses all (no overlap): {n_bids_no_overlap}")

# One overview row per root category. Both lookups are walked in lockstep,
# i.e. this relies on them sharing the same key order.
overview_rows = [
    {
        "category": cat_title,
        "alias": cat_alias,
        "businesses": len(bids_all),
        "business (no overlap)": len(bids_unique),
    }
    for ((cat_title, cat_alias), bids_all), bids_unique in zip(
        lookup_rootcat_bid.items(), lookup_rootcat_bid_no_overlap.values()
    )
]
# last expression of the cell -> rendered as a table
pd.DataFrame(overview_rows)

Number businesses all (overlapping): 221837
Number businesses all (no overlap): 94395


Unnamed: 0,category,alias,businesses,business (no overlap)
0,Active Life,active,7687,3391
1,Arts & Entertainment,arts,5434,718
2,Automotive,auto,10773,8187
3,Beauty & Spas,beautysvc,14292,9702
4,Bicycles,bicycles,5,0
5,Education,education,1936,347
6,Event Planning & Services,eventservices,9895,914
7,Financial Services,financialservices,1487,732
8,Food,food,27781,7733
9,Health & Medical,health,11890,6839


---

#### Filter non-overlapping from pairs

In [17]:
# attach the business id to every pair, then filter the pairs using the
# non-overlapping root-category lookup
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'DataFrame' object has no attribute 'business_id'";
# presumably the loaded frame did not get a `business_id` column -- verify
# the helpers against the pickled data
traindev_df = df_add_business_id(traindev_df)
traindev_df = filter_overlapping_businesses(traindev_df, lookup_rootcat_bid_no_overlap)

  total = df.size // df.shape[axis]
1it [00:00, 2870.84it/s]


AttributeError: 'DataFrame' object has no attribute 'business_id'

#### Manually split into N shards for cross-validation

In [None]:
# number of shards for the manual cross-validation split
# (also part of the group file name and of the run name below)
n_split = 4

In [None]:
# split the root categories (non-overlapping lookup) into `n_split` groups
groups = make_group_split(lookup_rootcat_bid_no_overlap, n=n_split)
# mapping: group of category ids -> (train_df, dev_df, dev_dfs)
# (the export loop at the end of the notebook unpacks the values this way)
map_cg_train_dev_groups = make_cross_eval_dfs(traindev_df, groups, lookup_rootcat_bid_no_overlap)

In [None]:
# Persist the group split and the per-fold dataframes as two consecutive
# pickles in a single file (order matters for loading: groups first).
fn_group = data_yelp_path / f"group_data_save_k={n_split}.p"

with fn_group.open("wb") as fp:
    for _payload in (groups, map_cg_train_dev_groups):
        pickle.dump(_payload, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore the group split and the per-fold dataframes; they were written as
# two consecutive pickles, groups first.
# NOTE: only unpickle files from a trusted source.
fn_group = data_yelp_path / f"group_data_save_k={n_split}.p"

with fn_group.open("rb") as fp:
    groups = pickle.load(fp)
    map_cg_train_dev_groups = pickle.load(fp)

---

#### Run cross eval

In [18]:
# base name shared by all fold runs; requires `n_split` from the split cell
# above (the recorded NameError stems from that cell not having been executed)
run_name_base = f"manual-cross-eval-{n_split}"
# base output directory for the per-fold data
fn_data_base_path = "data/sentiment"

NameError: name 'n_split' is not defined

In [19]:
# Export the data of every cross-eval fold.
# Per fold (key: the group of category ids, value: train/dev dataframes):
#   1. pickle the fold data to "<fold dir>/group_data.p"
#   2. write the train/dev/test TSV files into the fold directory
#   3. write one dev TSV per other group        ("group-<n>/dev.tsv")
#   4. write one dev TSV per other root category ("categ-<n>/dev.tsv")
# Requires `map_cg_train_dev_groups`, `groups`, `traindev_df` (with a
# `business_id` column) and `lookup_rootcat_bid_no_overlap` from the cells above.
for i, (cg_ids, (train_df, dev_df, dev_dfs)) in enumerate(map_cg_train_dev_groups.items()):
    print(f"Write data for group fold {cg_ids} #{len(train_df)}...")

    run_name = f"{run_name_base}_g{i}"

    fn_data_path = Path(f"{fn_data_base_path}/{run_name_base}/{run_name}")
    fn_data_path.mkdir(parents=True, exist_ok=True)

    with Timer("write data"):
        # five consecutive pickles; readers have to load them in this order
        fn_group = fn_data_path / "group_data.p"
        with open(fn_group, "wb") as fp:
            pickle.dump(cg_ids, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(train_df, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(dev_df, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(dev_dfs, fp, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(groups, fp, protocol=pickle.HIGHEST_PROTOCOL)

        # NOTE(review): this writes the complete `traindev_df` for every fold,
        # not the fold specific `train_df` -- confirm that this is intended
        write_pair_tdt_tsv(fn_data_path, traindev_df, split_test=0.1, split_dev=0.3)

    # ------------------
    # dev data of the other groups (all groups except the one of this fold);
    # zipping keeps the pairing and stops at the shorter of both sequences

    cg_groups = [cg_ids_o for cg_ids_o in groups if cg_ids_o != cg_ids]
    for cgi, (_cg_ids_o, group_dev_df) in enumerate(zip(cg_groups, dev_dfs)):
        fn_dev_tsv = fn_data_path / f"group-{cgi}" / "dev.tsv"
        # make sure the sub-directory exists (no-op if it already does)
        fn_dev_tsv.parent.mkdir(parents=True, exist_ok=True)
        write_pair_df_tsv(group_dev_df, fn_dev_tsv, desc="dev")

    # ------------------
    # dev data per single root category

    for cgi, (cg_id, businesses) in enumerate(lookup_rootcat_bid_no_overlap.items()):
        # check if train data, skip since we did not split into eval
        if cg_id in cg_ids:
            continue
        if not businesses:
            print(f"Skip empty df: {cg_id}")
            continue

        # all pairs whose business belongs to this root category
        categ_dev_df = traindev_df[traindev_df.business_id.isin(businesses)]

        fn_dev_tsv = fn_data_path / f"categ-{cgi}" / "dev.tsv"
        # make sure the sub-directory exists (no-op if it already does)
        fn_dev_tsv.parent.mkdir(parents=True, exist_ok=True)
        write_pair_df_tsv(categ_dev_df, fn_dev_tsv, desc="dev")

NameError: name 'map_cg_train_dev_groups' is not defined