## Softcosine Clusters
**Purpose:** Add the assigned `cluster_id`s to each outlet's document.

**Steps**:
1. Create a list of dictionaries where each dict corresponds to a `doc_id`.
    - Each dict contains the `doc_id` and cluster assignment keys (i.e., `softcos02_id`, `softcos03_id`, etc.)
2. Modify INCA so it can update a document based on a `doc_id` and add multiple new fields in one go.

In [1]:
import os
import pandas as pd
from collections import defaultdict
from operator import itemgetter
import copy

In [2]:
# matplotlib still emits log records even though logging_config.yaml sets
# disable_existing_loggers=yes (see https://stackoverflow.com/a/51529172/7016397).
# Workaround: manually set the matplotlib logger's level to WARNING *before*
# creating the project logger below; the order of these statements matters.
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from usrightmedia.shared.loggers import get_logger
# Project logger for this notebook (logger_type='main').
LOGGER = get_logger(filename = '05-softcosine-clusters-inca', logger_type='main')

In [3]:
from usrightmedia.shared.es_queries import query_by_ids

In [4]:
from inca import Inca
# INCA instance; its `database` interface is used further down to update the documents.
myinca = Inca()

In [5]:
# Relative path to the directory holding the pickled cluster dataframes.
_DF_DIR_PARTS = (
    "..",
    "..",
    "data",
    "02-intermediate",
    "06-newsevents",
    "05-softcosine-clusters",
    "dataframes",
)
df_dir = os.path.join(*_DF_DIR_PARTS)

In [6]:
def get_cluster_assignments(similarity_threshold):
    """Load the per-document cluster assignments for one similarity threshold.

    Reads ``clusters_<similarity_threshold>.pkl`` from ``df_dir`` and keeps only
    the ``doc_id`` and ``cluster_id`` columns.

    Args:
        similarity_threshold (str): threshold name, e.g., "softcos02"

    Returns:
        list of dicts: one dict per row of the pickled dataframe, with the keys
            ``doc_id`` and ``<similarity_threshold>_id`` (the row's cluster id
            stored under a threshold-specific key).

    """
    pickle_path = os.path.join(df_dir, f"clusters_{similarity_threshold}.pkl")
    clusters = pd.read_pickle(pickle_path)
    # The threshold-specific key lets assignments from several thresholds be
    # merged into one dict per document without overwriting each other.
    id_key = f"{similarity_threshold}_id"
    rows = clusters[["doc_id", "cluster_id"]].to_dict("records")
    return [{"doc_id": row["doc_id"], id_key: row["cluster_id"]} for row in rows]

In [7]:
# dict_thresholds maps each threshold name (e.g., "softcos02") to that
# threshold's cluster assignments: a list of dicts, one per document, holding
# the doc_id and the cluster id under the "<threshold>_id" key.
# Thresholds "softcos02" through "softcos09" are loaded.
# Every threshold is expected to contain the same number of documents.
# A cluster_id can correspond to a single-article or multi-article cluster.
dict_thresholds = {
    name: get_cluster_assignments(name)
    for name in (f"softcos0{n}" for n in range(2, 10))
}

In [8]:
# Each doc is assigned one cluster_id per threshold.
# Example: three entries near the end of the "softcos08" assignments.
dict_thresholds["softcos08"][-5:-2]

[{'doc_id': 'FoxNews_430089147', 'softcos08_id': 'softcos08_168917'},
 {'doc_id': 'FoxNews_430021195', 'softcos08_id': 'softcos08_168918'},
 {'doc_id': 'DailyCaller_713193397', 'softcos08_id': 'softcos08_168919'}]

In [9]:
# Example: three entries near the end of the "softcos09" assignments.
dict_thresholds["softcos09"][-5:-2]

[{'doc_id': 'FoxNews_430089147', 'softcos09_id': 'softcos09_171948'},
 {'doc_id': 'FoxNews_430021195', 'softcos09_id': 'softcos09_171949'},
 {'doc_id': 'DailyCaller_713193397', 'softcos09_id': 'softcos09_171950'}]

In [10]:
# Consolidate the cluster assignments of every threshold into one record per doc_id.
# Result: a list of dicts, ordered by doc_id, where each dict represents a
# document together with its cluster id for each threshold.
# Merge idiom taken from https://stackoverflow.com/a/5501893
merged_by_doc = defaultdict(dict)
for assignments in dict_thresholds.values():
    for entry in assignments:
        merged_by_doc[entry["doc_id"]].update(entry)

docs = list(merged_by_doc.values())
docs.sort(key=itemgetter("doc_id"))
# Independent deep copy, so changes made to docs_inca do not affect docs.
docs_inca = copy.deepcopy(docs)

In [11]:
# Rename each record's "doc_id" key to "_id" (the value is moved, not copied).
for record in docs_inca:
    record["_id"] = record.pop("doc_id")

In [12]:
# Update the documents in INCA with the cluster-id fields, using batchsize=2000.
# After running this cell, each dict in docs_inca will no longer have an "_id", as it's popped off in INCA.
myinca.database.update_documents(docs_inca, batchsize=2000)

100%|██████████| 87/87 [00:22<00:00,  3.83it/s]


In [13]:
# Spot-check: display two of the consolidated records so they can be compared against Kibana.
docs[-5:-3]

[{'doc_id': 'WashingtonExaminer_999968188',
  'softcos02_id': 'softcos02_67',
  'softcos03_id': 'softcos03_452',
  'softcos04_id': 'softcos04_454',
  'softcos05_id': 'softcos05_377',
  'softcos06_id': 'softcos06_9184',
  'softcos07_id': 'softcos07_5143',
  'softcos08_id': 'softcos08_1888',
  'softcos09_id': 'softcos09_645'},
 {'doc_id': 'WashingtonExaminer_999968698',
  'softcos02_id': 'softcos02_3526',
  'softcos03_id': 'softcos03_5803',
  'softcos04_id': 'softcos04_5326',
  'softcos05_id': 'softcos05_3802',
  'softcos06_id': 'softcos06_3094',
  'softcos07_id': 'softcos07_23553',
  'softcos08_id': 'softcos08_20064',
  'softcos09_id': 'softcos09_18305'}]