## Softcosine Clusters
**Purpose:** Export samples of clusters per similarity threshold to CSV
- Convert clusters from .jsonl files into tidy-formatted dataframes

In [1]:
import os
import json
import random
import pandas as pd

In [2]:
# All cluster artefacts live under the same intermediate "newsevents" data folder,
# addressed relative to the notebook's working directory.
newsevents_dir = os.path.join("..", "..", "data", "02-intermediate", "06-newsevents")

# Input: one .jsonl file of clusters per similarity threshold.
jsonl_dir = os.path.join(newsevents_dir, "05-softcosine-clusters", "jsonl")
# Output: pickled tidy dataframes, one per similarity threshold.
df_dir = os.path.join(newsevents_dir, "05-softcosine-clusters", "dataframes")
# Output: CSV samples, one per similarity threshold.
sample_dir = os.path.join(newsevents_dir, "05-softcosine-clusters-sample")

### Functions

In [3]:
def load_clusters(jsonl_file):
    """Lazily load clusters from a JSON Lines (.jsonl) file.

    Args:
        jsonl_file (str): path + filename for jsonl file; every non-blank
            line must hold one JSON document.

    Yields:
        The parsed JSON value of each non-blank line, in file order
        (a dict when the line holds a JSON object).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.

    """
    with open(file=jsonl_file, mode="r", encoding="utf-8") as file:
        for line in file:
            # Tolerate blank / whitespace-only lines (e.g. an extra newline at
            # the end of the file); json.loads would raise on them.
            if not line.strip():
                continue
            yield json.loads(line)

In [4]:
def flatten_docs(cluster):
    """Return one flat record per document of a cluster, oldest document first.

    Args:
        cluster (dict): must provide "docs" (a list of dicts with "_id",
            "publish_date" and "title") and, when there is at least one doc,
            "cluster_id" and "cluster_size".

    Returns:
        list of dict: one record per doc with the keys cluster_id,
        cluster_size, doc_id, doc_publish_date and doc_title, sorted
        ascending by doc_publish_date.
    """
    records = [
        {
            "cluster_id": cluster["cluster_id"],
            "cluster_size": cluster["cluster_size"],
            "doc_id": article["_id"],
            "doc_publish_date": article["publish_date"],
            "doc_title": article["title"],
        }
        for article in cluster["docs"]
    ]

    # oldest -> newest; the sort is stable, so docs sharing a publish date
    # keep the order they had in the cluster
    records.sort(key=lambda record: record["doc_publish_date"])
    return records

In [5]:
def make_df(clusters, similarity_threshold, export_pkl=True):
    """Flatten clusters and return them in a tidy-formatted dataframe (one row per document).

    Args:
        clusters (iterable of dict): clusters in the shape accepted by flatten_docs.
        similarity_threshold (str): label used in the pickle filename, e.g. "softcos05".
        export_pkl (bool): when True, also pickle the dataframe to
            <df_dir>/clusters_<similarity_threshold>.pkl.

    Returns:
        pandas.DataFrame: the rows produced by flatten_docs for every cluster,
        in input order, with a fresh 0..n-1 index. The frame is empty (and has
        no columns) when the clusters contain no documents at all.
    """
    # Gather every row first and build a single dataframe: this avoids one
    # DataFrame per cluster and, unlike pd.concat on an empty list, does not
    # raise when `clusters` is empty.
    rows = [row for cluster in clusters for row in flatten_docs(cluster)]
    merged_df = pd.DataFrame(rows)

    if export_pkl:
        # to_pickle does not create missing directories
        os.makedirs(df_dir, exist_ok=True)
        merged_df.to_pickle(os.path.join(df_dir, f"clusters_{similarity_threshold}.pkl"))

    return merged_df

### Export flattened version of clusters for each similarity threshold

In [6]:
# Build (and pickle) one tidy dataframe per similarity threshold,
# keyed by its label "softcos02" ... "softcos09".
df_dict = {}
for n in range(2, 10):
    threshold = f"softcos0{n}"
    jsonl_file = os.path.join(jsonl_dir, f"clusters_{threshold}.jsonl")
    # materialise the generator so the whole file is read before flattening
    clusters = list(load_clusters(jsonl_file))
    df = make_df(clusters, threshold, export_pkl=True)
    df_dict[threshold] = df

### Export samples for each similarity threshold

In [7]:
def make_df_sample(df_dict, similarity_threshold, export_csv=True, n_top=10, n_random=5):
    """Make a dataframe which contains a sample of multi-article clusters for a particular similarity threshold.

    The sample always contains the `n_top` biggest clusters plus up to
    `n_random` clusters drawn at random (fixed seed, so the draw is
    reproducible) from the remaining ones. Single-article clusters are never
    included. With the defaults that is at most 15 clusters.

    Args:
        df_dict (dict): maps a similarity-threshold label to its tidy cluster
            dataframe (columns cluster_id, cluster_size, doc_id,
            doc_publish_date, doc_title).
        similarity_threshold (str): key into `df_dict`; also used as prefix of
            the sample ids and in the CSV filename.
        export_csv (bool): when True, also write the sample to
            <sample_dir>/clusters_<similarity_threshold>_sample.csv.
        n_top (int): number of biggest clusters that are always included.
        n_random (int): maximum number of additional, randomly chosen clusters.

    Returns:
        pandas.DataFrame: one row per document of every sampled cluster,
        ordered by sample position (biggest clusters first) and, within a
        cluster, in the row order of the input dataframe. The columns
        cluster_type, doc_misassigned and notes are added as empty strings.

    Raises:
        KeyError: if `similarity_threshold` is not a key of `df_dict`.
        ValueError: if `n_top` or `n_random` is negative.
    """
    if n_top < 0 or n_random < 0:
        raise ValueError("n_top and n_random must be non-negative")

    df = df_dict[similarity_threshold]

    # only multi-article clusters are included
    df = df.loc[df["cluster_size"] > 1]

    # false positives are likelier in large clusters so make sure to check the biggest clusters
    df_distinct_ids = df.drop_duplicates(subset=["cluster_id"])
    df_distinct_ids = df_distinct_ids.sort_values(by="cluster_size", ascending=False).reset_index(drop=True)
    top_ids = list(df_distinct_ids.iloc[:n_top]["cluster_id"])
    other_ids = list(df_distinct_ids.iloc[n_top:]["cluster_id"])

    # Sample the other ids. A private Random(42) yields the same draw as
    # random.seed(42) followed by random.sample, without reseeding the global
    # generator. The sample size is capped so thresholds with only a few
    # multi-article clusters do not raise ValueError.
    rng = random.Random(42)
    sampled_cluster_ids = top_ids + rng.sample(other_ids, min(n_random, len(other_ids)))
    sample_positions = {cluster_id: position for position, cluster_id in enumerate(sampled_cluster_ids)}

    df_sample = df.loc[df["cluster_id"].isin(sampled_cluster_ids)].reset_index(drop=True)
    df_sample["sample_order"] = df_sample["cluster_id"].map(lambda c: sample_positions[c])
    df_sample["sample_id"] = df_sample["sample_order"].map(lambda pos: f"{similarity_threshold}_sample_{pos}")

    # Columns added as empty strings, apparently to be filled in by hand
    # (cluster_type: e.g. "newsevent" or "issue").
    df_sample["cluster_type"] = ""
    df_sample["doc_misassigned"] = ""
    df_sample["notes"] = ""

    # A stable sort keeps the existing row order inside each cluster; the
    # default quicksort gives no such guarantee.
    df_sample = df_sample.sort_values(by="sample_order", ascending=True, kind="mergesort").reset_index(drop=True)

    # sample_order was only a sorting helper and is dropped by this selection
    df_sample = df_sample[["sample_id", "cluster_id", "cluster_size",
                           "doc_id", "doc_publish_date",
                           "cluster_type",
                           "doc_misassigned", "notes",
                           "doc_title"]]

    if export_csv:
        # to_csv does not create missing directories
        os.makedirs(sample_dir, exist_ok=True)
        df_sample.to_csv(os.path.join(sample_dir, f"clusters_{similarity_threshold}_sample.csv"), index=False)

    return df_sample

In [8]:
# Write one sample CSV per similarity threshold ("softcos02" ... "softcos09").
for n in range(2, 10):
    threshold = f"softcos0{n}"
    make_df_sample(df_dict, threshold, export_csv=True)