# GLiNER Finetuned Model Output analysis on PRWP documents

In [None]:
%%capture
!pip install rapidfuzz

In [None]:
import pandas as pd
import glob

# Earlier two-part input locations, kept for reference (disabled).
#fnames2 = []
#fnames2 = glob.glob("/kaggle/input/onedrive-prwp-outputs/all_prwp_finetuned_output_part1/output/*.json")
#fnames1 = glob.glob("/kaggle/input/onedrive-prwp-outputs/all_prwp_finetuned_output_part2/output/*.json")

# Paths of every JSON output file in the Kaggle input directory.
# NOTE: glob does not guarantee any particular ordering of the returned paths.
fnames = glob.glob("/kaggle/input/prwp-gliner-outputs/output/*.json")

In [None]:
# Number of JSON output files matched by the glob pattern.
len(fnames)

In [None]:
from tqdm.auto import tqdm

In [None]:
# Read every output file and keep only its 'text' and 'dataset' columns.
# The per-file frames are gathered in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies the accumulated frame for every
# file, which is quadratic in the number of files.
frames = []
for fname in tqdm(fnames, desc="processing dfs"):
    res_read = pd.read_json(fname)
    if res_read.shape[0] == 0:
        # Files without any rows contribute nothing.
        continue
    frames.append(res_read[['text', 'dataset']])

# pd.concat raises on an empty list, so fall back to an empty frame, which is
# what the loop produced before when no file had rows.
res_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# One plain dict per row of the combined frame.
records = res_df.to_dict(orient='records')

# Flatten the nested 'dataset' field into dotted columns and shorten the
# resulting column names in one chained expression.
# NOTE(review): record_path is disabled in this pass, so relations are not
# expanded into separate rows here.
flat = pd.json_normalize(
    records,
    # record_path='relations',
    meta=[
        ['dataset', 'start'],
        ['dataset', 'end'],
        ['dataset', 'text'],
        ['dataset', 'label'],
        ['dataset', 'score'],
    ],
).rename(
    columns={
        'dataset.start': 'start',
        'dataset.end': 'end',
        'dataset.text': 'dataset',
        'dataset.label': 'ds_label',
        'dataset.score': 'ds_score',
        'score': 'rel_score',  # relation confidence, when such a column exists
    }
)

In [None]:
from tqdm.auto import tqdm

In [None]:
# Re-read every output file, this time keeping the 'dataset' and 'relations'
# columns. Frames are collected in a list and concatenated once, because
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic in the number of files).
frames = []
for fname in tqdm(fnames, desc="processing dfs"):
    res_read = pd.read_json(fname)
    if res_read.shape[0] == 0:
        # Files without any rows contribute nothing.
        continue
    frames.append(res_read[['dataset', 'relations']])

# pd.concat raises on an empty list, so fall back to an empty frame, which is
# what the loop produced before when no file had rows.
res_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# Display the combined 'dataset' / 'relations' frame as the cell output.
res_df

In [None]:
import pandas as pd

# One plain dict per row of the combined frame.
records = res_df.to_dict(orient='records')

# Expand the 'relations' list so that every relation entry becomes a row,
# carry the fields of the parent 'dataset' dict along as metadata, and give
# the dotted metadata columns shorter names.
flat = pd.json_normalize(
    records,
    record_path='relations',
    meta=[
        ['dataset', 'start'],
        ['dataset', 'end'],
        ['dataset', 'text'],
        ['dataset', 'label'],
        ['dataset', 'score'],
    ],
).rename(
    columns={
        'dataset.start': 'start',
        'dataset.end': 'end',
        'dataset.text': 'ds_text',
        'dataset.label': 'ds_label',
        'dataset.score': 'ds_score',
        'score': 'rel_score',
    }
)
flat.head()

In [None]:
# Keep a single row per (dataset text, relation) pair: order by relation
# score, highest first, then keep the first occurrence of each pair.
flat = flat.sort_values(by='rel_score', ascending=False)
flat = flat.drop_duplicates(subset=['ds_text', 'relation'], keep='first')
flat = flat.reset_index(drop=True)

# Reshape to wide form: one column per relation name holding its target,
# indexed by the dataset-mention fields, which are then restored as columns.
meta = flat.pivot(
    index=['ds_text', 'ds_label', 'ds_score', 'start', 'end'],
    columns='relation',
    values='target',
).reset_index()

In [None]:
# Show the pivoted rows whose 'publisher' value is missing.
meta.loc[meta['publisher'].isna()]

In [None]:
# Display the pivoted frame as the cell output.
meta

In [None]:
from rapidfuzz import process, fuzz
import re

def make_canon_map(strings, score_cutoff=85):
    """Greedily group similar strings and map each one to its group's key.

    Strings are processed in order. Each string is compared, using
    ``fuzz.token_sort_ratio``, against the keys of the clusters created so
    far; if the best match reaches ``score_cutoff`` the string joins that
    cluster, otherwise it starts a new cluster keyed by itself. The result
    therefore depends on the order of ``strings``.

    Parameters
    ----------
    strings : iterable of str
        Values to cluster.
    score_cutoff : int or float
        Minimum similarity score passed to ``process.extractOne``.

    Returns
    -------
    tuple of (dict, dict)
        ``(canon_of, clusters)`` where ``canon_of`` maps every input string
        to the key of its cluster and ``clusters`` maps each key to the list
        of strings assigned to it.
    """
    clusters = {}
    for candidate in strings:
        best = process.extractOne(
            candidate,
            clusters.keys(),
            scorer=fuzz.token_sort_ratio,
            score_cutoff=score_cutoff,
        )
        if not best:
            # Nothing similar enough yet: the string founds its own cluster.
            clusters[candidate] = [candidate]
            continue
        # best[0] is the matched cluster key.
        clusters[best[0]].append(candidate)

    # Invert the clusters into a member -> key lookup.
    canon_of = {}
    for canon, members in clusters.items():
        for member in members:
            canon_of[member] = canon
    return canon_of, clusters

# standardize_years
def standardize_year_range(s):
    """Normalize a free-text year mention to ``"YYYY"`` or ``"YYYY-YYYY"``.

    Parameters
    ----------
    s : str, number or missing value
        Raw text such as ``"2010"``, ``"2015 to 2012"`` or ``"Mar 98"``.
        Non-string values are converted with ``str`` before matching.

    Returns
    -------
    str or None
        * ``None`` when ``s`` is missing or no year can be recovered.
        * ``"YYYY"`` when exactly one 19xx/20xx year is found.
        * ``"Y1-Y2"`` (ascending) built from the first two 19xx/20xx years
          when two or more are found.
        * Otherwise the year recovered by a fuzzy date parse, if any.
    """
    if pd.isna(s):
        return None

    # Coerce so that numeric cells (e.g. 2010) do not make re.finditer raise.
    text = str(s)
    yrs = [int(m.group(1))
           for m in re.finditer(r'\b((?:19|20)\d{2})\b', text)]
    if len(yrs) == 1:
        return f"{yrs[0]}"
    if len(yrs) >= 2:
        # Only the first two matches are used; they are ordered ascending.
        y1, y2 = sorted(yrs[:2])
        return f"{y1}-{y2}"

    # Fallback: fuzzy date parsing for mentions without a plain 4-digit year.
    # `parse` was previously never imported, so this branch always raised
    # NameError, which the bare `except:` hid; import it explicitly here.
    from datetime import datetime
    try:
        from dateutil.parser import parse
    except ImportError:
        return None

    # Fields missing from the text are filled from `default`. Using year 1 as
    # a sentinel lets us tell "no year in the text" apart from a parsed year,
    # instead of silently reporting the current year.
    no_year = datetime(1, 1, 1)
    try:
        y = parse(text, fuzzy=True, default=no_year).year
    except (ValueError, OverflowError, TypeError):
        # The text does not contain a parsable date.
        return None
    if y == no_year.year:
        return None
    return f"{y}"

# Normalize the raw 'years' relation text into 'YYYY' / 'YYYY-YYYY' strings.
meta['year_range'] = meta['years'].apply(standardize_year_range)

# build maps from the pivoted columns
# Each call clusters the distinct non-null values of one column and returns
# (value -> canonical value, canonical value -> members).
pub_map,  pub_clusters  = make_canon_map(meta['publisher'].dropna().unique(),  score_cutoff=80)
geo_map,  geo_clusters  = make_canon_map(meta['geography'].dropna().unique(),  score_cutoff=80)
abbr_map, abbr_clusters = make_canon_map(meta['abbreviation'].dropna().unique(), score_cutoff=80)
# Years use the strictest cutoff (100) — presumably so that only year strings
# scoring a perfect match are merged; confirm this is the intent.
year_map,      year_clusters  = make_canon_map(meta['year_range'].dropna().unique(),  score_cutoff=100)
ds_map, ds_clusters = make_canon_map(meta['ds_text'].dropna().unique(), score_cutoff=80)
desc_map, desc_clusters = make_canon_map(
    meta['description'].dropna().unique().tolist(),
    score_cutoff=80
)

# Attach the canonical value of every column as a new '*_canon' column.
# Values absent from a map (e.g. missing values) become NaN via Series.map.
meta['description_canon'] = meta['description'].map(desc_map)
meta['publisher_canon']    = meta['publisher'].map(pub_map)
meta['geography_canon']    = meta['geography'].map(geo_map)
meta['abbreviation_canon'] = meta['abbreviation'].map(abbr_map)
meta['year_canon'] = meta['year_range'].map(year_map)
meta['ds_canon'] = meta['ds_text'].map(ds_map)

In [None]:
# Collapse rows that share the same combination of canonical values, keeping
# the row with the highest dataset score for each combination.
reagg = meta.sort_values(by='ds_score', ascending=False)
reagg = reagg.drop_duplicates(
    subset=[
        'ds_canon',
        'publisher_canon',
        'geography_canon',
        'abbreviation_canon',
        'year_canon',
        'description_canon',
    ],
    keep='first',
)
reagg = reagg.reset_index(drop=True)

In [None]:
# Display the de-duplicated frame as the cell output.
reagg

In [None]:
import plotly.express as px

# Hierarchy levels, outermost first; shared by the aggregation and the chart.
drill_levels = [
    'publisher_canon',
    'geography_canon',
    'year_canon',
    'ds_canon',
    'description_canon',
]

# Row count and mean dataset score for every combination of the levels.
counts_all = (
    reagg
    .groupby(drill_levels)
    .agg(size=('ds_score', 'count'), avg_score=('ds_score', 'mean'))
    .reset_index()
)

# Treemap sized by row count and coloured by the mean dataset score.
fig = px.treemap(
    counts_all,
    path=drill_levels,
    values='size',
    color='avg_score',
    color_continuous_scale='Viridis',
    title='Drill‐down: Publisher → Geography → Year → Dataset → Description',
)
fig.show()

# Top Datasets by Geography

In [None]:
import textwrap
import plotly.graph_objects as go

def plot_top_dropdown(
    df,
    group_col: str,
    item_col: str,
    top_n: int = 10,
    menu_x: float = 0.99,
    menu_y: float = 0.99,
    margin: dict = None,
    wrap_width: int = 20,
):
    """
    Show a horizontal bar chart of the most frequent items, one group at a time.

    A dropdown menu switches between groups; each group has its own bar trace
    and only the selected group's trace is visible. The figure is displayed
    with ``fig.show()`` and nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns `group_col` and `item_col`.
    group_col : str
        Column whose values populate the dropdown (e.g. 'year_canon').
    item_col : str
        Column whose values are counted within each group (e.g. 'ds_canon').
    top_n : int
        Maximum number of items shown per group.
    menu_x, menu_y : float
        Dropdown position; the menu is anchored at its top-right corner.
    margin : dict or None
        Plot margins; defaults to {'l':300,'r':50,'t':50,'b':50}.
    wrap_width : int
        Maximum characters per line before a dropdown label is wrapped.
    """
    margin = dict(l=300, r=50, t=50, b=50) if margin is None else margin
    chart_title = f"Top Datasets per {group_col.replace('_canon','')}"

    # Occurrences of every (group, item) pair.
    pair_counts = df.groupby([group_col, item_col]).size().reset_index(name="count")

    # Groups ordered from most to fewest total occurrences; the first one is
    # the group shown initially.
    ordered_groups = (
        pair_counts.groupby(group_col)["count"]
        .sum()
        .sort_values(ascending=False)
        .index
        .tolist()
    )
    n_groups = len(ordered_groups)

    bars = []
    menu_buttons = []
    for pos, grp in enumerate(ordered_groups):
        # Top items of this group, sorted ascending so the largest bar ends
        # up at the top of the horizontal chart.
        rows = pair_counts[pair_counts[group_col] == grp]
        top_rows = rows.nlargest(top_n, "count").sort_values("count", ascending=True)
        bars.append(
            go.Bar(
                x=top_rows["count"],
                y=top_rows[item_col],
                orientation="h",
                name=str(grp),
                visible=(grp == ordered_groups[0]),
            )
        )

        # Selecting this entry shows only the trace at the same position.
        menu_buttons.append({
            "label": "<br>".join(textwrap.wrap(str(grp), wrap_width)),
            "method": "update",
            "args": [
                {"visible": [other == pos for other in range(n_groups)]},
                {"title": chart_title, "margin": margin},
            ],
        })

    fig = go.Figure(data=bars)
    fig.update_layout(
        updatemenus=[{
            "active": 0,
            "buttons": menu_buttons,
            "x": menu_x,
            "xanchor": "right",
            "y": menu_y,
            "yanchor": "top",
            "direction": "down",
        }],
        title=chart_title,
        xaxis_title="Count",
        yaxis_title=item_col.replace("_", " ").title(),
        showlegend=False,
        margin=margin,
    )
    fig.show()

In [None]:
# Top 10 datasets by geography (top_n defaults to 10)
plot_top_dropdown(reagg, "geography_canon", "ds_canon")

# Top Datasets by Publisher

In [None]:
# Top 10 datasets by publisher (top_n defaults to 10)
plot_top_dropdown(reagg, "publisher_canon", "ds_canon")

# Top Datasets by Year

In [None]:
# Top 10 datasets by year (top_n defaults to 10)
plot_top_dropdown(reagg, "year_canon", "ds_canon")