In [1]:
import pandas as pd
from pathlib import Path

# Input file paths: result CSVs that get concatenated into the NSC output
nsc_files = [
    "./result_dup_nsc.csv",
    "./result_new_nsc.csv",
]

# Input file paths: result CSVs that get concatenated into the non-NSC output
non_nsc_files = [
    "./result_dup.csv",
    "./result_new.csv",
]

# Output file paths
out_nsc = Path("./concat_nsc.csv")
out_non_nsc = Path("./concat_non_nsc.csv")

def load_csv_safe(path):
    """Read the CSV at ``path`` into a DataFrame.

    The file is decoded as ``utf-8-sig`` (consistent with the user's previous
    files) and read with ``low_memory=False``.
    """
    read_options = {"encoding": "utf-8-sig", "low_memory": False}
    return pd.read_csv(path, **read_options)

# Load inputs (one DataFrame per listed file, in list order)
dfs_nsc = [load_csv_safe(p) for p in nsc_files]
dfs_non_nsc = [load_csv_safe(p) for p in non_nsc_files]

# Concatenate: rows are stacked and renumbered with a fresh index
concat_nsc = pd.concat(dfs_nsc, ignore_index=True, sort=False)
concat_non_nsc = pd.concat(dfs_non_nsc, ignore_index=True, sort=False)

# Save results (index omitted; same utf-8-sig encoding the inputs are read with)
concat_nsc.to_csv(out_nsc, index=False, encoding="utf-8-sig")
concat_non_nsc.to_csv(out_non_nsc, index=False, encoding="utf-8-sig")


In [None]:
import json
import re
import pandas as pd

# =========================
# 0) File paths
# =========================
non_nsc_path = "./concat_non_nsc.csv"
origin_path  = "./all_origin_updated.csv"

# =========================
# 1) Utility: column picker
# =========================
def pick_column(df, exact_priority=None, contains_priority=None, reason=""):
    """
    Pick a single column from ``df.columns`` following a priority order.

    - exact_priority: names tried first, in order; compared case-insensitively
      for equality against the column labels
    - contains_priority: substrings tried next, in order; the first column (in
      frame order) whose lower-cased label contains the substring wins
    - reason: free-text tag that only appears in the log line printed on a hit

    Returns the original column label, or None when nothing matches.
    """
    # str() keeps non-string labels (e.g. integer headers) from raising on .lower().
    cols_lower = {str(c).lower(): c for c in df.columns}

    # exact match (candidates are lower-cased so mixed-case entries can match too)
    for cand in exact_priority or ():
        key = str(cand).lower()
        if key in cols_lower:
            print(f"[Select:{reason}] exact -> '{cols_lower[key]}'")
            return cols_lower[key]

    # contains match
    for partial in contains_priority or ():
        needle = str(partial).lower()
        for c in df.columns:
            if needle in str(c).lower():
                print(f"[Select:{reason}] contains('{partial}') -> '{c}'")
                return c
    return None

def norm_str(x):
    """Return ``x`` as a whitespace-stripped string, or None when ``pd.isna(x)`` is true."""
    if pd.isna(x):
        return None
    text = str(x)
    return text.strip()

# =========================
# 2) Load & basic cleanup
# =========================
non_nsc = pd.read_csv(non_nsc_path, encoding="utf-8-sig", low_memory=False)
origin  = pd.read_csv(origin_path,  encoding="utf-8-sig", low_memory=False)

for df in (non_nsc, origin):
    if "ticket_id_hashed" in df.columns:
        raw_ids = df["ticket_id_hashed"]
        stripped_ids = raw_ids.astype(str).str.strip()
        # astype(str) alone turns missing ids into the literal text "nan", which
        # dropna() can no longer remove and which would then act as a real key.
        # Keep the stripped text only where an id was actually present.
        df["ticket_id_hashed"] = stripped_ids.where(raw_ids.notna())

print("[Info] origin columns:", list(origin.columns))

# =========================
# 3) Auto-detect the 'keyword' column in origin
#    - priority: exact > contains('keyword') > contains('answer')
# =========================
# Exact names, tried in this order (all lower-case).
keyword_exact_priority    = [
    "keyword", "keywords", "keyword_from_origin", "answer_keyword_main",
    "answer_keyword", "final_keyword"
]
# Substrings tried only when no exact name matched.
keyword_contains_priority = ["keyword", "answer"]

ORIGIN_KEYWORD_COL = pick_column(
    origin,
    exact_priority=keyword_exact_priority,
    contains_priority=keyword_contains_priority,
    reason="keyword"
)

# Without a keyword column nothing downstream can work, so fail loudly here.
if ORIGIN_KEYWORD_COL is None:
    raise KeyError(
        "키워드 컬럼을 찾지 못했습니다. origin에 다음 중 하나의 컬럼이 필요합니다: "
        f"exact={keyword_exact_priority}, contains any of {keyword_contains_priority}. "
        f"현재 컬럼: {list(origin.columns)}"
    )

# =========================
# 4) Choose the JOIN key: ticket_id_hashed by default
#    - fall back to ticket_id when it is absent (the key must exist in both
#      non_nsc and origin)
# =========================
JOIN_KEY = None
if "ticket_id_hashed" in non_nsc.columns and "ticket_id_hashed" in origin.columns:
    JOIN_KEY = "ticket_id_hashed"
elif "ticket_id" in non_nsc.columns and "ticket_id" in origin.columns:
    JOIN_KEY = "ticket_id"
else:
    # Processing can still continue (the later final_top2 mapping goes through
    # dictionaries), but the merge is skipped and a warning is printed.
    print("[Warn] 공통 JOIN 키가 없어 병합을 스킵합니다. "
          "final_top2 기반 sim_keyword_* 생성만 진행합니다.")

# =========================
# 5) Attach the origin keyword to non_nsc (LEFT JOIN)
# =========================
# Work on a copy so non_nsc itself is left as loaded.
merged = non_nsc.copy()

if JOIN_KEY is not None:
    # One keyword per key: rows without a key are dropped and, for repeated
    # keys, only the first row is kept, so the left join cannot add rows.
    origin_kw = (
        origin.loc[:, [JOIN_KEY, ORIGIN_KEYWORD_COL]]
              .dropna(subset=[JOIN_KEY])
              .drop_duplicates(subset=[JOIN_KEY], keep="first")
              .rename(columns={ORIGIN_KEYWORD_COL: "Keyword_from_origin"})
    )
    merged = merged.merge(origin_kw, on=JOIN_KEY, how="left")
    # Counts rows whose attached keyword is non-null.
    matched = merged["Keyword_from_origin"].notna().sum()
    print(f"[Join] '{JOIN_KEY}'로 Keyword_from_origin 부착 → {matched}/{len(merged)} rows matched")
else:
    print("[Join] 스킵(공통 키 없음)")

# =========================
# 6) origin keyword lookup dictionaries (both hashed and plain ids)
# =========================
# Maps normalized ticket_id_hashed -> keyword; the first occurrence of an id wins.
kw_by_hashed = {}
if "ticket_id_hashed" in origin.columns:
    tmp = origin[[c for c in ["ticket_id_hashed", ORIGIN_KEYWORD_COL] if c in origin.columns]]
    tmp = tmp.dropna(subset=["ticket_id_hashed"])
    for k, v in zip(tmp["ticket_id_hashed"].map(norm_str), tmp[ORIGIN_KEYWORD_COL]):
        # Skip empty keys and keep the keyword already stored for a repeated id.
        if k and k not in kw_by_hashed:
            kw_by_hashed[k] = v

# Same construction keyed by the plain ticket_id; stays empty when origin has no such column.
kw_by_plain = {}
if "ticket_id" in origin.columns:
    tmp = origin[[c for c in ["ticket_id", ORIGIN_KEYWORD_COL] if c in origin.columns]]
    tmp = tmp.dropna(subset=["ticket_id"])
    for k, v in zip(tmp["ticket_id"].map(norm_str), tmp[ORIGIN_KEYWORD_COL]):
        if k and k not in kw_by_plain:
            kw_by_plain[k] = v

# =========================
# 7) final_top2 parser & id -> keyword mapping
# =========================
def parse_final_top2(value):
    """Extract up to two ticket ids ``[id1, id2]`` from ``value``.

    ``value`` may be a list, a dict, or a string. Strings are parsed as JSON
    first and, when that fails, as a Python literal (``"['a', 'b']"`` /
    ``"{'k': ...}"``, the form pandas writes when a list or dict is saved to
    CSV). Text that is neither is scanned for runs of six or more ASCII
    letters/digits.

    Returns a list of exactly two entries; missing positions are None. A None
    or NaN ``value`` yields ``[None, None]``.
    """
    import ast  # local import: only needed for the Python-literal fallback

    id_keys = ("ticket_id", "ticket_id_hashed", "id")
    token_pattern = r"[A-Za-z0-9]{6,}"

    def _clean(item):
        """Scalar -> stripped string; missing values and nested containers -> None."""
        if item is None or isinstance(item, (list, tuple, set, dict)):
            return None
        return None if pd.isna(item) else str(item).strip()

    def _collect(seq, out):
        """Append ids found in ``seq`` to ``out``, stopping once two are held."""
        for it in seq:
            if isinstance(it, dict):
                # First id-like key wins; dicts without one are skipped entirely.
                for key in id_keys:
                    if key in it and it[key] is not None:
                        out.append(_clean(it[key]))
                        break
            else:
                out.append(_clean(it))
            if len(out) >= 2:
                break

    def _tokens(text):
        """First two 6+ character alphanumeric tokens of ``text``, padded with None."""
        found = re.findall(token_pattern, str(text))[:2]
        return found + [None] * (2 - len(found))

    if value is None or (isinstance(value, float) and pd.isna(value)):
        return [None, None]

    obj = value
    if isinstance(value, str):
        s = value.strip()
        try:
            obj = json.loads(s)
        except (ValueError, RecursionError):
            obj = s
            # Single-quoted Python reprs are not JSON. Without this fallback the
            # token scan below would pick up key fragments such as "ticket".
            try:
                literal = ast.literal_eval(s)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                literal = None
            if isinstance(literal, (list, dict)):
                obj = literal
            elif isinstance(literal, tuple):
                obj = list(literal)

    if isinstance(obj, list):
        items = obj
    elif isinstance(obj, dict):
        # Look under the common container keys; fall back to the dict itself.
        items = obj
        for k in ("final_top2", "top2", "final", "result"):
            if k in obj:
                items = obj[k]
                if isinstance(items, dict):
                    if "top2" in items:
                        items = items["top2"]
                    elif "final_top2" in items:
                        items = items["final_top2"]
                break
    else:
        return _tokens(obj)

    ids = []
    if isinstance(items, list):
        _collect(items, ids)
    elif isinstance(items, dict):
        # Ids stored directly on the dict come first ...
        for key in id_keys:
            if key in items and items[key] is not None:
                ids.append(_clean(items[key]))
        # ... then any list-valued entries are scanned until two ids are held.
        if len(ids) < 2:
            for v in items.values():
                if isinstance(v, list):
                    _collect(v, ids)
                if len(ids) >= 2:
                    break
    else:
        ids = re.findall(token_pattern, str(items))[:2]

    ids.extend([None] * (2 - len(ids)))
    return ids[:2]

def id_to_keyword(tid):
    """Look up the keyword for ticket id ``tid``.

    The hashed-id dictionary is consulted before the plain-id one. Returns
    None for a falsy ``tid`` or when neither dictionary holds it.
    """
    if not tid:
        return None
    # hashed first, plain as the fallback
    for lookup in (kw_by_hashed, kw_by_plain):
        if tid in lookup:
            return lookup[tid]
    return None

def top2_to_keywords(val):
    """Turn a final_top2 cell into a two-entry Series of keywords.

    The ids parsed from ``val`` are mapped through ``id_to_keyword`` and
    returned under the labels ``sim_keyword_1`` and ``sim_keyword_2``.
    """
    first_id, second_id = parse_final_top2(val)
    keywords = [id_to_keyword(first_id), id_to_keyword(second_id)]
    return pd.Series(keywords, index=["sim_keyword_1", "sim_keyword_2"])

# =========================
# 8) Create sim_keyword_1/2
#    - priority: 'final_top2' column > 'final_result_raw' column
# =========================
# top2_to_keywords returns a two-entry Series per row, so apply() yields a
# two-column frame that is assigned to both target columns at once.
if "final_top2" in merged.columns:
    merged[["sim_keyword_1", "sim_keyword_2"]] = merged["final_top2"].apply(top2_to_keywords)
elif "final_result_raw" in merged.columns:
    merged[["sim_keyword_1", "sim_keyword_2"]] = merged["final_result_raw"].apply(top2_to_keywords)
else:
    # Neither source column exists: the sim_keyword_* columns are not created.
    print("[Warn] 'final_top2'도 'final_result_raw'도 없어 sim_keyword_* 생성 불가")

# =========================
# 9) Keyword NaN audit (intermediate printout + CSV save)
# =========================
# Every column whose name contains "keyword" (case-insensitive) is audited.
keyword_cols = [c for c in merged.columns if re.search(r'keyword', c, flags=re.I)]
if not keyword_cols:
    print("[Info] 'keyword' 포함 컬럼 없음")
else:
    print(f"[Check] 키워드 관련 컬럼: {keyword_cols}")

    # Per-column counts of present / missing values and the missing share in percent.
    stats = pd.DataFrame({
        "non_null": merged[keyword_cols].notna().sum(),
        "null": merged[keyword_cols].isna().sum()
    })
    stats["null_rate(%)"] = (stats["null"] / len(merged) * 100).round(2)
    print("\n[Keyword NaN 통계]\n", stats)

    # Row-level flag: True when at least one audited column is missing.
    merged["is_missing_any_keyword"] = merged[keyword_cols].isna().any(axis=1)

    def _missing_cols(row):
        """Return the audited column names that are missing in ``row``, joined by '|', or None."""
        miss = [col for col in keyword_cols if pd.isna(row[col])]
        return "|".join(miss) if miss else None

    merged["missing_keyword_cols"] = merged.apply(_missing_cols, axis=1)

    any_missing = merged["is_missing_any_keyword"].sum()
    print(f"\n[Rows with any keyword NaN] {any_missing} / {len(merged)} "
          f"({any_missing/len(merged):.2%})")

    if any_missing:
        # Show the id columns that exist plus all audited columns for the first 10 flagged rows.
        cols_to_show = [c for c in ["ticket_id_hashed", "ticket_id"] if c in merged.columns] + keyword_cols
        print("\n[미싱 샘플 상위 10건]")
        print(merged.loc[merged["is_missing_any_keyword"], cols_to_show]
                    .head(10)
                    .to_string(index=False))

    # The audit CSV holds only the flagged rows; it is written even when no row is flagged.
    audit_path = "./keyword_nan_audit.csv"
    merged.loc[merged["is_missing_any_keyword"],
               [c for c in ["ticket_id_hashed", "ticket_id", "missing_keyword_cols"] if c in merged.columns]
               + keyword_cols] \
          .to_csv(audit_path, index=False, encoding="utf-8-sig")
    print(f"\n[Saved audit] {audit_path}")

# =========================
# 10) Final save
# =========================
# Writes the merged frame, including any audit helper columns added above.
out_path2 = "./concat_non_nsc_plus_keyword_with_top2_keywords.csv"
merged.to_csv(out_path2, index=False, encoding="utf-8-sig")
print(f"Saved -> {out_path2}")

[Info] origin columns: ['ticket_id_hashed', 'Privacy_Detect_Col', 'components', 'keyword', '사업부', '지역', 'language', '대분류', '중분류', '소분류', 'beforechange', 'afterchange', 'labels', 'environment', 'created_date_yyyymm', 'thinQmodel', 'salesmodel', 'generated_summary', 'generated_translation', 'merge_key', 'generated_response']
[Select:keyword] exact -> 'keyword'
[Join] 'ticket_id_hashed'로 Keyword_from_origin 부착 → 200/200 rows matched
[Warn] 'final_top2'도 'final_result_raw'도 없어 sim_keyword_* 생성 불가
[Check] 키워드 관련 컬럼: ['sim_keyword', 'final_keyword', 'Keyword_from_origin']

[Keyword NaN 통계]
                      null  null_rate(%)
sim_keyword            10           5.0
final_keyword           0           0.0
Keyword_from_origin     0           0.0

[Rows with any keyword NaN] 10 / 200 (5.00%)

[미싱 샘플 상위 10건]
ticket_id_hashed sim_keyword final_keyword Keyword_from_origin
         2597c3b         NaN        세탁건조통합              원격기능제어
         565d6e1         NaN          건조효율               계절별

In [None]:
# Fix: extract ticket_ids from final_result.final_top2 and map to keywords from all_origin_updated
import pandas as pd
import json, re
import numpy as np
from itertools import islice

# Inputs: the concatenated NSC results and the origin table that supplies keywords
nsc_path = "./concat_nsc.csv"
origin_path = "./all_origin_updated.csv"

nsc = pd.read_csv(nsc_path, encoding="utf-8-sig", low_memory=False)
origin = pd.read_csv(origin_path, encoding="utf-8-sig", low_memory=False)

def norm(s):
    """Return ``s`` as a stripped string; missing values (per ``pd.isna``) become None."""
    return None if pd.isna(s) else str(s).strip()

# --- Build keyword lookup (hashed + plain) ---
# Pick the keyword column: a column named exactly "keyword" (case-insensitive)
# is preferred, otherwise the first column whose name contains "keyword".
kw_col = None
for c in origin.columns:
    if c.lower() == "keyword":
        kw_col = c; break
if kw_col is None:
    for c in origin.columns:
        if "keyword" in c.lower():
            kw_col = c; break
if kw_col is None:
    raise KeyError(f"Keyword column not found in origin: {list(origin.columns)}")

# Normalized ticket_id_hashed -> keyword; the first occurrence of an id wins.
kw_by_hashed = {}
if "ticket_id_hashed" in origin.columns:
    tmp = origin[["ticket_id_hashed", kw_col]].dropna(subset=["ticket_id_hashed"])
    for k, v in zip(tmp["ticket_id_hashed"].map(norm), tmp[kw_col]):
        if k and k not in kw_by_hashed:
            kw_by_hashed[k] = v

# Same construction keyed by the plain ticket_id; stays empty when origin has no such column.
kw_by_plain = {}
if "ticket_id" in origin.columns:
    tmp = origin[["ticket_id", kw_col]].dropna(subset=["ticket_id"])
    for k, v in zip(tmp["ticket_id"].map(norm), tmp[kw_col]):
        if k and k not in kw_by_plain:
            kw_by_plain[k] = v

def id_to_keyword(tid):
    """Return the keyword stored for ``tid``, or None.

    A falsy ``tid`` short-circuits to None. Otherwise the hashed-id dictionary
    is checked first and the plain-id dictionary second.
    """
    if not tid:
        return None
    for table in (kw_by_hashed, kw_by_plain):  # hashed first
        if tid in table:
            return table[tid]
    return None

# --- Robust extractor from final_result ---
def safe_json_loads(val):
    """Best-effort decoding of a ``final_result`` cell.

    Returns
    -------
    None
        for None, NaN, or text that is empty after stripping.
    dict | list
        when ``val`` already is one, or when its text parses to one.
    object
        whatever ``json.loads`` yields for other valid JSON (number, string, ...).
    str
        the stripped text when it is neither JSON nor a Python container literal.

    JSON is tried first. If that fails, the text is tried as a Python literal,
    because a dict or list written to CSV through ``str()`` uses single quotes
    and is not valid JSON. Only container literals are accepted from that
    fallback (a tuple is returned as a list).
    """
    import ast  # local import: only needed for the Python-literal fallback

    if val is None or (isinstance(val, float) and pd.isna(val)):
        return None
    if isinstance(val, (dict, list)):
        return val
    s = str(val).strip()
    if not s:
        return None
    try:
        return json.loads(s)
    except (ValueError, RecursionError):
        pass
    try:
        literal = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return s
    if isinstance(literal, (dict, list)):
        return literal
    if isinstance(literal, tuple):
        return list(literal)
    return s

def extract_first_token(value_str):
    """Return the first run of six or more ASCII letters/digits in ``str(value_str)``.

    None is returned when ``value_str`` is None or no such run exists.
    """
    if value_str is None:
        return None
    hit = re.search(r"[A-Za-z0-9]{6,}", str(value_str))
    return hit.group(0) if hit else None

def extract_id_from_item(it):
    """Pull one id token out of a candidate item.

    None yields None. Non-dict items are scanned directly for a token. For a
    dict the search order is: well-known id keys on the dict, then the same
    kind of keys inside a ``metadata`` / ``source`` / ``context`` entry (or a
    token of that entry itself when it is not a dict), and finally a token
    from the JSON dump of the whole dict.
    """
    if it is None:
        return None
    if not isinstance(it, dict):
        # string-like item
        return extract_first_token(it)

    # keys that might contain the id
    direct_keys = ("ticket_id_hashed", "ticket_id", "id", "doc_id", "row_id", "source_id", "hash", "uuid")
    for name in direct_keys:
        direct = it.get(name)
        if direct is not None:
            return extract_first_token(direct)

    # nested metadata-like
    for name in ("metadata", "source", "context"):
        nested = it.get(name)
        if nested is None:
            continue
        if not isinstance(nested, dict):
            return extract_first_token(nested)
        for inner in ("ticket_id_hashed", "ticket_id", "id"):
            inner_value = nested.get(inner)
            if inner_value is not None:
                return extract_first_token(inner_value)

    # fallback: any 6+ alnum token inside dict
    return extract_first_token(json.dumps(it, ensure_ascii=False))

def parse_final_top2_from_final_result(val):
    """Extract up to two candidate ids from a ``final_result`` cell.

    ``val`` is decoded with ``safe_json_loads``. A candidate list is then
    located (see the inline comments), optionally reordered by score, and each
    of the first two candidates is reduced to an id with
    ``extract_id_from_item``.

    Returns a list of exactly two entries, padded with None. When no candidate
    list is found, at most one token is returned (``[token, None]``).
    """
    obj = safe_json_loads(val)
    if obj is None:
        return [None, None]

    items = None
    if isinstance(obj, dict):
        # common containers, especially "final_top2"; only non-empty lists qualify
        for key in ("final_top2", "top2", "candidates", "items", "docs", "matches", "result"):
            if key in obj and isinstance(obj[key], list) and len(obj[key]) > 0:
                items = obj[key]
                break
        if items is None:
            # otherwise take the first non-empty list among the top-level values
            # NOTE(review): nested dicts are not descended into here; only
            # top-level list values are considered.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0:
                    items = v
                    break
            if items is None:
                # fall back: scan tokens in the JSON dump of the whole dict
                token = extract_first_token(json.dumps(obj, ensure_ascii=False))
                return [token, None]
    elif isinstance(obj, list):
        items = obj
    else:
        # string → tokens
        token = extract_first_token(obj)
        return [token, None]

    # sort by score if available
    def score_of(it):
        """Return the first score-like field of ``it`` that converts to float, else None."""
        if not isinstance(it, dict):
            return None
        for k in ("score", "similarity", "cosine", "sim", "confidence"):
            if k in it:
                try:
                    return float(it[k])
                except Exception:
                    # unparseable value: try the next score-like key
                    pass
        return None

    ordered = items
    # Reordering is only attempted when the first item is a dict.
    if items and isinstance(items[0], dict):
        with_sc, without_sc = [], []
        for it in items:
            sc = score_of(it)
            (with_sc if sc is not None else without_sc).append((sc, it))
        if with_sc:
            # Scored items first (highest score first), then unscored items in
            # their original order. With no scored item the order is untouched.
            with_sc.sort(key=lambda x: x[0], reverse=True)
            ordered = [it for sc, it in with_sc] + [it for sc, it in without_sc]

    ids = []
    for it in ordered:
        # An item that yields no id still occupies one of the two slots (as None).
        ids.append(extract_id_from_item(it))
        if len(ids) >= 2:
            break
    while len(ids) < 2:
        ids.append(None)
    return ids[:2]

# --- Apply to NSC df ---
if "final_result" not in nsc.columns:
    raise KeyError("final_result column not found in concat_nsc.csv")

# Build both id columns in one DataFrame construction. Compared with
# `.apply(...).apply(pd.Series)` this avoids creating a Series per row and still
# yields the two named columns when nsc has no rows.
ids_df = pd.DataFrame(
    nsc["final_result"].map(parse_final_top2_from_final_result).tolist(),
    columns=["top2_id_1", "top2_id_2"],
    index=nsc.index,
)

nsc_aug = pd.concat([nsc, ids_df], axis=1)
nsc_aug["sim_keyword_1"] = nsc_aug["top2_id_1"].map(id_to_keyword)
nsc_aug["sim_keyword_2"] = nsc_aug["top2_id_2"].map(id_to_keyword)

# attach Keyword_from_origin via join key if present
if "ticket_id_hashed" in nsc_aug.columns and "ticket_id_hashed" in origin.columns:
    # one keyword per id (first occurrence) so the left join cannot add rows
    origin_simple = (
        origin[["ticket_id_hashed", kw_col]]
        .drop_duplicates("ticket_id_hashed")
        .rename(columns={kw_col: "Keyword_from_origin"})
    )
    nsc_aug = nsc_aug.merge(origin_simple, on="ticket_id_hashed", how="left")

# Save
out_path = "./concat_nsc_plus_keyword_with_top2_keywords_fixed.csv"
nsc_aug.to_csv(out_path, index=False, encoding="utf-8-sig")

# Summaries to help debug
summary = {
    "rows": len(nsc_aug),
    "non_null_sim_keyword_1": int(nsc_aug["sim_keyword_1"].notna().sum()),
    "non_null_sim_keyword_2": int(nsc_aug["sim_keyword_2"].notna().sum())
}
# Last expression of the cell, so the notebook actually displays the summary.
summary


[Keyword NaN 통계]
                      non_null  null  null_rate(%)
sim_keyword               190    10           5.0
final_keyword             200     0           0.0
Keyword_from_origin       200     0           0.0
sim_keyword_1               0   200         100.0
sim_keyword_2               0   200         100.0
Saved -> ./concat_nsc_plus_keyword_with_top2_keywords.csv


  ).applymap(id_to_keyword)


In [None]:
# Threshold-based match analysis for NSC & Non-NSC using sim_score
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import caas_jupyter_tools

# ---------- Helper functions ----------
def norm(s):
    """Return ``s`` stripped and lower-cased as a string; missing values become None."""
    return None if pd.isna(s) else str(s).strip().lower()

def pick_col(df, target_lower, contains=False):
    """Find a column of ``df`` by lower-cased name.

    The first column whose lower-cased name equals ``target_lower`` is
    returned. If there is none and ``contains`` is true, the first column
    whose lower-cased name contains ``target_lower`` is returned instead.
    None is returned when nothing matches.
    """
    # exact match first
    exact = next((c for c in df.columns if c.lower() == target_lower), None)
    if exact is not None or not contains:
        return exact
    return next((c for c in df.columns if target_lower in c.lower()), None)

def ensure_numeric(series):
    """Convert ``series`` to numbers; values that cannot be parsed become NaN."""
    coerced = pd.to_numeric(series, errors="coerce")
    return coerced

def compute_stats(df, thresholds_decimals):
    """Count exact keyword matches among rows at or above each score threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns that ``pick_col`` can resolve for ``keyword_from_origin``,
        ``sim_keyword_1``, ``sim_keyword_2`` and a score column (``sim_score``,
        or a name containing ``simscore`` or ``similarity``).
    thresholds_decimals : iterable of float
        Thresholds written as fractions (e.g. ``0.95``). They are multiplied by
        100 when the largest score in ``df`` exceeds 1.5.

    Returns
    -------
    pandas.DataFrame
        One row per threshold: a ``threshold`` label, ``total_rows`` and, for
        sim1 matches, sim2 matches, and "sim1 missed but sim2 matched", a count
        and a percentage of ``total_rows`` (NaN when no row qualifies).

    Raises
    ------
    KeyError
        When a required keyword column or the score column is not found.
    """
    thresholds_decimals = list(thresholds_decimals)  # iterated twice below

    # column picks (pick_col with contains=True already tries the exact name first)
    col_origin = pick_col(df, "keyword_from_origin", contains=True)
    col_sim1   = pick_col(df, "sim_keyword_1", contains=True)
    col_sim2   = pick_col(df, "sim_keyword_2", contains=True)
    col_simscore = (
        pick_col(df, "sim_score")
        or pick_col(df, "simscore", contains=True)
        or pick_col(df, "similarity", contains=True)
    )
    if col_origin is None or col_sim1 is None or col_sim2 is None:
        raise KeyError("Required keyword columns not found in dataframe.")
    if col_simscore is None:
        raise KeyError("sim_score (or similar) column not found in dataframe.")

    # normalize strings on a copy so the caller's frame is not modified
    df = df.copy()
    df["_origin_n"] = df[col_origin].apply(norm)
    df["_sim1_n"]   = df[col_sim1].apply(norm)
    df["_sim2_n"]   = df[col_sim2].apply(norm)
    df["_sim_score"] = ensure_numeric(df[col_simscore])

    # detect score scale (0-1 vs 0-100)
    max_score = df["_sim_score"].max(skipna=True)
    scale = 100.0 if (pd.notna(max_score) and max_score > 1.5) else 1.0
    thresholds = [t * scale for t in thresholds_decimals]

    # precompute exact flags; a missing value on either side never counts as a match
    df["_match1"] = (df["_origin_n"].notna() & df["_sim1_n"].notna() & (df["_origin_n"] == df["_sim1_n"]))
    df["_match2"] = (df["_origin_n"].notna() & df["_sim2_n"].notna() & (df["_origin_n"] == df["_sim2_n"]))

    def _label(t_raw, t_eff):
        # One formatter for every row. ":g" prints a float product such as
        # 56.99999999999999 as "57", where int() would truncate it to 56.
        return f">= {t_eff:g}" if scale == 100 else f">= {t_raw:.2f}"

    def _rate(count, total):
        # Percentage of qualifying rows; undefined (NaN) when there are none.
        return round(count / total * 100, 2) if total else np.nan

    rows = []
    for t_raw, t_eff in zip(thresholds_decimals, thresholds):
        sub = df[df["_sim_score"] >= t_eff]
        total = len(sub)
        m1 = int(sub["_match1"].sum())
        m2 = int(sub["_match2"].sum())
        m1_false_m2_true = int(((~sub["_match1"]) & sub["_match2"]).sum())

        rows.append({
            "threshold": _label(t_raw, t_eff),
            "total_rows": int(total),
            "match_sim1_count": m1, "match_sim1_rate(%)": _rate(m1, total),
            "match_sim2_count": m2, "match_sim2_rate(%)": _rate(m2, total),
            "sim1_false_sim2_true_count": m1_false_m2_true,
            "sim1_false_sim2_true_rate(%)": _rate(m1_false_m2_true, total),
        })
    return pd.DataFrame(rows)

# ---------- Load files ----------
# These are the CSVs written by the earlier non-NSC and NSC cells.
non_nsc_path = "./concat_non_nsc_plus_keyword_with_top2_keywords.csv"
nsc_path     = "./concat_nsc_plus_keyword_with_top2_keywords_fixed.csv"

df_non = pd.read_csv(non_nsc_path, encoding="utf-8-sig", low_memory=False)
df_nsc = pd.read_csv(nsc_path,     encoding="utf-8-sig", low_memory=False)

# ---------- Thresholds (decimals) ----------
# Written as fractions; compute_stats rescales them when the scores look like 0-100.
thresholds_decimals = [0.95, 0.90, 0.85, 0.80, 0.75, 0.70]

# ---------- Compute ----------
# compute_stats raises KeyError if a frame lacks the keyword or score columns.
stats_non = compute_stats(df_non, thresholds_decimals)
stats_nsc = compute_stats(df_nsc, thresholds_decimals)

# save & display
out_non = "./match_stats_non_nsc_by_threshold.csv"
out_nsc = "./match_stats_nsc_by_threshold.csv"
stats_non.to_csv(out_non, index=False, encoding="utf-8-sig")
stats_nsc.to_csv(out_nsc, index=False, encoding="utf-8-sig")

caas_jupyter_tools.display_dataframe_to_user("Non-NSC — Threshold match stats (sim_score)", stats_non)
caas_jupyter_tools.display_dataframe_to_user("NSC — Threshold match stats (sim_score)", stats_nsc)

# Last expression of the cell: shows the two output paths.
out_non, out_nsc


 Threshold ≥  Rows  kw_vs_answer_exact (cnt)  kw_vs_answer_exact (%)  kw_vs_final_exact (cnt)  kw_vs_final_exact (%)  kw_vs_answer_cossim (avg)  kw_vs_final_cossim (avg)  sim_summary_gen_1 (avg)  sim_summary_gen_2 (avg)
          90    10                         9                   90.00                        9                  90.00                      0.923                     0.923                    0.888                      1.0
          85    32                        25                   78.12                       10                  31.25                      0.899                     0.711                    0.854                      1.0
          80    62                        44                   70.97                       10                  16.13                      0.878                     0.650                    0.812                      1.0
          75    82                        58                   70.73                       10                  12.20    

In [4]:
thr_df

Unnamed: 0,Threshold ≥,Rows,kw_vs_answer_exact (cnt),kw_vs_answer_exact (%),kw_vs_final_exact (cnt),kw_vs_final_exact (%),kw_vs_answer_cossim (avg),kw_vs_final_cossim (avg),sim_summary_gen_1 (avg),sim_summary_gen_2 (avg)
0,90,10,9,90.0,9,90.0,0.923,0.923,0.888,1.0
1,85,32,25,78.12,10,31.25,0.899,0.711,0.854,1.0
2,80,62,44,70.97,10,16.13,0.878,0.65,0.812,1.0
3,75,82,58,70.73,10,12.2,0.871,0.641,0.783,1.0
4,70,92,66,71.74,10,10.87,0.876,0.634,0.765,1.0
