In [54]:
import os
import pandas as pd

In [55]:
# Root of the mounted data directory. Falls back to "" when MNT_PATH is unset
# or empty, in which case the joined paths below are relative.
MNT_PATH = os.environ.get("MNT_PATH") or ""

# --- Inputs located under MNT_PATH ---
trans_path = os.path.join(MNT_PATH, "complete_data", "real_and_fake_w_summary.csv")
# The summary text is read from the same CSV as the transcripts.
summary_text_path = trans_path
_screening_dir = os.path.join(MNT_PATH, "screening")
screening_text_path = os.path.join(_screening_dir, "screening_results_long.json")
screening_pred_path = os.path.join(_screening_dir, "final_classification_details_long.csv")

# --- Inputs at fixed absolute locations (not derived from MNT_PATH) ---
baseline_path = "/home/wake/projects/proposal/local_llm/evaluation/baseline_long/results/calm3_hf/baseline_strict_results.csv"
summary_pred_path = "/home/wake/projects/proposal/local_llm/evaluation/summary_long/results/calm3_hf/summary_results.csv"
rag_path = "/home/wake/mnt/results_run_japanese/collapsed_strict_generation_inputs.csv"

In [56]:
# Load every input table. The CSVs are read with dtype=str so identifiers and
# labels stay exactly as the text found in the files.
trans_df = pd.read_csv(trans_path, dtype=str, encoding="utf-8")
baseline_df = pd.read_csv(baseline_path, dtype=str, encoding="utf-8")
summary_text_df = pd.read_csv(summary_text_path, dtype=str, encoding="utf-8")
summary_pred_df = pd.read_csv(summary_pred_path, dtype=str, encoding="utf-8")
rag_df = pd.read_csv(rag_path, dtype=str, encoding="utf-8")
# read_json infers dtypes by default, so a numeric-looking video_id would be
# coerced to an integer (dropping leading zeros and no longer matching the
# string video_id of the CSV frames in the later merge). Pin video_id to str;
# the remaining columns keep pandas' default inference.
screening_text_df = pd.read_json(
    screening_text_path,
    encoding="utf-8",
    dtype={"video_id": str},
)
screening_pred_df = pd.read_csv(screening_pred_path, dtype=str, encoding="utf-8")

In [57]:
# Narrow every frame to the columns used by the rest of the notebook.
# Text-bearing frames:
trans_df = trans_df.loc[:, ["video_id", "transcript"]]
summary_text_df = summary_text_df.loc[:, ["video_id", "summary"]]
screening_text_df = screening_text_df.loc[:, ["video_id", "candidates"]]

# Prediction frames (each source uses its own label/prediction column names):
baseline_df = baseline_df.loc[:, ["video_id", "true_label", "prediction"]]
summary_pred_df = summary_pred_df.loc[:, ["video_id", "true_label", "prediction"]]
screening_pred_df = screening_pred_df.loc[:, ["video_id", "ground_truth", "prediction"]]

# The RAG file carries both the prediction and its context text; keep a
# text-only view alongside the full frame.
rag_df = rag_df.loc[:, ["video_id", "label", "pred", "context_text"]]
rag_text_df = rag_df.loc[:, ["video_id", "context_text"]]

In [58]:
# Build one frame per method holding its text, its prediction columns and the
# transcript. Inner joins on video_id keep only ids present in every frame
# being combined.
summary_df = (
    summary_text_df
    .merge(summary_pred_df, on="video_id", how="inner")
    .merge(trans_df, on="video_id", how="inner")
)

# rag_df is rebuilt from itself, so re-running this cell would otherwise merge
# the transcript a second time (producing transcript_x / transcript_y and
# breaking later lookups of "transcript"). Selecting the pre-merge columns
# first keeps the cell safe to re-run; on a first run the result is unchanged.
rag_df = (
    rag_df[["video_id", "label", "pred", "context_text"]]
    .merge(trans_df, on="video_id", how="inner")
)

screening_df = (
    screening_text_df
    .merge(screening_pred_df, on="video_id", how="inner")
    .merge(trans_df, on="video_id", how="inner")
)

In [59]:
# Row counts, one per line, in the order: baseline, summary, RAG, screening.
print(
    *(len(frame) for frame in (baseline_df, summary_df, rag_df, screening_df)),
    sep="\n",
)

74
74
74
74


In [60]:
# Videos whose true label is "fake" and which the baseline also predicted as
# "fake"; their ids define the subset examined for the other methods.
baseline_condition = (
    baseline_df["true_label"].eq("fake")
    & baseline_df["prediction"].eq("fake")
)
target_video_ids = baseline_df.loc[baseline_condition, "video_id"]

In [61]:
# False-negative masks per method: the true label is "fake", the method
# predicted "real", and the video id is among target_video_ids.
summary_condition = (
    summary_df["true_label"].eq("fake")
    & summary_df["prediction"].eq("real")
    & summary_df["video_id"].isin(target_video_ids)
)

# The RAG frame names its columns "label" / "pred".
rag_condition = (
    rag_df["label"].eq("fake")
    & rag_df["pred"].eq("real")
    & rag_df["video_id"].isin(target_video_ids)
)

# The screening frame names its truth column "ground_truth".
screening_condition = (
    screening_df["ground_truth"].eq("fake")
    & screening_df["prediction"].eq("real")
    & screening_df["video_id"].isin(target_video_ids)
)

In [62]:
# Directory that receives one CSV of input text per method.
text_dir = os.path.join(MNT_PATH, "text_length")
os.makedirs(text_dir, exist_ok=True)

# Baseline: the transcript frame.
baseline_text = os.path.join(text_dir, "baseline.csv")
trans_df.to_csv(baseline_text, index=False, encoding="utf-8")

# Summary: the summary text frame.
summary_text = os.path.join(text_dir, "summary.csv")
summary_text_df.to_csv(summary_text, index=False, encoding="utf-8")

# RAG: the context text frame.
rag_text = os.path.join(text_dir, "rag.csv")
rag_text_df.to_csv(rag_text, index=False, encoding="utf-8")

# Screening: the candidates frame.
screening_text = os.path.join(text_dir, "screening.csv")
screening_text_df.to_csv(screening_text, index=False, encoding="utf-8")

In [63]:
# Apply the current masks and keep the id, the method-specific text column
# and the transcript for each method.
summary_result_df = summary_df.loc[
    summary_condition,
    ["video_id", "summary", "transcript"],
]
rag_result_df = rag_df.loc[
    rag_condition,
    ["video_id", "context_text", "transcript"],
]
screening_result_df = screening_df.loc[
    screening_condition,
    ["video_id", "candidates", "transcript"],
]

In [64]:
# Give the method-specific text column the shared name "text" so the three
# result frames have the same column layout.
summary_result_df = summary_result_df.rename({"summary": "text"}, axis="columns")
rag_result_df = rag_result_df.rename({"context_text": "text"}, axis="columns")
screening_result_df = screening_result_df.rename({"candidates": "text"}, axis="columns")

In [65]:
# Output directory for the per-method result CSVs.
output_dir = os.path.join(MNT_PATH, "linguistic_features")
os.makedirs(output_dir, exist_ok=True)

# Destination files for the false-negative ("missed_by_*") rows.
fn_summary_output = os.path.join(output_dir, "missed_by_summary_fn.csv")
fn_rag_output = os.path.join(output_dir, "missed_by_rag_fn.csv")
fn_screening_output = os.path.join(output_dir, "missed_by_screening_fn.csv")

# Write the current result frames without the index column.
summary_result_df.to_csv(fn_summary_output, index=False, encoding="utf-8")
rag_result_df.to_csv(fn_rag_output, index=False, encoding="utf-8")
screening_result_df.to_csv(fn_screening_output, index=False, encoding="utf-8")

In [66]:
# True-positive masks per method: the true label is "fake", the method also
# predicted "fake", and the video id is among target_video_ids.
# These reuse (and overwrite) the *_condition names.
summary_condition = (
    summary_df["true_label"].eq("fake")
    & summary_df["prediction"].eq("fake")
    & summary_df["video_id"].isin(target_video_ids)
)

# The RAG frame names its columns "label" / "pred".
rag_condition = (
    rag_df["label"].eq("fake")
    & rag_df["pred"].eq("fake")
    & rag_df["video_id"].isin(target_video_ids)
)

# The screening frame names its truth column "ground_truth".
screening_condition = (
    screening_df["ground_truth"].eq("fake")
    & screening_df["prediction"].eq("fake")
    & screening_df["video_id"].isin(target_video_ids)
)

In [67]:
# Apply the current masks and keep the id, the method-specific text column
# and the transcript for each method (overwrites the *_result_df names).
summary_result_df = summary_df.loc[
    summary_condition,
    ["video_id", "summary", "transcript"],
]
rag_result_df = rag_df.loc[
    rag_condition,
    ["video_id", "context_text", "transcript"],
]
screening_result_df = screening_df.loc[
    screening_condition,
    ["video_id", "candidates", "transcript"],
]

In [68]:
# Give the method-specific text column the shared name "text" so the three
# result frames have the same column layout.
summary_result_df = summary_result_df.rename({"summary": "text"}, axis="columns")
rag_result_df = rag_result_df.rename({"context_text": "text"}, axis="columns")
screening_result_df = screening_result_df.rename({"candidates": "text"}, axis="columns")

In [69]:
# Destination files for the true-positive ("succeeded_by_*") rows.
# NOTE: the fn_*_output names are reused here and now point at the *_tp.csv
# files. output_dir must already be defined; this cell does not create it.
fn_summary_output = os.path.join(output_dir, "succeeded_by_summary_tp.csv")
fn_rag_output = os.path.join(output_dir, "succeeded_by_rag_tp.csv")
fn_screening_output = os.path.join(output_dir, "succeeded_by_screening_tp.csv")

# Write the current result frames without the index column.
summary_result_df.to_csv(fn_summary_output, index=False, encoding="utf-8")
rag_result_df.to_csv(fn_rag_output, index=False, encoding="utf-8")
screening_result_df.to_csv(fn_screening_output, index=False, encoding="utf-8")