In [1]:
%pip install pandas numpy spacy textstat vaderSentiment sentence-transformers scikit-learn detoxify transformers torch
!python -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.


    fury (>=0.8.0scikit-learn); extra == 'all'
         ~~~~~~~~^

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\Vrinda\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------- ------------------------------ 3.1/12.8 MB 16.8 MB/s eta 0:00:01
     --------------------- ------------------ 6.8/12.8 MB 16.8 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.8 MB 17.7 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 17.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


    fury (>=0.8.0scikit-learn); extra == 'all'
         ~~~~~~~~^

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\Vrinda\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:

import pandas as pd
import numpy as np
import spacy
import textstat
import torch
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy import stats
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from detoxify import Detoxify
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, pipeline
import math
import csv
import glob
import os


In [3]:
# Notebook-wide display configuration: silence every warning category and
# switch matplotlib to the ggplot stylesheet.
warnings.simplefilter("ignore")
plt.style.use('ggplot')

In [4]:
class ComprehensiveComparator:
    def __init__(self, emfd_path='Datasets/emfd_scoring.csv', clickbait_path='Datasets/clickbait_data.csv'):
        """Load every model and lexicon used by the comparison metrics.

        Args:
            emfd_path: CSV file read by ``_load_emfd`` (moral-foundations lexicon).
            clickbait_path: CSV file read by ``_train_clickbait``.

        Optional components (NLI classifier, Detoxify, GPT-2) are loaded on a
        best-effort basis: when one fails to load, the corresponding attribute
        is set to ``None`` and a warning is printed.
        """
        print("Initializing Models... (This may take a moment)")
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            # spaCy raises OSError when the model package is not installed.
            print("   Downloading spacy model...")
            from spacy.cli import download
            download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        self.sentiment = SentimentIntensityAnalyzer()
        self.sbert = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        try:
            self.nli_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=-1)
        except Exception as exc:
            self.nli_classifier = None
            print(f"   Warning: NLI model failed to load. ({exc})")

        self.tox = None
        try:
            self.tox = Detoxify('original')
        except Exception as exc:
            print(f"   Warning: Detoxify failed to load. ({exc})")

        # Define both GPT-2 attributes up front so they always exist, even when
        # loading fails half-way through.
        self.gpt2_id = "gpt2"
        self.gpt2_model = None
        self.gpt2_tokenizer = None
        try:
            model = GPT2LMHeadModel.from_pretrained(self.gpt2_id)
            tokenizer = GPT2TokenizerFast.from_pretrained(self.gpt2_id)
        except Exception as exc:
            print(f"   Warning: GPT-2 failed to load; perplexity and GLTR are disabled. ({exc})")
        else:
            self.gpt2_model = model
            self.gpt2_tokenizer = tokenizer

        self._load_emfd(emfd_path)
        self._train_clickbait(clickbait_path)

    def _load_emfd(self, path):
        self.emfd_dict = {}
        if os.path.exists(path):
            try:
                df = pd.read_csv(path)
                self.emfd_dict = df.set_index('word').to_dict('index')
            except: pass

    def _train_clickbait(self, path):
        self.cb_model = None
        if path and os.path.exists(path):
            try:
                df = pd.read_csv(path)
                self.cb_model = make_pipeline(CountVectorizer(), MultinomialNB())
                self.cb_model.fit(df['text'], df['label'])
            except: pass

    def _preprocess(self, text):
        text = re.sub(r'<[^>]+>', '', text) 
        text = re.sub(r'\s+', ' ', text).strip()
        doc = self.nlp(text)
        sents = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 0]
        tokens = [t.text.lower() for t in doc if not t.is_punct and not t.is_space]
        return doc, sents, tokens

    # ==========================================
    # HELPER: Sentence Alignment
    # ==========================================
    def align_sentences(self, sents_o, sents_e):
        """
        Aligns sentences from Doc O to Doc E using SBERT cosine similarity.
        Returns a list of tuples: [(sent_o, sent_e, similarity_score), ...]
        """
        if not sents_o or not sents_e:
            return []
        
        # Encode both sets
        emb_o = self.sbert.encode(sents_o, convert_to_tensor=True)
        emb_e = self.sbert.encode(sents_e, convert_to_tensor=True)
        
        # Compute cosine similarity
        cosine_scores = util.cos_sim(emb_o, emb_e)
        
        # For each sentence in O, find best match in E
        aligned = []
        for i, sent_o in enumerate(sents_o):
            best_idx = torch.argmax(cosine_scores[i]).item()
            score = cosine_scores[i][best_idx].item()
            aligned.append((sent_o, sents_e[best_idx], score))
            
        return aligned

    # ==========================================
    # CORE METRICS (Single Representative)
    # ==========================================
    
    # A) Lexical: TTR
    def get_lexical_metrics(self, doc, sents):
        res = []
        for sent in doc.sents:
            tokens = [t.text.lower() for t in sent if not t.is_punct]
            ttr = len(set(tokens)) / len(tokens) if tokens else 0.0
            res.append(ttr)
        return res

    # B) Readability: FKGL
    def get_readability_metrics(self, sentences):
        res = []
        for s in sentences:
            try:
                val = textstat.flesch_kincaid_grade(s)
                res.append(val)
            except:
                res.append(0.0)
        return res

    # C) Stylometry: Burrows Delta (Distance)
    def get_burrows_delta(self, tokens_o, tokens_e):
        all_t = tokens_o + tokens_e
        if not all_t: return 0.0
        common = [w for w, c in Counter(all_t).most_common(50)]
        
        n_o, n_e = len(tokens_o), len(tokens_e)
        if n_o == 0 or n_e == 0: return 0.0
        
        f_o = np.array([tokens_o.count(w)/n_o for w in common])
        f_e = np.array([tokens_e.count(w)/n_e for w in common])
        
        mean_freq = (f_o + f_e) / 2
        std_freq = np.std([f_o, f_e], axis=0) + 1e-9
        
        z_o = (f_o - mean_freq) / std_freq
        z_e = (f_e - mean_freq) / std_freq
        
        return np.mean(np.abs(z_o - z_e))

    # D) Sentiment: Compound
    def get_sentiment_emotion(self, sentences):
        res = []
        for s in sentences:
            res.append(self.sentiment.polarity_scores(s)['compound'])
        return res

    # E) Sensationalism: Clickbait
    def get_sensationalism(self, sentences):
        res = []
        for s in sentences:
            val = 0.0
            if self.cb_model:
                try: val = self.cb_model.predict_proba([s])[0][1]
                except: pass
            res.append(val)
        return res

    # F) Framing: Moral Score
    def get_framing(self, doc):
        res = []
        for sent in doc.sents:
            tokens = [t.text.lower() for t in sent if not t.is_punct]
            score = 0.0
            for t in tokens:
                if t in self.emfd_dict:
                    vals = self.emfd_dict[t]
                    score += sum([v for k,v in vals.items() if k.endswith('_p')])
            res.append(score / len(tokens) if tokens else 0.0)
        return res

    # G) Toxicity
    def get_toxicity(self, sentences):
        res = []
        for s in sentences:
            val = 0.0
            if self.tox:
                try: val = self.tox.predict(s[:512])['toxicity']
                except: pass
            res.append(val)
        return res

    # H) Topic Shift (Entities) - Helper
    def get_entities(self, doc):
        return [e.text.lower() for e in doc.ents]

    # J) Discourse: Density
    def get_discourse(self, doc):
        markers = {'however', 'therefore', 'thus', 'moreover', 'because', 'since', 'but', 'and', 'so'}
        res = []
        for sent in doc.sents:
            cnt = len([t for t in sent if t.text.lower() in markers])
            res.append(cnt / len(sent) if len(sent) > 0 else 0.0)
        return res

    # K) Factuality - Helper
    def get_factuality(self, text):
        nums = re.findall(r'\d+(?:[.,]\d+)?', text)
        return set(nums)

    # L) LLM-ness: Perplexity
    def get_perplexity(self, sentences):
        res = []
        if not self.gpt2_model: return [0.0]*len(sentences)
        for s in sentences:
            if not s.strip(): 
                res.append(0.0)
                continue
            try:
                enc = self.gpt2_tokenizer(s, return_tensors='pt')
                with torch.no_grad():
                    out = self.gpt2_model(enc.input_ids, labels=enc.input_ids)
                    res.append(math.exp(out.loss.item()))
            except:
                res.append(0.0)
        return res

    # ==========================================
    # AGGREGATION & BOOTSTRAPPING
    # ==========================================
    def _bootstrap_ci(self, scores_o, scores_e, n_boot=1000):
        so = np.array(scores_o, dtype=float)
        se = np.array(scores_e, dtype=float)
        
        if len(so)==0: so = np.array([0.0])
        if len(se)==0: se = np.array([0.0])

        m_o = np.mean(so)
        m_e = np.mean(se)
        diff = m_o - m_e
        
        # Cohen's d
        n1, n2 = len(so), len(se)
        pooled_std = np.sqrt((np.var(so, ddof=1) + np.var(se, ddof=1))/2) + 1e-9
        cohens_d = diff / pooled_std

        # Access random index instead of full shuffle for speed
        try:
            idx_o = np.random.randint(0, n1, (n_boot, n1))
            idx_e = np.random.randint(0, n2, (n_boot, n2))
            
            means_o_boot = np.mean(so[idx_o], axis=1)
            means_e_boot = np.mean(se[idx_e], axis=1)
            boot_diffs = means_o_boot - means_e_boot
            
            ci_low = np.percentile(boot_diffs, 2.5)
            ci_high = np.percentile(boot_diffs, 97.5)
        except:
            ci_low, ci_high = 0.0, 0.0
        
        try:
            _, p_val = stats.ttest_ind(so, se, equal_var=False)
        except: p_val = 1.0

        return {
            "mean_o": m_o, "mean_e": m_e, "diff": diff, 
            "ci_low": ci_low, "ci_high": ci_high, "p": p_val, "es": cohens_d
        }

    # ==========================================
    # RUNNER
    # ==========================================
    def run_pair(self, text_o, text_e, pair_id):
        doc_o, sents_o, toks_o = self._preprocess(text_o)
        doc_e, sents_e, toks_e = self._preprocess(text_e)
        
        # Mapping metric name to (ScoreO, ScoreE)
        distro_metrics = {
             "Lexical (TTR)": (self.get_lexical_metrics(doc_o, sents_o), self.get_lexical_metrics(doc_e, sents_e)),
             "Readability (FKGL)": (self.get_readability_metrics(sents_o), self.get_readability_metrics(sents_e)),
             "Sentiment (Compound)": (self.get_sentiment_emotion(sents_o), self.get_sentiment_emotion(sents_e)),
             "Clickbait Score": (self.get_sensationalism(sents_o), self.get_sensationalism(sents_e)),
             "Moral Framing": (self.get_framing(doc_o), self.get_framing(doc_e)),
             "Toxicity": (self.get_toxicity(sents_o), self.get_toxicity(sents_e)),
             "Discourse Density": (self.get_discourse(doc_o), self.get_discourse(doc_e)),
             "Perplexity": (self.get_perplexity(sents_o), self.get_perplexity(sents_e))
        }
        
        all_stats = {}
        for name, (dist_o, dist_e) in distro_metrics.items():
            all_stats[name] = self._bootstrap_ci(dist_o, dist_e)

        # -- Single Value Metrics --
        # C) Stylometry: Burrows Delta
        burrows = self.get_burrows_delta(toks_o, toks_e)

        # H) Topic Shift (Entity Jaccard)
        ents_o = set(self.get_entities(doc_o))
        ents_e = set(self.get_entities(doc_e))
        u = len(ents_o.union(ents_e))
        jaccard = len(ents_o.intersection(ents_e))/u if u else 0
        
        # I) Semantic Sim
        if len(sents_o) > 0 and len(sents_e) > 0:
            emb_o = self.sbert.encode(sents_o)
            emb_e = self.sbert.encode(sents_e)
            sim_matrix = util.cos_sim(emb_o, emb_e).numpy()
            sem_sim = float(np.mean(np.max(sim_matrix, axis=1)))
        else:
            sem_sim = 0.0
        
        # K) Factuality (Number Jaccard)
        nums_o = self.get_factuality(text_o)
        nums_e = self.get_factuality(text_e)
        nu = len(nums_o.union(nums_e))
        num_jac = len(nums_o.intersection(nums_e))/nu if nu else 0

        single_stats = {
            "Style (Burrows Delta)": burrows,
            "Entity Jaccard": jaccard,
            "Semantic Similarity": sem_sim,
            "Numeric Jaccard": num_jac
        }

        # Viz Data for Radar
        viz_keys = ["Readability (FKGL)", "Sentiment (Compound)", "Toxicity", "Clickbait Score", "Moral Framing"]
        viz_data = {}
        for k in viz_keys:
             viz_data[k] = {'mean_o': all_stats[k]['mean_o'], 'mean_e': all_stats[k]['mean_e']}

        # Generate new visualizations
        self.plot_entity_heatmap(doc_o, doc_e, pair_id)
        self.plot_similarity_matrix(sents_o, sents_e, pair_id)
        self.plot_alignment_table(sents_o, sents_e, pair_id)

        return all_stats, single_stats, viz_data

    def visualize(self, viz_data, pair_id):
        """Save a five-axis radar chart of original vs. rewrite means.

        ``viz_data`` maps metric names to ``{'mean_o': ..., 'mean_e': ...}``;
        missing entries count as 0. Each axis is squeezed towards 0..1 with a
        fixed heuristic before plotting. The figure is written to
        ``<pair_id>_radar.png``.
        """
        # (axis label, key in viz_data)
        axes_spec = [
            ("Readability", "Readability (FKGL)"),
            ("Sentiment", "Sentiment (Compound)"),
            ("Toxicity", "Toxicity"),
            ("Clickbait", "Clickbait Score"),
            ("Moral", "Moral Framing")
        ]
        labels = [short for short, _ in axes_spec]
        keys = [full for _, full in axes_spec]

        def collect(field):
            return [viz_data.get(key, {}).get(field, 0) for key in keys]

        def rescale(values):
            fkgl, compound, tox, clickbait, moral = values
            return [
                min(max(fkgl/20, 0), 1),   # grade level, treated as 0-20
                (compound+1)/2,            # compound -1..1 -> 0..1
                min(tox, 1),               # already 0-1
                min(clickbait, 1),         # already 0-1
                min(moral*5, 1)            # usually small, so boosted
            ]

        norm_o = rescale(collect('mean_o'))
        norm_e = rescale(collect('mean_e'))

        # Repeat the first point so each polygon closes.
        angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
        norm_o.append(norm_o[0])
        norm_e.append(norm_e[0])
        angles.append(angles[0])

        plt.figure(figsize=(6,6))
        ax = plt.subplot(111, polar=True)
        for series, name in ((norm_o, 'Original'), (norm_e, 'Rewrite')):
            ax.plot(angles, series, label=name)
            ax.fill(angles, series, alpha=0.25)
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels)
        plt.title(f"Radar Comparison: {pair_id}")
        plt.legend()
        plt.savefig(f"{pair_id}_radar.png")
        plt.close()

    def get_gltr_hist(self, text_o, text_e, pair_id):
        if not self.gpt2_model: return
        
        def rank_text(txt):
            # Limit to 1000 chars for speed
            enc = self.gpt2_tokenizer(txt[:1000], return_tensors='pt')
            with torch.no_grad():
                logits = self.gpt2_model(enc.input_ids).logits
            ranks = []
            for i in range(len(enc.input_ids[0])-1):
                tid = enc.input_ids[0][i+1]
                args = torch.argsort(logits[0,i], descending=True)
                try: r = (args==tid).nonzero().item()
                except: r=10000
                ranks.append(r)
            return ranks

        ro = rank_text(text_o)
        re_ = rank_text(text_e)
        
        def bucket(r): return 1 if r<10 else 2 if r<100 else 3 if r<1000 else 4
        bo = [bucket(r) for r in ro]
        be = [bucket(r) for r in re_]
        
        co = [bo.count(i) for i in [1,2,3,4]]
        ce = [be.count(i) for i in [1,2,3,4]]
        
        x = np.arange(4)
        plt.bar(x-0.2, co, 0.4, label='Original')
        plt.bar(x+0.2, ce, 0.4, label='Rewrite')
        plt.xticks(x, ['Top 10', 'Top 100', 'Top 1k', '>1k'])
        plt.title(f"GLTR Distribution: {pair_id}")
        plt.legend()
        plt.savefig(f"{pair_id}_gltr_hist.png")
        plt.close()

    def plot_entity_heatmap(self, doc_o, doc_e, pair_id):
        ents_o = [e.text.lower() for e in doc_o.ents]
        ents_e = [e.text.lower() for e in doc_e.ents]
        
        # Get top 20 most common entities combined
        all_ents = ents_o + ents_e
        if not all_ents: return
        
        common = [w for w, c in Counter(all_ents).most_common(20)]
        
        # Count in each
        c_o = Counter(ents_o)
        c_e = Counter(ents_e)
        
        data = []
        for ent in common:
            data.append([c_o[ent], c_e[ent]])
            
        if not data: return
        
        plt.figure(figsize=(8, 10))
        sns.heatmap(data, annot=True, fmt="d", cmap="YlGnBu", 
                    yticklabels=common, xticklabels=["Original", "Rewrite"])
        plt.title(f"Entity Overlap Heatmap: {pair_id}")
        plt.tight_layout()
        plt.savefig(f"{pair_id}_entity_overlap.png")
        plt.close()

    def plot_similarity_matrix(self, sents_o, sents_e, pair_id):
        if not sents_o or not sents_e: return
        
        # Limit to first 20 sentences for readability if too large
        so = sents_o[:20]
        se = sents_e[:20]
        
        emb_o = self.sbert.encode(so, convert_to_tensor=True)
        emb_e = self.sbert.encode(se, convert_to_tensor=True)
        
        sim_matrix = util.cos_sim(emb_o, emb_e).cpu().numpy()
        
        plt.figure(figsize=(10, 8))
        sns.heatmap(sim_matrix, annot=False, cmap="YlGnBu", 
                    xticklabels=[f"E{i+1}" for i in range(len(se))],
                    yticklabels=[f"O{i+1}" for i in range(len(so))])
        plt.title(f"Sentence Similarity Matrix (Top 20): {pair_id}")
        plt.xlabel("Rewrite Sentences")
        plt.ylabel("Original Sentences")
        plt.tight_layout()
        plt.savefig(f"{pair_id}_similarity_matrix.png")
        plt.close()

    def plot_alignment_table(self, sents_o, sents_e, pair_id):
        aligned = self.align_sentences(sents_o, sents_e)
        if not aligned: return
        
        # Sort by score desc and take top 10
        aligned.sort(key=lambda x: x[2], reverse=True)
        top_10 = aligned[:10]
        
        # Prepare data for table
        cell_text = []
        for so, se, score in top_10:
            # Truncate for display
            so_trunc = (so[:50] + '...') if len(so) > 50 else so
            se_trunc = (se[:50] + '...') if len(se) > 50 else se
            cell_text.append([so_trunc, se_trunc, f"{score:.2f}"])
            
        if not cell_text: return
        
        plt.figure(figsize=(12, len(top_10)*0.8 + 1))
        ax = plt.gca()
        ax.axis('off')
        
        table = plt.table(cellText=cell_text, 
                          colLabels=["Original", "Rewrite", "Sim"], 
                          loc='center', cellLoc='left', colWidths=[0.4, 0.4, 0.1])
        
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 1.5)
        
        plt.title(f"Top 10 Aligned Sentences: {pair_id}")
        plt.tight_layout()
        plt.savefig(f"{pair_id}_alignment_table.png")
        plt.close()

In [5]:
def load_text(filepath):
    """Return the entire contents of ``filepath``, decoded as UTF-8."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Read the original article first, then its rewrite.
article_o, article_e = (
    load_text(path)
    for path in ("Articles/o-article1.txt", "Articles/e-article1.txt")
)

In [6]:
# Run the full comparison for one original/rewrite pair and export the
# results to diff_summary.csv (plus the per-pair figures).
comp = ComprehensiveComparator()

op = "Articles/o-article1.txt"
ep = "Articles/e-article1.txt"

# Column order of diff_summary.csv.
CSV_FIELDS = ["pair_id", "metric", "mean_original", "mean_rewrite", "diff",
              "boot_CI_low", "boot_CI_high", "p_value", "effect_size"]

if os.path.exists(op) and os.path.exists(ep):
    pair_id = os.path.basename(op).rsplit('.', 1)[0] + "_vs_" + os.path.basename(ep).rsplit('.', 1)[0]
    print(f"Processing {pair_id}...")

    to = load_text(op)
    te = load_text(ep)

    stats_res, singles, viz_data = comp.run_pair(to, te, pair_id)
    comp.visualize(viz_data, pair_id)
    comp.get_gltr_hist(to, te, pair_id)

    all_rows = []

    # Per-sentence metrics: one row per metric with the full statistics.
    for metric, s in stats_res.items():
        all_rows.append({
            "pair_id": pair_id, "metric": metric,
            "mean_original": s['mean_o'], "mean_rewrite": s['mean_e'],
            "diff": s['diff'], "boot_CI_low": s['ci_low'], "boot_CI_high": s['ci_high'],
            "p_value": s['p'], "effect_size": s['es'],
        })

    # Pairwise metrics are a single number describing both texts at once, so
    # there is no difference, interval, p-value or effect size to report.
    # Those cells are left empty (None is written as an empty field) instead
    # of 0, which would read as "no difference, p = 0".
    for metric, val in singles.items():
        all_rows.append({
            "pair_id": pair_id, "metric": metric,
            "mean_original": val, "mean_rewrite": val,
            "diff": None, "boot_CI_low": None, "boot_CI_high": None,
            "p_value": None, "effect_size": None,
        })

    if all_rows:
        with open("diff_summary.csv", "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=CSV_FIELDS)
            w.writeheader()
            w.writerows(all_rows)
        print("Done. Saved diff_summary.csv")
else:
    print("Error: One or both files not found.")

Initializing Models... (This may take a moment)
Processing o-article1_vs_e-article1...
Done. Saved diff_summary.csv


In [7]:
# FORMATTED RESULTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from math import pi
import os

# ---------------------------------------------------------
# Helper Functions (Add this block to your script)
# ---------------------------------------------------------

def compute_bootstrap_ci(data_a, data_b, n_boot=1000, ci=95, seed=None):
    """Bootstrap confidence interval for ``mean(data_b) - mean(data_a)``.

    Each sample is resampled with replacement at its own size, so the two
    inputs may have different lengths.

    Args:
        data_a: first sample (array-like of numbers).
        data_b: second sample (array-like of numbers).
        n_boot: number of bootstrap resamples.
        ci: confidence level in percent.
        seed: optional seed for the resampling RNG (reproducible intervals).

    Returns:
        ``(lower, upper)`` percentile bounds, or ``(0, 0)`` when either sample
        has fewer than two values or ``n_boot`` is less than 1.
    """
    data_a = np.asarray(data_a, dtype=float).ravel()
    data_b = np.asarray(data_b, dtype=float).ravel()
    n_a, n_b = data_a.size, data_b.size

    # A sample with fewer than two values has no resampling variability.
    if n_a < 2 or n_b < 2 or n_boot < 1:
        return (0, 0)

    rng = np.random.default_rng(seed)
    # One row of indices per bootstrap replicate.
    means_a = data_a[rng.integers(0, n_a, size=(n_boot, n_a))].mean(axis=1)
    means_b = data_b[rng.integers(0, n_b, size=(n_boot, n_b))].mean(axis=1)
    diffs = means_b - means_a

    alpha = (100 - ci) / 2
    lower = float(np.percentile(diffs, alpha))
    upper = float(np.percentile(diffs, 100 - alpha))
    return (lower, upper)

def compute_cohens_d(x, y):
    """Cohen's d of ``x`` relative to ``y`` using the pooled standard deviation.

    Returns 0 when there is not enough data (an empty sample, or fewer than
    three values in total). A sample with a single value contributes no
    variance to the pooled estimate instead of turning the result into NaN.
    When the pooled deviation is exactly zero the result is 0.0 for equal
    means and signed infinity otherwise.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    nx, ny = x.size, y.size
    dof = nx + ny - 2
    if nx == 0 or ny == 0 or dof < 1:
        return 0

    # ddof=1 variance is undefined for a single observation.
    var_x = np.var(x, ddof=1) if nx > 1 else 0.0
    var_y = np.var(y, ddof=1) if ny > 1 else 0.0
    pooled = np.sqrt(((nx - 1) * var_x + (ny - 1) * var_y) / dof)

    diff = np.mean(x) - np.mean(y)
    if pooled == 0:
        return 0.0 if diff == 0 else float(np.copysign(np.inf, diff))
    return float(diff / pooled)

def _summarise_metrics(df_orig, df_rewrite):
    """Build one summary record per numeric metric present in both frames."""
    records = []
    for metric in df_orig.columns:
        if metric not in df_rewrite.columns:
            continue
        if not (pd.api.types.is_numeric_dtype(df_orig[metric])
                and pd.api.types.is_numeric_dtype(df_rewrite[metric])):
            continue

        orig_vals = df_orig[metric].dropna()
        rew_vals = df_rewrite[metric].dropna()
        if len(orig_vals) == 0 or len(rew_vals) == 0:
            continue

        mean_orig = np.mean(orig_vals)
        mean_rew = np.mean(rew_vals)

        # Paired t-test when the sample sizes match, Welch's test otherwise.
        # NOTE(review): equal length is only a proxy for "rows are paired";
        # confirm against how the frames are built.
        if len(orig_vals) == len(rew_vals):
            _, p_val = stats.ttest_rel(orig_vals, rew_vals)
        else:
            _, p_val = stats.ttest_ind(orig_vals, rew_vals, equal_var=False)

        # CI, diff and effect size all use the direction rewrite - original.
        ci_lower, ci_upper = compute_bootstrap_ci(orig_vals, rew_vals)

        records.append({
            'metric': metric,
            'mean_original': mean_orig,
            'mean_rewrite': mean_rew,
            'diff': mean_rew - mean_orig,
            'boot_CI': f"[{ci_lower:.3f}, {ci_upper:.3f}]",
            'p': p_val,
            'effect_size': compute_cohens_d(rew_vals, orig_vals),
        })
    return records


def _plot_radar(df_summary, path):
    """Save a radar chart of the per-metric means (raw, not rescaled)."""
    categories = list(df_summary['metric'])
    N = len(categories)

    values_orig = df_summary['mean_original'].tolist()
    values_rew = df_summary['mean_rewrite'].tolist()

    # Repeat the first point so each polygon closes.
    values_orig += values_orig[:1]
    values_rew += values_rew[:1]
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.plot(angles, values_orig, linewidth=1, linestyle='solid', label='Original')
    ax.fill(angles, values_orig, 'b', alpha=0.1)
    ax.plot(angles, values_rew, linewidth=1, linestyle='solid', label='Rewrite')
    ax.fill(angles, values_rew, 'r', alpha=0.1)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)
    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    ax.set_title("Metric Comparison (Means)")
    fig.savefig(path)
    plt.close(fig)


def _plot_entity_overlap(entity_data, path):
    """Save a bar chart with one bar per ``entity_data`` entry."""
    fig, ax = plt.subplots(figsize=(6, 4))
    if isinstance(entity_data, dict):
        ax.bar(list(entity_data.keys()), list(entity_data.values()),
               color=['skyblue', 'salmon', 'lightgreen'])
        ax.set_title("Entity Analysis")
        ax.set_ylabel("Count")
    fig.savefig(path)
    plt.close(fig)


def _plot_gltr_hist(gltr_data, path):
    """Save grouped bars of the GLTR bucket proportions for both texts."""
    labels = ['Top 10 (Green)', 'Top 100 (Yellow)', 'Top 1k (Red)', '>1k (Purple)']

    def get_freqs(data_list):
        # Share of tokens per bucket; values outside 0..3 are ignored.
        counts = [0, 0, 0, 0]
        for x in data_list:
            if 0 <= x < 4:
                counts[int(x)] += 1
        total = sum(counts) if sum(counts) > 0 else 1
        return [c / total for c in counts]

    orig_freqs = get_freqs(gltr_data.get('original', []))
    rew_freqs = get_freqs(gltr_data.get('rewrite', []))

    x = np.arange(len(labels))
    width = 0.35

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(x - width/2, orig_freqs, width, label='Original')
    ax.bar(x + width/2, rew_freqs, width, label='Rewrite')

    ax.set_ylabel('Proportion of Tokens')
    ax.set_title('GLTR Distribution (Token Probabilities)')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    fig.savefig(path)
    plt.close(fig)


def generate_final_outputs(df_orig, df_rewrite, entity_data, gltr_data, output_dir='.'):
    """
    Generates the requested CSV summary and Charts.

    Writes ``diff_summary.csv``, ``radar.png``, ``entity_overlap.png`` and
    ``gltr_hist.png`` into ``output_dir`` (created if missing).

    Args:
        df_orig (pd.DataFrame): DataFrame where columns are metrics and rows are samples (Original text).
        df_rewrite (pd.DataFrame): DataFrame where columns are metrics and rows are samples (Rewrite text).
        entity_data (dict): Mapping of bar label to count, e.g.
            ``{'Original': 50, 'Rewrite': 45, 'Overlap': 30}``.
        gltr_data (dict): Dictionary with keys 'original', 'rewrite' containing lists of GLTR categories (0,1,2,3),
            where 0 = top 10, 1 = top 100, 2 = top 1k, 3 = beyond 1k.
        output_dir (str): Directory to save files.

    Only numeric columns present in both frames are summarised. The function
    has no return value.

    Example:
        df_o = pd.DataFrame({'Fluency': np.random.rand(10), 'Coherence': np.random.rand(10)})
        df_r = pd.DataFrame({'Fluency': np.random.rand(10) + 0.1, 'Coherence': np.random.rand(10) - 0.1})
        ent_data = {'Original Entities': 100, 'Rewrite Entities': 90, 'Overlap': 60}
        gl_data = {'original': [0, 0, 1, 2], 'rewrite': [0, 1, 1, 3]}
        generate_final_outputs(df_o, df_r, ent_data, gl_data, output_dir='outputs')
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1. diff_summary.csv
    print("Generating diff_summary.csv...")
    cols = ['metric', 'mean_original', 'mean_rewrite', 'diff', 'boot_CI', 'p', 'effect_size']
    # Passing the columns explicitly keeps the header even with zero metrics.
    df_summary = pd.DataFrame(_summarise_metrics(df_orig, df_rewrite), columns=cols)
    df_summary.to_csv(os.path.join(output_dir, 'diff_summary.csv'), index=False)
    print("diff_summary.csv saved.")

    # 2. radar.png
    print("Generating radar.png...")
    _plot_radar(df_summary, os.path.join(output_dir, 'radar.png'))
    print("radar.png saved.")

    # 3. entity_overlap.png
    print("Generating entity_overlap.png...")
    _plot_entity_overlap(entity_data, os.path.join(output_dir, 'entity_overlap.png'))
    print("entity_overlap.png saved.")

    # 4. gltr_hist.png
    print("Generating gltr_hist.png...")
    _plot_gltr_hist(gltr_data, os.path.join(output_dir, 'gltr_hist.png'))
    print("gltr_hist.png saved.")
    print("All outputs generated successfully.")