In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy, levene
from tqdm.auto import tqdm
import seaborn as sns
from nltk.tokenize import word_tokenize, sent_tokenize
import collections as coll
import math
import spacy
import os

In [2]:
# Names of the LIWC columns that every dataset analyses (see
# `Dataset.filtered_columns`). They are used verbatim to index DataFrame
# columns, so the spelling has to match the CSV headers exactly.
all_liwc_features = [
    "Analytic", "Clout", "Authentic", "Tone", "WPS", "BigWords", "Dic",
    "Linguistic", "function", "pronoun", "ppron", "i", "we", "you", "shehe",
    "they", "ipron", "det", "article", "number", "prep", "auxverb", "adverb",
    "conj", "negate", "verb", "adj", "quantity",
    "Drives", "affiliation", "achieve", "power",
    "Cognition", "allnone", "cogproc", "insight", "cause", "discrep",
    "tentat", "certitude", "differ", "memory",
    "Affect", "tone_pos", "tone_neg", "emotion", "emo_pos", "emo_neg",
    "emo_anx", "emo_anger", "emo_sad", "swear",
    "Social", "socbehav", "prosocial", "polite", "conflict", "moral", "comm",
    "socrefs", "family", "friend", "female", "male",
    "Culture", "politic", "ethnicity", "tech",
    "Lifestyle", "leisure", "home", "work", "money", "relig",
    "Physical", "health", "illness", "wellness", "mental", "substances",
    "sexual", "food", "death",
    "need", "want", "acquire", "lack", "fulfill", "fatigue",
    "reward", "risk", "curiosity", "allure",
    "Perception", "attention", "motion", "space", "visual", "auditory",
    "feeling",
    "time", "focuspast", "focuspresent", "focusfuture",
    "Conversation", "netspeak", "assent", "nonflu", "filler",
    "AllPunc", "Period", "Comma", "QMark", "Exclam", "Apostro", "OtherP",
    "Emoji",
]

# Names of the text-complexity columns this notebook computes itself and
# stores next to the LIWC columns.
complexity_features = [
    "avg_dependency_link_length",
    "type_token_ratio",
    "hapax_legemena",
    "shannon_entropy",
    "simpsons_index",
]

In [3]:
# Dataset key -> display name.
DATASET_TO_DATASET_NAME = dict(
    essays="Essays",
    wassa="Empathetic.",
    facebook="YourMorals",
    political="Congress",
)

# Baseline score per label.
# NOTE(review): presumably chance / majority-class accuracy - confirm the source.
BASELINES = {
    "CON": 0.5110732538330494,
    "EXT": 0.5157580919931857,
    "AGR": 0.530664395229983,
    "NEU": 0.5008517887563884,
    "OPN": 0.5161839863713799,
    ".care": 0.5070035704476792,
    ".purity": 0.5358418017028289,
    ".fairness": 0.5281516067014557,
    ".loyalty": 0.5034331227684702,
    ".authority": 0.5339192529524857,
    ".iri.concern": 0.5724331926863573,
    ".iri.distress": 0.5175808720112518,
    ".iri.fantasy": 0.5471167369901547,
    ".iri.perspective": 0.5274261603375527,
    "gender": 0.5,
    "cohort": 0.25,
    "party": 0.5,
}


# Current age-group index -> ordered age-group index.
FROM_CURRENT_TO_ORDERED_AGE_GROUP_INDICES = dict(enumerate([3, 1, 0, 2]))


# Per-dataset baseline: the mean of the label baselines for multi-label
# datasets, and the label's own baseline for single-label tasks.
DATASET_TO_BASELINES = {
    **{
        dataset_key: np.mean([BASELINES[label] for label in labels])
        for dataset_key, labels in (
            ("essays", ["CON", "EXT", "AGR", "NEU", "OPN"]),
            (
                "wassa",
                [".iri.concern", ".iri.distress", ".iri.fantasy", ".iri.perspective"],
            ),
            (
                "wassa_individual",
                [".iri.concern", ".iri.distress", ".iri.fantasy", ".iri.perspective"],
            ),
            (
                "facebook",
                [".care", ".purity", ".fairness", ".loyalty", ".authority"],
            ),
        )
    },
    **{task: BASELINES[task] for task in ("party", "gender", "cohort")},
}

# Rewrite mode -> short tag.
REWRITTEN_TYPE_TO_SHORT = dict(syntax_grammar="SG", rephrase="R")

# Text source key -> display name.
LLM_TO_NAME = dict(
    original="Original",
    gpt="GPT3.5",
    llama="Llama 2",
    gemini="Gemini",
)

# Label key -> display name (the five trait codes map to themselves).
LABELS_TO_NAME = {
    ".iri.concern": "Concern",
    ".iri.distress": "Distress",
    ".iri.perspective": "Perspective",
    ".iri.fantasy": "Fantasy",
    ".authority": "Authority",
    ".care": "Care",
    ".fairness": "Fairness",
    ".loyalty": "Loyalty",
    ".purity": "Purity",
    **{trait: trait for trait in ("CON", "NEU", "EXT", "AGR", "OPN")},
    "gender": "Gender",
    "cohort": "Age group",
    "party": "Party",
}

In [4]:
# Feature columns singled out for the personality labels (lower-case names).
personality_columns = [
    "nrc.negative", "emo_pos", "affect", "nrc.positive", "affiliation",
    "emotion", "nrc.joy", "social", "emo_sad", "nrc.anticipation",
    "nrc.anger", "pronoun", "i", "emo_anger", "swear",
    "bigwords", "emo_neg", "nrc.disgust", "nrc.sadness", "nrc.trust",
]

In [5]:
# Feature columns singled out for the Facebook dataset (lower-case names).
# NOTE(review): "religion" is spelled "relig" in `all_liwc_features` - confirm
# that this entry matches a real column.
facebook_columns = [
    "we", "socrefs", "pronoun", "friend",
    "mfd.authority.virtue", "mfd.care.virtue",
    "affect", "i",
    "mfd.authority.vice", "mfd.purity.vice",
    "affiliation", "prosocial", "family", "religion",
    "mfd.purity.virtue",
    "you", "social",
]

In [6]:
# Feature columns singled out for the WASSA dataset (lower-case names).
# NOTE(review): "pronoun" is listed twice, and "she,he" does not match the
# "shehe" spelling used in `all_liwc_features` - confirm both are intended.
wassa_columns = [
    "affect", "differ", "we", "she,he", "pronoun", "emo_neg",
    "tentat", "empathy.low", "pronoun", "cogproc", "distress.low",
]

In [7]:
# Feature columns singled out for the party label (lower-case names).
political_columns = [
    "mfd.authority.virtue", "mfd.authority.vice",
    "mfd.loyalty.virtue", "mfd.loyalty.vice",
    "mfd.fairness.virtue", "mfd.fairness.vice",
    "emo_anx", "adverb", "conj", "emo_neg", "emo_anger", "we",
    "relig", "swear", "i", "cogproc", "emo_pos", "certitude",
]

# Feature columns singled out for the gender label.
# NOTE(review): "quant" is spelled "quantity" in `all_liwc_features` - confirm
# that this entry matches a real column.
gender_columns = [
    "article", "social", "emo_anx", "pronoun", "i",
    "emo_pos", "emo_neg", "affect", "tentat", "motion",
    "swear", "quant", "number", "space", "cogproc",
]

# Feature columns singled out for the age-group label.
age_columns = [
    "we", "cogproc", "prep", "article", "social",
    "focusfuture", "focuspast", "emo_neg", "emo_pos", "i",
]

In [8]:
# spaCy pipeline loaded once for the whole notebook;
# compute_avg_dependency_link_length() parses every text with it.
nlp = spacy.load("en_core_web_sm")


def RemoveSpecialCHs(text):
    """Tokenize ``text`` and drop tokens that are a single special character.

    Only tokens that are exactly one of the characters below are removed;
    punctuation inside a longer token is kept. Returns the surviving tokens
    as a list, in their original order.
    """
    special_tokens = frozenset(",.'!\"#$%&()*+-/:;<=>?@[\\]^_`{|}~\t\n")
    return [token for token in word_tokenize(text) if token not in special_tokens]


def compute_avg_dependency_link_length(text):
    """Return the average dependency-link length of ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. For each
    sentence, the link length of a non-ROOT token is the absolute distance (in
    token positions) to its head; the per-sentence means are then averaged.

    Returns ``np.nan`` when parsing fails (for example, ``text`` is not a
    string) or when no sentence contains a dependency link.
    """
    try:
        doc = nlp(text)
        per_sentence_means = []
        for sent in doc.sents:
            distances = [
                abs(token.head.i - token.i) for token in sent if token.dep_ != "ROOT"
            ]
            if distances:  # a sentence made of a lone ROOT has no links
                per_sentence_means.append(np.mean(distances))
    except Exception:
        # Best effort for bad rows. `Exception` (not a bare except) so that
        # interrupting a long-running cell still works.
        return np.nan

    if not per_sentence_means:
        return np.nan
    return np.mean(per_sentence_means)


def typeTokenRatio(text):
    """Return the type-token ratio of ``text``: distinct tokens / all tokens.

    Tokens come straight from ``word_tokenize`` (punctuation tokens included).
    Returns ``np.nan`` when the text cannot be tokenized or has no tokens.
    """
    try:
        tokens = word_tokenize(text)
    except Exception:
        # Best effort for bad rows (e.g. NaN instead of a string); a bare
        # except would also swallow KeyboardInterrupt.
        return np.nan

    if not tokens:
        return np.nan
    return len(set(tokens)) / len(tokens)


def hapaxLegemena(text):
    """Return ``(R, h)`` hapax-legomena statistics for ``text``.

    With ``N`` tokens, ``V`` distinct tokens and ``V1`` tokens that occur
    exactly once (after ``RemoveSpecialCHs``):

    * ``R = 100 * log(N) / (1 - V1 / V)`` (Honore's statistic),
    * ``h = V1 / N``.

    ``R`` is ``np.nan`` when every distinct token occurs once (the denominator
    is zero). Both values are ``np.nan`` when the text cannot be tokenized or
    has no tokens.
    """
    try:
        words = RemoveSpecialCHs(text)
    except Exception:
        # Best effort for bad rows; a bare except would also swallow
        # KeyboardInterrupt.
        return np.nan, np.nan

    N = len(words)
    if N == 0:
        return np.nan, np.nan

    freqs = coll.Counter(words)
    V = len(freqs)
    V1 = sum(1 for count in freqs.values() if count == 1)

    # The denominator lies in [0, 1]. The previous `max(1, ...)` guard clamped
    # it to 1 for every input, which reduced R to 100 * log(N).
    denominator = 1 - V1 / V
    R = 100 * math.log(N) / denominator if denominator > 0 else np.nan
    h = V1 / N
    return R, h


def ShannonEntropy(text):
    """Return the Shannon entropy (in bits) of the token distribution of ``text``.

    Tokens come from ``RemoveSpecialCHs``; each distinct token's relative
    frequency is fed to ``scipy.stats.entropy`` with ``base=2``.
    Returns ``np.nan`` when the text cannot be processed.
    """
    try:
        words = RemoveSpecialCHs(text)
        counts = np.array(list(coll.Counter(words).values()), dtype=float)
        # max(1, ...) avoids dividing by zero for an empty token list.
        distribution = counts / max(1, len(words))
        # `entropy` is the file-level `from scipy.stats import entropy`.
        return entropy(distribution, base=2)
    except Exception:
        # Best effort for bad rows; a bare except would also swallow
        # KeyboardInterrupt.
        return np.nan


def SimpsonsIndex(text):
    """Return Simpson's diversity index ``1 - sum(n_i * (n_i - 1)) / (N * (N - 1))``.

    ``n_i`` is the count of each distinct token and ``N`` the total number of
    tokens after ``RemoveSpecialCHs``. Returns ``np.nan`` when the text cannot
    be tokenized or has fewer than two tokens (the index is undefined there).
    """
    try:
        words = RemoveSpecialCHs(text)
    except Exception:
        # Best effort for bad rows; a bare except would also swallow
        # KeyboardInterrupt.
        return np.nan

    N = len(words)
    if N < 2:
        return np.nan

    n = sum(count * (count - 1) for count in coll.Counter(words).values())
    return 1 - n / (N * (N - 1))

In [9]:
def add_complexity_features(df):
    """Add the five complexity columns to ``df`` in place and return it.

    Each column is computed from ``df["text"]``, one progress bar per column.
    """
    extractors = (
        ("avg_dependency_link_length", compute_avg_dependency_link_length),
        ("type_token_ratio", typeTokenRatio),
        # hapaxLegemena returns (R, h); only h is stored.
        ("hapax_legemena", lambda text: hapaxLegemena(text)[1]),
        ("shannon_entropy", ShannonEntropy),
        ("simpsons_index", SimpsonsIndex),
    )
    for column, extract in extractors:
        df[column] = [extract(text) for text in tqdm(df["text"], leave=False)]
    return df


def add_complexity_features_to_dataset(dataset):
    """Add the complexity columns to every frame of ``dataset`` and return it.

    Processes the original data first, then each (LLM, rewrite mode) frame.
    Results are handed back through the dataset's setters.
    """
    original_frame = dataset.get_original_data()
    dataset.set_original_data(add_complexity_features(original_frame))

    for llm_name in tqdm(["gpt", "llama", "gemini"], leave=False):
        for rewrite_mode in tqdm(["syntax_grammar", "rephrase"], leave=False):
            rewritten_frame = dataset.get_rewritten_data(rewrite_mode, llm_name)
            dataset.set_rewritten_data(
                rewrite_mode, llm_name, add_complexity_features(rewritten_frame)
            )

    return dataset


def does_dataset_have_complexity_features(dataset):
    """Return True when the original data has every complexity column.

    Only the frame returned by ``dataset.get_original_data()`` is inspected.
    """
    for feature in complexity_features:
        if feature not in dataset.get_original_data().columns:
            return False
    return True

In [10]:
class Dataset:
    """Base class for one corpus: an original frame plus its rewritten variants.

    Subclasses are expected to set ``original_data``, ``original_data_path``
    and ``rewritten_data_path`` (a format string with two ``{}`` placeholders);
    this base class does not define them. Both setters also write the frame
    back to its CSV file.
    """

    def __init__(self) -> None:
        # Columns analysed for every dataset; subclasses may append more.
        self.filtered_columns = [*all_liwc_features, *complexity_features]
        # Cache of rewritten frames, keyed by the (n, m) pair used in the path.
        self.rewritten_datasets = {}

    def get_original_data(self):
        """Return the in-memory original frame."""
        return self.original_data

    def set_original_data(self, data):
        """Replace the original frame and persist it to ``original_data_path``."""
        self.original_data = data
        data.to_csv(self.original_data_path, index=False)

    def get_rewritten_data(self, n, m):
        """Return the rewritten frame for ``(n, m)``, reading its CSV on first use."""
        key = (n, m)
        cached = self.rewritten_datasets.get(key)
        if cached is None:
            cached = pd.read_csv(self.rewritten_data_path.format(n, m))
            self.rewritten_datasets[key] = cached
        return cached

    def set_rewritten_data(self, n, m, data):
        """Cache the rewritten frame for ``(n, m)`` and persist it to its CSV."""
        self.rewritten_datasets[(n, m)] = data
        data.to_csv(self.rewritten_data_path.format(n, m), index=False)


class Essays(Dataset):
    """Essays dataset: LIWC-22 result CSVs for the original and rewritten texts.

    Loads the original CSV on construction and analyses the shared feature
    columns plus every column whose name contains ``"nrc."``.
    ``get_original_data`` / ``get_rewritten_data`` are inherited from
    ``Dataset``; the previous overrides only forwarded to ``super()``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.original_data_path = "essays/with_dictionaries/LIWC-22 Results - essays_anon_full - LIWC Analysis.csv"
        # The two placeholders are filled by Dataset.get_rewritten_data(n, m).
        self.rewritten_data_path = "essays/with_dictionaries/LIWC-22 Results - essays_rewritten_{}_{} - LIWC Analysis.csv"
        self.original_data = pd.read_csv(self.original_data_path)
        self.id_column = "#AUTHID"
        self.filtered_columns = self.filtered_columns + [
            col for col in self.original_data.columns if "nrc." in col
        ]
        self.name = "essays"


class Wassa(Dataset):
    """WASSA dataset: LIWC-22 result CSVs for the original and rewritten texts.

    Loads the original CSV on construction and analyses the shared feature
    columns plus every column whose name starts with ``"distress."`` or
    ``"empathy."``. ``get_original_data`` / ``get_rewritten_data`` are
    inherited from ``Dataset``; the previous overrides only forwarded to
    ``super()``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.original_data_path = (
            "wassa/with_dictionaries/LIWC-22 Results - clean_wassa - LIWC Analysis.csv"
        )
        # The two placeholders are filled by Dataset.get_rewritten_data(n, m).
        self.rewritten_data_path = "wassa/with_dictionaries/LIWC-22 Results - wassa_rewritten_{}_{} - LIWC Analysis.csv"
        self.original_data = pd.read_csv(self.original_data_path)
        self.id_column = "id"
        # The previous test `col.split(".") in ["distress", "empathy"]`
        # compared a list with strings and never matched, so these columns
        # were silently left out. Match on the dotted prefix instead, so a
        # bare "distress" / "empathy" column is not picked up.
        self.filtered_columns = self.filtered_columns + [
            col
            for col in self.original_data.columns
            if col.startswith(("distress.", "empathy."))
        ]
        self.name = "wassa"


class Political(Dataset):
    """Political dataset: LIWC-22 result CSVs for the original and rewritten texts.

    Loads the original CSV on construction and analyses only the shared
    feature columns. ``get_original_data`` / ``get_rewritten_data`` are
    inherited from ``Dataset``; the previous overrides only forwarded to
    ``super()``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.original_data_path = "political/with_dictionaries_both/LIWC-22 Results - clean_data_agg - LIWC Analysis.csv"
        # The two placeholders are filled by Dataset.get_rewritten_data(n, m).
        self.rewritten_data_path = "political/with_dictionaries_both/LIWC-22 Results - political_rewritten_{}_{} - LIWC Analysis.csv"
        self.original_data = pd.read_csv(self.original_data_path)
        self.id_column = "speakerid"
        self.name = "political"


class Facebook(Dataset):
    """Facebook dataset: LIWC-22 result CSVs for the original and rewritten texts.

    Loads the original CSV on construction and analyses the shared feature
    columns plus every column whose name contains ``"mfd."``.
    ``get_original_data`` / ``get_rewritten_data`` are inherited from
    ``Dataset``; the previous overrides only forwarded to ``super()``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.original_data_path = "facebook/with_dictionaries/LIWC-22 Results - full_dataset_clean - LIWC Analysis.csv"
        # The two placeholders are filled by Dataset.get_rewritten_data(n, m).
        self.rewritten_data_path = "facebook/with_dictionaries/LIWC-22 Results - facebook_rewritten_{}_{} - LIWC Analysis.csv"
        self.original_data = pd.read_csv(self.original_data_path)
        self.id_column = "subject_id"
        self.filtered_columns = self.filtered_columns + [
            col for col in self.original_data.columns if "mfd." in col
        ]
        self.name = "facebook"

In [11]:
essays = Essays()
wassa = Wassa()
political = Political()
facebook = Facebook()

# Compute (and persist) the complexity features only for datasets whose
# original CSV does not contain them yet. The helper mutates the dataset and
# returns the same object, so no re-binding is needed.
for _dataset, _message in (
    (essays, "Processing Essays dataset with the complexity features"),
    (wassa, "Processing Wassa dataset with the complexity features"),
    (political, "Processing Political dataset with the complexity features"),
    (facebook, "Processing Facebook dataset with the complexity features"),
):
    if not does_dataset_have_complexity_features(_dataset):
        print(_message)
        add_complexity_features_to_dataset(_dataset)

In [12]:
def get_agg_features_and_entropies():
    """Collect, per dataset / LLM / mode / column, the original and rewritten values.

    For every combination, rows with a missing value in the column are
    dropped, both frames are restricted to the ids they share, and both are
    sorted by id. Two entries are then appended to each output list: first the
    original values, then the rewritten ones.

    Returns a 7-tuple of parallel lists:
    ``(llms, modes, cols, datasets, dataset_original_or_not,
    distribution_values, index_values)``. ``index_values[k]`` holds the ids of
    ``distribution_values[k]`` in the same (sorted) order, so the two can be
    zipped safely.
    """
    llms = []
    modes = []
    cols = []
    distribution_values = []
    index_values = []
    datasets = []
    dataset_original_or_not = []

    for dataset in tqdm([essays, wassa, political, facebook], leave=False):
        id_column = dataset.id_column
        for llm in tqdm(["gpt", "llama", "gemini"], leave=False):
            for mode in tqdm(["syntax_grammar", "rephrase"], leave=False):
                original_frame = dataset.get_original_data()
                rewritten_frame = dataset.get_rewritten_data(mode, llm)

                for col in tqdm(dataset.filtered_columns, leave=False):
                    original_rows = original_frame[original_frame[col].notna()]
                    rewritten_rows = rewritten_frame[rewritten_frame[col].notna()]

                    shared_ids = list(
                        set(original_rows[id_column].tolist())
                        & set(rewritten_rows[id_column].tolist())
                    )

                    original_series = (
                        original_rows[original_rows[id_column].isin(shared_ids)]
                        .set_index(id_column)
                        .sort_index()[col]
                    )
                    rewritten_series = (
                        rewritten_rows[rewritten_rows[id_column].isin(shared_ids)]
                        .set_index(id_column)
                        .sort_index()[col]
                    )

                    for is_original, series in (
                        (True, original_series),
                        (False, rewritten_series),
                    ):
                        dataset_original_or_not.append(is_original)
                        distribution_values.append(series.tolist())
                        # Take the ids from the sorted series itself. A list
                        # built from a set has arbitrary order and would not
                        # line up with the sorted values.
                        index_values.append(series.index.tolist())
                        llms.append(llm)
                        modes.append(mode)
                        cols.append(col)
                        datasets.append(dataset.name)

    return (
        llms,
        modes,
        cols,
        datasets,
        dataset_original_or_not,
        distribution_values,
        index_values,
    )

In [None]:
(
    llms,
    modes,
    cols,
    datasets,
    dataset_original_or_not,
    distribution_values,
    index_values,
) = get_agg_features_and_entropies()

# One row per (dataset, llm, mode, column, original-or-rewritten) combination;
# "distribution" and "index" hold a list of values / ids per row.
df = pd.DataFrame(
    dict(
        llm=llms,
        mode=modes,
        col=cols,
        dataset=datasets,
        is_original=dataset_original_or_not,
        distribution=distribution_values,
        index=index_values,
    )
)
# Column names are compared in lower case from here on.
df["col"] = df["col"].str.lower()

In [14]:
# Show the complexity feature names that the next cell keeps in `df`.
complexity_features

['avg_dependency_link_length',
 'type_token_ratio',
 'hapax_legemena',
 'shannon_entropy',
 'simpsons_index']

In [15]:
from sklearn.preprocessing import StandardScaler

# Keep only the rows that describe a complexity feature.
df = df[df["col"].isin(complexity_features)]

# (feature name in df["col"], column name in the merged frame, column name in
# `filtered_df`), in the order the features are merged and averaged.
complexity_feature_table = [
    ("avg_dependency_link_length", "value_avg_dep", "avg_dep"),
    ("type_token_ratio", "value_type_token", "type_token"),
    ("hapax_legemena", "value_hapax", "hapax"),
    ("shannon_entropy", "value_shannon", "shannon"),
    ("simpsons_index", "value_simpsons", "simpsons"),
]

# Accumulators for `filtered_df`, one list per output column (order matters).
summary_columns = {
    "llm": [],
    "mode": [],
    "dataset": [],
    "is_original": [],
    "mean": [],
    "avg_dep": [],
    "type_token": [],
    "hapax": [],
    "shannon": [],
    "simpsons": [],
}

for llm in df["llm"].unique():
    for mode in df["mode"].unique():
        for dataset in df["dataset"].unique():
            for is_original in [True, False]:

                sub_df = df[
                    (df["llm"] == llm)
                    & (df["mode"] == mode)
                    & (df["dataset"] == dataset)
                    & (df["is_original"] == is_original)
                ]

                # One small frame per feature: its values plus the ids they
                # belong to, taken from the single matching row of `sub_df`.
                feature_frames = []
                for feature_name, value_column, _ in complexity_feature_table:
                    feature_row = sub_df[sub_df["col"] == feature_name]
                    feature_frames.append(
                        pd.DataFrame(
                            {
                                value_column: feature_row["distribution"].values[0],
                                "index": feature_row["index"].values[0],
                            }
                        )
                    )

                # Inner-join the features on the id so every row carries all
                # five values of the same text.
                merged_df = feature_frames[0]
                for feature_frame in feature_frames[1:]:
                    merged_df = merged_df.merge(feature_frame, on="index")

                # Standardize each feature within this group, then average the
                # five standardized values per row.
                # NOTE(review): because the scaler is fitted separately on the
                # original and on the rewritten group, every feature has unit
                # variance in both groups; a change in the spread of a single
                # feature cannot show up in `mean`. Confirm this is intended.
                standardized = [
                    StandardScaler()
                    .fit_transform(merged_df[value_column].values.reshape(-1, 1))
                    .flatten()
                    for _, value_column, _ in complexity_feature_table
                ]
                merged_df["value"] = np.mean(standardized, axis=0)

                summary_columns["mean"].append(merged_df["value"].values)
                for _, value_column, summary_column in complexity_feature_table:
                    summary_columns[summary_column].append(
                        merged_df[value_column].values
                    )
                summary_columns["llm"].append(llm)
                summary_columns["mode"].append(mode)
                summary_columns["dataset"].append(dataset)
                summary_columns["is_original"].append(is_original)

filtered_df = pd.DataFrame(summary_columns)

In [40]:
from IPython.display import display

# A variance change is starred when the unrounded Levene p-value is at or
# below this level.
SIGNIFICANCE_LEVEL = 0.001

# Column layout of the summary table: (value kind, dataset) pairs.
summary_column_order = [
    ("significance", "essays"),
    ("statistic", "essays"),
    ("significance", "facebook"),
    ("statistic", "facebook"),
    ("significance", "political"),
    ("statistic", "political"),
    ("significance", "wassa"),
    ("statistic", "wassa"),
]


def _group_values(dataset, llm, mode, is_original, column):
    """Return the array stored in ``filtered_df[column]`` for one group."""
    selected = filtered_df[
        (filtered_df["dataset"] == dataset)
        & (filtered_df["llm"] == llm)
        & (filtered_df["mode"] == mode)
        & (filtered_df["is_original"] == is_original)
    ]
    return selected[column].values[0]


for col in [
    "mean",
]:
    print("---------------------------------")
    print("analysis for: ", col)

    summary_rows = []

    for dataset in filtered_df["dataset"].unique():
        for llm in filtered_df["llm"].unique():
            for mode in filtered_df["mode"].unique():
                original_values = _group_values(dataset, llm, mode, True, col)
                rewritten_values = _group_values(dataset, llm, mode, False, col)

                # Levene's test for equal variances (scipy's default
                # center="median").
                statistic, p_value = levene(original_values, rewritten_values)

                # Decide significance on the raw p-value. Comparing after
                # rounding to 3 decimals would star p-values up to ~0.0015.
                is_significant = p_value <= SIGNIFICANCE_LEVEL

                p = np.round(p_value, 3)
                original_vars = np.round(np.var(original_values), 4)
                rewritten_vars = np.round(np.var(rewritten_values), 4)
                label = f"{original_vars} -> {rewritten_vars} ({p})"
                if is_significant:
                    label += "*"

                summary_rows.append(
                    {
                        "dataset": dataset,
                        "llm": llm,
                        "mode": mode,
                        "significance": label,
                        "statistic": np.round(statistic, 3),
                    }
                )

    final_df = pd.DataFrame(
        summary_rows,
        columns=["dataset", "llm", "mode", "significance", "statistic"],
    )

    # Each (llm, mode, dataset) cell holds exactly one row, so the identity
    # aggregation just moves the value into place. Build the table once and
    # reuse it for both the rich display and the LaTeX export.
    summary_table = final_df.pivot_table(
        index=["llm", "mode"],
        columns="dataset",
        values=["significance", "statistic"],
        aggfunc=lambda x: x,
    ).reindex(columns=summary_column_order)

    display(summary_table)
    print(summary_table.to_latex())

---------------------------------
analysis for:  mean


Unnamed: 0_level_0,Unnamed: 1_level_0,significance,statistic,significance,statistic,significance,statistic,significance,statistic
Unnamed: 0_level_1,dataset,essays,essays,facebook,facebook,political,political,wassa,wassa
llm,mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gemini,rephrase,0.0151 -> 0.0112 (0.0)*,39.233,0.0092 -> 0.0074 (0.0)*,119.808,0.0197 -> 0.0083 (0.0)*,131.365,0.0079 -> 0.0071 (0.958),0.003
gemini,syntax_grammar,0.0151 -> 0.0089 (0.0)*,103.269,0.0092 -> 0.0094 (0.0)*,36.435,0.0197 -> 0.0123 (0.0)*,37.274,0.0078 -> 0.0075 (0.826),0.048
gpt,rephrase,0.0151 -> 0.0152 (0.216),1.53,0.0092 -> 0.006 (0.0)*,93.474,0.0197 -> 0.0108 (0.0)*,65.211,0.008 -> 0.0052 (0.164),1.959
gpt,syntax_grammar,0.0151 -> 0.0153 (0.17),1.88,0.0092 -> 0.0075 (0.0)*,63.159,0.0197 -> 0.0118 (0.0)*,47.236,0.008 -> 0.0069 (0.407),0.693
llama,rephrase,0.015 -> 0.0159 (0.63),0.232,0.0083 -> 0.0073 (0.13),2.299,0.0197 -> 0.0138 (0.0)*,62.6,0.008 -> 0.0062 (0.532),0.393
llama,syntax_grammar,0.015 -> 0.0124 (0.0)*,127.087,0.0082 -> 0.007 (0.004),8.395,0.0197 -> 0.0152 (0.0)*,36.412,0.008 -> 0.0105 (0.372),0.802


\begin{tabular}{lllrlrlrlr}
\toprule
 &  & significance & statistic & significance & statistic & significance & statistic & significance & statistic \\
 & dataset & essays & essays & facebook & facebook & political & political & wassa & wassa \\
llm & mode &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{2}{*}{gemini} & rephrase & 0.0151 -> 0.0112 (0.0)* & 39.233000 & 0.0092 -> 0.0074 (0.0)* & 119.808000 & 0.0197 -> 0.0083 (0.0)* & 131.365000 & 0.0079 -> 0.0071 (0.958) & 0.003000 \\
 & syntax_grammar & 0.0151 -> 0.0089 (0.0)* & 103.269000 & 0.0092 -> 0.0094 (0.0)* & 36.435000 & 0.0197 -> 0.0123 (0.0)* & 37.274000 & 0.0078 -> 0.0075 (0.826) & 0.048000 \\
\cline{1-10}
\multirow[t]{2}{*}{gpt} & rephrase & 0.0151 -> 0.0152 (0.216) & 1.530000 & 0.0092 -> 0.006 (0.0)* & 93.474000 & 0.0197 -> 0.0108 (0.0)* & 65.211000 & 0.008 -> 0.0052 (0.164) & 1.959000 \\
 & syntax_grammar & 0.0151 -> 0.0153 (0.17) & 1.880000 & 0.0092 -> 0.0075 (0.0)* & 63.159000 & 0.0197 -> 0.0118 (0.0)* & 47.236000 & 0.00