In [1]:
import pandas as pd

In [2]:
def get_stats(df):
    """Summarise the class balance of the ``sarcastic`` column of *df*.

    Args:
        df: DataFrame with a ``sarcastic`` column whose rows are looked up
            under the labels 1 (sarcastic) and 0 (non-sarcastic).

    Returns:
        A dict with the keys "Sarcastic", "Non Sarcastic", "Total",
        "% Sarcastic" and "% Non Sarcastic". The two percentages are
        strings formatted to two decimals, e.g. "24.99%".

    Raises:
        KeyError: if *df* has no ``sarcastic`` column.
    """
    counts = df['sarcastic'].value_counts()
    sarcastic = counts.get(1, 0)
    non_sarcastic = counts.get(0, 0)
    # Sum of every counted label, so a label other than 0/1 would still be
    # included here. value_counts() skips missing values by default.
    total = counts.sum()

    def as_percent(part):
        # An empty split would otherwise divide by zero.
        return f"{(part / total) * 100:.2f}%" if total else "0.00%"

    return {
        "Sarcastic": sarcastic,
        "Non Sarcastic": non_sarcastic,
        "Total": total,
        "% Sarcastic": as_percent(sarcastic),
        "% Non Sarcastic": as_percent(non_sarcastic),
    }


# Each name is a directory expected to hold train.csv, test.csv and val.csv.
datasets = ['iSarcasmEval', 'Sarcasm_Corpus_V2', 'Combined']

for dataset in datasets:
    # Load the three splits and stack them to get whole-dataset figures.
    df_train = pd.read_csv(f"{dataset}/train.csv")
    df_test = pd.read_csv(f"{dataset}/test.csv")
    df_val = pd.read_csv(f"{dataset}/val.csv")
    df_total = pd.concat([df_train, df_test, df_val])

    # One row of statistics per split, plus the combined total.
    stats = {
        "Train": get_stats(df_train),
        "Validation": get_stats(df_val),
        "Test": get_stats(df_test),
        "Total": get_stats(df_total),
    }

    # Transpose so splits become rows and the statistics become columns.
    stats_df = pd.DataFrame(stats).T
    print(f"\n--- {dataset} Dataset Statistics ---")
    print(stats_df.to_string())



--- iSarcasmEval Dataset Statistics ---
           Sarcastic Non Sarcastic Total % Sarcastic % Non Sarcastic
Train            780          2341  3121      24.99%          75.01%
Validation        87           260   347      25.07%          74.93%
Test             200          1200  1400      14.29%          85.71%
Total           1067          3801  4868      21.92%          78.08%

--- Sarcasm_Corpus_V2 Dataset Statistics ---
           Sarcastic Non Sarcastic Total % Sarcastic % Non Sarcastic
Train           3377          3378  6755      49.99%          50.01%
Validation       376           377   753      49.93%          50.07%
Test             940           938  1878      50.05%          49.95%
Total           4693          4693  9386      50.00%          50.00%

--- Combined Dataset Statistics ---
           Sarcastic Non Sarcastic  Total % Sarcastic % Non Sarcastic
Train           4157          5719   9876      42.09%          57.91%
Validation       463           637   1100     