In [11]:
import pandas as pd
import numpy as np
import json
import os

In [12]:
def aggregate_data(data_to_agg, output_file_name):
    """Join each speaker's texts into a single document and save the result.

    Rows are ordered by ``id`` before grouping, so every speaker's texts are
    concatenated in ``id`` order, separated by ``".\\n"``. The average
    whitespace-token count is printed before and after aggregation, and the
    aggregated frame is written to ``output_file_name`` as CSV (no index).

    The input frame is NOT modified: sorting happens on a copy, so the caller's
    row order and column order survive the call.

    Parameters
    ----------
    data_to_agg : pandas.DataFrame
        Must contain the columns ``id``, ``speakerid``, ``text``, ``cohort``,
        ``party`` and ``gender``. ``text`` values must be strings.
    output_file_name : str or path-like
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``speakerid``, ``text``, ``cohort``, ``party``, ``gender``.
        There is one row per speaker as long as each speaker has a single
        cohort/party/gender combination in ``data_to_agg``; a speaker with
        several distinct combinations yields one row per combination.
    """
    meta_cols = ["cohort", "party", "gender"]

    def _mean_word_count(texts):
        # Word count = number of whitespace-separated tokens.
        return np.mean(texts.apply(lambda x: len(x.split())))

    # sort_values returns a new frame (the caller's DataFrame stays as it was);
    # the stable sort keeps the incoming order for rows that share an id.
    ordered = data_to_agg.sort_values("id", kind="stable").reset_index(drop=True)
    print("Average text length:", _mean_word_count(ordered["text"]))

    new_df = (
        ordered.groupby("speakerid")["text"]
        .apply(lambda x: ".\n".join(x))
        .reset_index()
    )

    # Attach the speaker-level attributes with a single left merge.
    speaker_meta = ordered[["speakerid"] + meta_cols].drop_duplicates()
    new_df = pd.merge(new_df, speaker_meta, on="speakerid", how="left")

    print(new_df.shape)
    print("Average text length:", _mean_word_count(new_df["text"]))

    new_df.to_csv(output_file_name, index=False)
    return new_df

In [13]:
# Load the cleaned data and cast the "id" column to str in one chained step.
main_df = pd.read_csv("clean_data.csv").astype({"id": str})

# Aggregate the original texts per speaker, save them, and preview the result.
new_df = aggregate_data(main_df, "clean_data_agg.csv")
new_df.head(3)

Average text length: 829.5143297380586
(710, 5)
Average text length: 3791.230985915493


Unnamed: 0,speakerid,text,cohort,party,gender
0,97105451,Mr. President. I am introducing today a bill t...,over 70,R,M
1,97105580,Mr. Speaker. today I am introducing legislatio...,41-55,R,F
2,97105760,Mr. Speaker. reserving the right to object. I ...,over 70,R,M


In [14]:
# For every rewritten corpus: load the JSON, attach the speaker metadata from
# main_df by id, aggregate per speaker, and save a CSV named after the JSON file.
for file in [
    "political_rewritten_rephrase_gemini_cleaned.json",
    "political_rewritten_syntax_grammar_gemini_cleaned.json",
    "political_rewritten_rephrase_gpt_cleaned.json",
    "political_rewritten_syntax_grammar_gpt_cleaned.json",
    "political_rewritten_rephrase_llama_cleaned.json",
    "political_rewritten_syntax_grammar_llama_cleaned.json",
]:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # assumes each file is a flat {id: rewritten text} mapping — TODO confirm
    rewritten_df = pd.DataFrame.from_records(
        list(data.items()), columns=["id", "text"]
    )
    print("Shape of the rewritten data:", rewritten_df.shape)

    # One left merge brings in all metadata columns; ids missing from main_df
    # keep their row and get NaN metadata.
    rewritten_df = pd.merge(
        rewritten_df,
        main_df[["id", "cohort", "party", "gender", "speakerid"]].drop_duplicates(),
        on="id",
        how="left",
    )
    print("Shape of the rewritten data:", rewritten_df.shape)

    # Swap only the trailing extension; str.replace would rewrite every ".json"
    # occurrence in the name.
    new_df = aggregate_data(rewritten_df, os.path.splitext(file)[0] + ".csv")
    print("Shape of the aggregated data:", new_df.shape)

Shape of the rewritten data: (3197, 2)
Shape of the rewritten data: (3197, 6)
Average text length: 258.98310916484206
(710, 5)
Average text length: 1166.5056338028169
Shape of the aggregated data: (710, 5)
Shape of the rewritten data: (3195, 2)
Shape of the rewritten data: (3195, 6)
Average text length: 589.8378716744913
(710, 5)
Average text length: 2654.574647887324
Shape of the aggregated data: (710, 5)
Shape of the rewritten data: (3245, 2)
Shape of the rewritten data: (3245, 6)
Average text length: 291.71833590138675
(710, 5)
Average text length: 1333.2760563380282
Shape of the aggregated data: (710, 5)
Shape of the rewritten data: (3245, 2)
Shape of the rewritten data: (3245, 6)
Average text length: 473.71032357473035
(710, 5)
Average text length: 2165.056338028169
Shape of the aggregated data: (710, 5)
Shape of the rewritten data: (3241, 2)
Shape of the rewritten data: (3241, 6)
Average text length: 280.071582844801
(710, 5)
Average text length: 1281.0718309859155
Shape of the a