In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [None]:
main_df = pd.read_csv("political_data.csv")
main_df.head(3)

In [3]:
main_df["text"] = main_df["text"].apply(literal_eval)

In [4]:
main_df["num_speeches"].min(), main_df["num_speeches"].max(), main_df[
    "speakerid"
].nunique()

(1, 21142, 8520)

In [5]:
def filter_based_on_longer_texts(users_texts):
    MAX_THRESHOLD_TEXTS = 3700
    new_users_texts = []
    for user_texts in users_texts:

        user_texts_lengths = {
            i: len(user_texts[i].split()) for i in range(len(user_texts))
        }
        user_texts_lengths_sorted = sorted(
            user_texts_lengths.items(), key=lambda item: item[1], reverse=True
        )
        curr_len = 0
        new_user_texts = []
        curr_index = 0
        while curr_len < MAX_THRESHOLD_TEXTS and curr_index < len(
            user_texts_lengths_sorted
        ):
            if (
                curr_len + user_texts_lengths_sorted[curr_index][1]
                > MAX_THRESHOLD_TEXTS + 200
            ):
                curr_index += 1
                continue
            if user_texts_lengths_sorted[curr_index][1] > 1700:
                curr_index += 1
                continue
            new_user_texts.append(user_texts[user_texts_lengths_sorted[curr_index][0]])
            curr_len += user_texts_lengths_sorted[curr_index][1]
            curr_index += 1

        new_users_texts.append(new_user_texts)
    return new_users_texts

In [6]:
main_df.shape

(8521, 17)

In [7]:
df = main_df.copy()
df["text"] = filter_based_on_longer_texts(df["text"])

In [8]:
df["cohort"] = df["age"].apply(
    lambda x: (
        "27-40"
        if 27 <= x <= 40
        else "41-55" if 41 <= x <= 55 else "56-70" if 56 <= x <= 70 else "over 70"
    )
)

In [9]:
groups_sizes = df.groupby(["cohort", "party", "gender"]).size()
groups_sizes

cohort   party  gender
27-40    D      F           15
                M          368
         R      F           20
                M          362
41-55    D      F          202
                M         1656
         R      F          109
                M         1672
56-70    D      F          264
                M         1374
         R      F          120
                M         1302
over 70  D      F           73
                M          544
         R      F           25
                M          415
dtype: int64

In [10]:
final_df = None
for cohort, party, gender in groups_sizes.index:
    max_size = groups_sizes.loc[cohort, party, gender]
    curr_final_df = df[
        (df["cohort"] == cohort) & (df["party"] == party) & (df["gender"] == gender)
    ].sample(n=min(50, max_size), random_state=42, replace=False)
    if final_df is None:
        final_df = curr_final_df
    else:
        final_df = pd.concat([final_df, curr_final_df])

In [11]:
final_df = final_df[["speakerid", "text", "cohort", "party", "gender"]]

In [12]:
final_df

Unnamed: 0,speakerid,text,cohort,party,gender
5639,111116290,[Thank you. Chairman TANNER. for this opportun...,27-40,D,F
6303,104114320,[I just think it is very important for the Ame...,27-40,D,F
933,113121980,[Thank you. Representative LAMALFA. for your c...,27-40,D,F
7513,105121190,[Mr. Speaker. thank you for the opportunity to...,27-40,D,F
2399,106118520,[Mr. Speaker. I rise today to pay tribute to a...,27-40,D,F
...,...,...,...,...,...
5558,111115400,[Mr. Chair. I rise reluctantly to oppose H.R. ...,over 70,R,M
7621,107113550,[Mr. Chairman. I want to begin by thanking my ...,over 70,R,M
2231,106114880,[Mr. Chairman. I thank the gentleman for yield...,over 70,R,M
5695,111117000,[I thank the gentleman for yielding. and I wan...,over 70,R,M


In [64]:
all_speaker_ids = []
all_texts = []
all_cohorts = []
all_parties = []
all_genders = []
for index, row in final_df.iterrows():
    for text in row["text"]:
        all_speaker_ids.append(row["speakerid"])
        all_texts.append(text)
        all_cohorts.append(row["cohort"])
        all_parties.append(row["party"])
        all_genders.append(row["gender"])

final_df = pd.DataFrame(
    {
        "speakerid": all_speaker_ids,
        "text": all_texts,
        "cohort": all_cohorts,
        "party": all_parties,
        "gender": all_genders,
    }
)

In [66]:
final_df["text_lengths"] = final_df["text"].apply(lambda x: len(x.split()))

In [69]:
final_df = final_df[final_df["text_lengths"] > 100]

In [79]:
final_df["id"] = final_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["id"] = final_df.index


In [84]:
# final_df.to_csv("clean_data.csv", index=False)

In [1]:
import pandas as pd

In [12]:
df = pd.read_csv("clean_data_agg.csv")

In [13]:
df["cohort"].unique(), df["party"].unique(), df["gender"].unique()

(array(['over 70', '41-55', '27-40', '56-70'], dtype=object),
 array(['R', 'D'], dtype=object),
 array(['M', 'F'], dtype=object))

In [15]:
print(df["cohort"].value_counts()), print(df["party"].value_counts()), print(
    df["gender"].value_counts()
)

cohort
41-55      200
56-70      200
over 70    175
27-40      135
Name: count, dtype: int64
party
D    365
R    345
Name: count, dtype: int64
gender
M    400
F    310
Name: count, dtype: int64


(None, None, None)

In [18]:
df["speakerid"].nunique()

710