In [1]:
import glob
import os
import shutil
import sqlite3
from collections import Counter

import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Folder whose *.csv files are read one at a time by the analysis cells below.
directory = "../data/episodes"

In [None]:
# Load the episode-level table; the following cell reads its 'category1' column.
df = pd.read_csv("../data/episodeLevelData.csv")

In [None]:
# Tally how many rows of `df` carry each 'category1' value (most frequent first;
# value_counts leaves out missing values by default).
category1_counts = df['category1'].value_counts()
print(category1_counts)

In [None]:
# Export dates for entire corpus for time series comparison (RQ1)
#
# The connection is closed in `finally`, so a failing query cannot leak the
# database handle. The result gets its own name instead of overwriting `df`,
# which the category-count cell expects to hold the episode-level table.
conn = sqlite3.connect('../data/data.db')
try:
    episode_dates_df = pd.read_sql_query(
        "SELECT episodeDateLocalized FROM podcast_episodes", conn
    )
finally:
    conn.close()

episode_dates_df.to_csv('../data/all_episodes_dates.csv', index=False)

In [3]:
# norj_files = os.listdir("../data/norjepisodes")
# norj_ids = [os.path.splitext(f)[0] for f in norj_files]
# filtered_df = df[~df["epID"].astype(str).isin(norj_ids)]
# filtered_df.to_csv("../data/episodeLevelData_139.csv", index=False)
# print(f"Saved filtered file with {len(filtered_df)} rows.")

In [2]:
# race_counts = Counter()

# for filename in os.listdir(directory):
#     if filename.endswith(".csv"):
#         file_path = os.path.join(directory, filename)
#         try:
#             df = pd.read_csv(file_path)
#             if 'collectiveAction' in df.columns and 'race' in df.columns:
#                 subset = df[df['collectiveAction'] == 0]
#                 #race_counts.update(subset['race'].dropna())
#                 race_counts.update(subset['race'])  # NaN will be counted as a key
#         except Exception as e:
#             print(f"Error reading {filename}: {e}")

# race_summary = pd.DataFrame.from_dict(race_counts, orient='index', columns=['Count']).sort_values(by='Count', ascending=False)
# print(race_summary)

### Proportion of statements of collective action per episode

In [None]:
# For every CSV in `directory`: the share of rows that satisfy
# collectiveAction == 0 AND race == 1, followed by summary statistics
# of the per-file counts and proportions.
results = []

# Sorted so the row order of `results_df` is identical on every run
# (glob returns paths in arbitrary, filesystem-dependent order).
for file in sorted(glob.glob(os.path.join(directory, "*.csv"))):
    try:
        # Separate name: do not clobber the notebook-level `df`.
        episode_df = pd.read_csv(file)

        total_rows = len(episode_df)
        if total_rows == 0:
            continue  # no data rows -> proportion undefined, leave the file out

        # count rows with both conditions
        condition_count = (
            (episode_df['collectiveAction'] == 0) & (episode_df['race'] == 1)
        ).sum()
        proportion = condition_count / total_rows

        results.append({
            "file": os.path.basename(file),
            "total_rows": total_rows,
            "condition_count": condition_count,
            "proportion": proportion
        })
    except Exception as e:
        # Best effort: report the file (unreadable, missing column, ...) and go on.
        print(f"Error processing {file}: {e}")

# Explicit columns keep the selection below valid even when no file was usable;
# an empty list would otherwise give a column-less frame and a KeyError.
results_df = pd.DataFrame(
    results, columns=["file", "total_rows", "condition_count", "proportion"]
)

summary_stats = results_df[["condition_count", "proportion"]].describe()
summary_stats

### Proportion of statements of each level of collective action per episode

In [None]:
def t_ci(data, alpha=0.05):
    """Two-sided (1 - alpha) Student-t confidence interval for the mean of `data`.

    scipy is a hard requirement of this notebook (it is imported in the first
    cell), so there is a single definition and no normal-approximation
    fallback; a fixed z = 1.96 would silently ignore `alpha`.

    Parameters
    ----------
    data : array-like of numbers
        Sample values; converted to a float array.
    alpha : float, default 0.05
        Significance level; 0.05 gives a 95% interval.

    Returns
    -------
    tuple
        ``(lower, upper)``. Collapses to ``(mean, mean)`` when there are fewer
        than two values or the sample standard deviation is numerically zero,
        and is ``(nan, nan)`` for empty input.
    """
    data = np.asarray(data, dtype=float)
    n = len(data)
    if n == 0:
        # Mean of nothing is undefined; say so without numpy's empty-slice warning.
        return (np.nan, np.nan)

    mean = data.mean()
    if n < 2:
        return (mean, mean)

    sd = data.std(ddof=1)  # sample standard deviation, computed once
    if np.isclose(sd, 0):
        return (mean, mean)

    sem = sd / np.sqrt(n)
    tcrit = stats.t.ppf(1 - alpha/2, df=n-1)
    return (mean - tcrit*sem, mean + tcrit*sem)

# Per-file distribution of collectiveActionLevel among rows with
# collectiveAction == 0 and race == 1, then the across-file mean and 95% CI
# of each level's proportion.
pattern = os.path.join(directory, "*.csv")

required_cols = {"collectiveAction", "race", "collectiveActionLevel"}
all_props = []    # one normalized value_counts Series per usable file
used_files = []   # paths behind the rows of prop_df, same order as all_props
skipped = []      # (path, reason) for every file left out

# Sorted: glob order is arbitrary and it decides the row order of prop_df.
for path in sorted(glob.glob(pattern)):
    try:
        if os.path.getsize(path) == 0:
            skipped.append((path, "empty file (0 bytes)"))
            continue
    except OSError as e:
        skipped.append((path, f"stat error: {e}"))
        continue

    try:
        # NOTE(review): on_bad_lines="skip" silently drops malformed lines, while
        # the other cells call read_csv with defaults -- row totals may differ
        # between cells for the same file. Confirm this is intended.
        episode_df = pd.read_csv(path, engine="python", on_bad_lines="skip")
    except pd.errors.EmptyDataError:
        skipped.append((path, "EmptyDataError: no columns/rows"))
        continue
    except Exception as e:
        skipped.append((path, f"read error: {e}"))
        continue

    if not required_cols.issubset(episode_df.columns):
        missing = required_cols - set(episode_df.columns)
        skipped.append((path, f"missing columns: {sorted(missing)}"))
        continue

    sub = episode_df[(episode_df["collectiveAction"] == 0) & (episode_df["race"] == 1)]
    if sub.empty:
        skipped.append((path, "no rows after filter (collectiveAction==0 & race==1)"))
        continue

    props = sub["collectiveActionLevel"].value_counts(normalize=True, dropna=True)
    if props.empty:
        # Every filtered row has a missing level. Keeping this file would add an
        # all-zero row after fillna(0.0) below and pull every mean towards zero.
        skipped.append((path, "collectiveActionLevel missing on all filtered rows"))
        continue

    all_props.append(props)
    used_files.append(path)

if not all_props:
    print("No file produced usable proportions. Files skipped (reason):")
    for p, r in skipped:
        print(f"- {os.path.basename(p)} -> {r}")
else:
    # Rows = files, columns = levels; a level absent from a file counts as 0.
    prop_df = pd.DataFrame(all_props).fillna(0.0)
    prop_df.index = [os.path.basename(p) for p in used_files]

    means = prop_df.mean(axis=0)
    cis = {cat: t_ci(prop_df[cat].values) for cat in prop_df.columns}

    out = pd.DataFrame({
        "mean_proportion": means,
        "ci_lower": [cis[c][0] for c in means.index],
        "ci_upper": [cis[c][1] for c in means.index],
        "n_files": len(used_files)
    }).sort_index()

    print("Per-category average proportions and 95% confidence intervals:")
    print(out.round(4))
    print("\nFiles used:")
    for p in used_files:
        print(" -", os.path.basename(p))

    if skipped:
        print("\nFiles skipped (reason):")
        for p, r in skipped:
            print(f" - {os.path.basename(p)} -> {r}")


### Category counts for collective action levels

In [None]:
# Corpus-wide totals over every CSV in `directory`:
#   - rows, rows with collectiveAction == 0, rows with race == 1, rows with both
#   - per-file average count / share of rows satisfying both conditions
#   - how often each collectiveActionLevel occurs among collectiveAction == 0 rows
total_files = 0
total_rows = 0
total_ca0_rows = 0
total_rj_rows = 0
total_ca0_rj_rows = 0
ca0_percentages = []       # per file: share of rows with both conditions
ca0_counts_per_file = []   # per file: number of rows with both conditions

collective_action_level_counts = {}

# Sorted so files are processed (and levels first seen) in a reproducible order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".csv"):
        continue
    filepath = os.path.join(directory, filename)
    try:
        episode_df = pd.read_csv(filepath, usecols=["collectiveAction", "collectiveActionLevel", "race"])
        num_rows = len(episode_df)
        is_ca0 = episode_df["collectiveAction"] == 0
        is_rj = episode_df["race"] == 1
        num_ca0 = is_ca0.sum()
        num_rj = is_rj.sum()
        num_ca0_rj = (is_ca0 & is_rj).sum()

        # NOTE(review): levels are tallied over all collectiveAction == 0 rows,
        # without the race == 1 filter used by the proportions cell -- confirm.
        level_counts = episode_df.loc[is_ca0, "collectiveActionLevel"].value_counts(dropna=False)
    except ValueError as ve:
        print(f"Skipping {filename}: Missing expected columns. ({ve})")
        continue
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        continue

    # Totals are only touched once the file was fully processed, so
    # `total_files` always matches the length of the per-file lists.
    total_files += 1
    total_rows += num_rows
    total_ca0_rows += num_ca0
    total_rj_rows += num_rj
    total_ca0_rj_rows += num_ca0_rj

    ca0_percentages.append(num_ca0_rj / num_rows if num_rows > 0 else 0)
    ca0_counts_per_file.append(num_ca0_rj)

    for level, count in level_counts.items():
        # NaN labels coming from different files are not guaranteed to be the
        # same object, and NaN != NaN, so raw NaN keys would never merge. Map
        # every missing label to the single np.nan object instead.
        key = np.nan if pd.isna(level) else level
        collective_action_level_counts[key] = collective_action_level_counts.get(key, 0) + count

average_ca0_count = sum(ca0_counts_per_file) / total_files if total_files > 0 else 0
average_ca0_percentage = sum(ca0_percentages) / total_files if total_files > 0 else 0

print(f"\nProcessed {total_files} CSV files.")
print(f"Total number of rows in all files: {total_rows}")
print(f"Total number of rows with collectiveAction (collectiveAction == 0): {total_ca0_rows}")
print(f"Total number of rows with racial justice (race == 1): {total_rj_rows}")
print(f"Total number of rows with collectiveAction and racial justice: {total_ca0_rj_rows}")
print(f"Average number of rows with collectiveAction and racial justice per file: {average_ca0_count:.2f}")
print(f"Average percentage of rows with collectiveAction and racial justice per file: {average_ca0_percentage * 100:.2f}%")
print("\nTotal counts for each collectiveActionLevel:")
for level, count in collective_action_level_counts.items():
    print(f"  {level}: {count}")

### Sequences of collective action statements

In [None]:
# Gather every row with collectiveAction == 0 and race == 1 from all CSVs in
# `directory` and write them to a single file.
# `os`, `pd` and `directory` come from the setup cells at the top of the
# notebook; they are not re-imported / redefined here.
# NOTE(review): the concatenated rows carry no source-file column unless the
# CSVs already contain one -- confirm the episode can still be identified.
filtered_rows = []

# Sorted so the row order of the exported CSV is reproducible
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(directory, filename)
    try:
        episode_df = pd.read_csv(file_path)
        filtered = episode_df[(episode_df['collectiveAction'] == 0) & (episode_df['race'] == 1)]
        if not filtered.empty:
            print(f"Added: {filename}")
            filtered_rows.append(filtered)
    except Exception as e:
        # Best effort: report the file (unreadable, missing column, ...) and go on.
        print(f"Error reading {filename}: {e}")

if filtered_rows:
    result_df = pd.concat(filtered_rows, ignore_index=True)
    result_df.to_csv("../data/all_racial_justice_sentences.csv", index=False)
    print(f"Saved {len(result_df)} rows from {len(filtered_rows)} files.")
else:
    # Make the no-op visible so a stale output file is not mistaken for a fresh one.
    print("No matching rows found; ../data/all_racial_justice_sentences.csv was not written.")