In [1]:
import json
from collections import Counter, defaultdict
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind
import seaborn as sns
from scipy.stats import shapiro, normaltest, kstest
import scipy.stats as stats

## Add Period and Media Type Labels

In [None]:
# === Load Excel file and create lookup dictionaries ===
# Only the three columns needed for the lookups are read.
excel_df = pd.read_excel('../articles_584.xlsx', usecols=['id', 'period', 'media_type'])

# Create mapping dictionaries (article id -> label).
# Blank Excel cells are read as NaN. Passed through unchanged, json.dump would
# write them as the non-standard token `NaN`, and a blank media type would
# bypass the 'unknown' fallback used below. So a blank period is stored as
# None (JSON null) and a blank media type is left out of its lookup entirely.
id_to_period = {
    excel_id: (None if pd.isna(period) else period)
    for excel_id, period in zip(excel_df['id'], excel_df['period'])
}
id_to_media_type = {
    excel_id: media_type
    for excel_id, media_type in zip(excel_df['id'], excel_df['media_type'])
    if not pd.isna(media_type)
}

# === Load original JSON file ===
with open('step2_all.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# === Enrich each article with 'period' and 'media_type' ===
# Article keys are parsed as '<prefix>_<numeric id>'; the numeric part is
# matched against the Excel 'id' column.
unmatched_keys = []  # dict-valued articles whose id has no row in the Excel sheet
for article_key, article_content in json_data.items():
    try:
        article_id = int(article_key.split('_')[1])
    except (IndexError, ValueError):
        print(f"⚠️ Skipping malformed key: {article_key}")
        continue

    if isinstance(article_content, dict):
        if article_id not in id_to_period:
            # Keep enriching with the fallbacks, but remember the key so the
            # missing labels are reported instead of passing silently.
            unmatched_keys.append(article_key)
        article_content['period'] = id_to_period.get(article_id)
        article_content['media_type'] = id_to_media_type.get(article_id, 'unknown')
    else:
        print(f"⚠️ Skipping {article_key} — not a dictionary.")

if unmatched_keys:
    # Cap the listing so a large mismatch does not flood the cell output.
    preview = ', '.join(unmatched_keys[:10])
    suffix = ', ...' if len(unmatched_keys) > 10 else ''
    print(
        f"⚠️ {len(unmatched_keys)} article(s) have no row in the Excel sheet "
        f"(period left empty, media_type set to 'unknown'): {preview}{suffix}"
    )

# === Save enriched JSON to file ===
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open('step2_all_with_periods_media.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=2, ensure_ascii=False)

print("✅ Enriched JSON saved to 'step2_all_with_periods_media.json'.")


⚠️ Skipping Article_584 — not a dictionary.
⚠️ Skipping Article_589 — not a dictionary.
⚠️ Skipping Article_613 — not a dictionary.
⚠️ Skipping Article_621 — not a dictionary.
⚠️ Skipping Article_630 — not a dictionary.
✅ Enriched JSON saved to 'step2_all_with_periods_media.json'.


## Convert JSON to a Pivot Table

In [None]:
# Read the enriched JSON (articles already carry their 'period' label)
with open('step2_all_with_periods_media.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One dict per article: id, period, and a count for every entity_type seen
article_rows = []

for article_id, article_data in data.items():
    # Entries that are not dictionaries carry no entities; leave them out
    if not isinstance(article_data, dict):
        continue

    period = article_data.get('period')  # None when the label is absent

    # Entities are the dict-valued members of the article; the 'period' entry
    # and any other non-dict member are metadata, not entities.
    entity_types = (
        entity.get('entity_type')
        for entity_key, entity in article_data.items()
        if entity_key != 'period' and isinstance(entity, dict)
    )
    # Entities with a missing or empty entity_type are not counted
    entity_type_counter = Counter(
        entity_type for entity_type in entity_types if entity_type
    )

    # Metadata first, then one key per entity_type with its count
    row = {'article_id': article_id, 'period': period, **entity_type_counter}
    article_rows.append(row)

# Assemble the per-article table
df = pd.DataFrame(article_rows)

# An entity_type that never occurs in an article shows up as NaN: make it 0
metadata_cols = ('article_id', 'period')
entity_type_cols = [col for col in df.columns if col not in metadata_cols]
df[entity_type_cols] = df[entity_type_cols].fillna(0).astype(int)

# Write the table as CSV
csv_output = 'sources_per_article.csv'
df.to_csv(csv_output, index=False)
print(f"✅ CSV saved: {csv_output}")

# Write the same table as an Excel workbook
excel_output = 'sources_per_article.xlsx'
df.to_excel(excel_output, index=False)
print(f"✅ Excel saved: {excel_output}")
