In [1]:
# ============================================================
# Circadian Semantic Exploration Analysis
# Global vs User-level Entropy
# ============================================================
import glob
import numpy as np
import pandas as pd
import faiss
import pytz

from datetime import datetime
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

tqdm.pandas()

In [None]:
# ------------------------------------------------------------
# 0. Load timezone lookup ONCE
# ------------------------------------------------------------

# City/country -> timezone table, read a single time here and reused by the
# per-month merge on ["loc_city", "loc_country"] further down.
tz_lookup = pd.read_csv(
    filepath_or_buffer="city_country_timezone_lookup.csv"
)

# ------------------------------------------------------------
# DST-aware UTC → local hour
# ------------------------------------------------------------

def utc_to_local_hour_dst(utc_ts, month, timezone_str):
    """Return the local clock hour (0-23) of a UTC instant in a named zone.

    The conversion is applied to the timestamp's own calendar date, so the
    UTC offset used (including daylight-saving time) is the one in force at
    that instant. Forcing the date to mid-month instead shifts the result by
    one hour for every post that falls on the other side of a DST switch
    inside the same month.

    Parameters
    ----------
    utc_ts : pandas.Timestamp or datetime.datetime
        Timezone-aware instant. A missing value yields NaN.
    month : int or str
        Not used by the conversion; kept in the signature so existing call
        sites that pass it keep working.
    timezone_str : str
        Zone name understood by pandas (e.g. "America/New_York"). A missing
        value yields NaN.

    Returns
    -------
    int or float
        Local hour, or ``np.nan`` when an input is missing or the conversion
        fails (unknown zone name, timezone-naive timestamp, ...).
    """
    if pd.isna(utc_ts) or pd.isna(timezone_str):
        return np.nan
    try:
        # tz_convert resolves the zone name and applies the offset that is
        # valid for this exact instant.
        return pd.Timestamp(utc_ts).tz_convert(timezone_str).hour
    except Exception:
        # Deliberate best-effort: rows that cannot be localised become NaN
        # and are dropped by the caller instead of aborting a long run.
        return np.nan

# ------------------------------------------------------------
# Entropy functions
# ------------------------------------------------------------

def individual_entropy(X, k=10):
    """Per-point dispersion score: log of the distance to the k-th neighbour.

    Every row of ``X`` is searched against all rows of ``X`` with an exact
    FAISS L2 index. ``k + 1`` results are requested because the query point
    itself is part of the index; the last column is taken as the k-th
    neighbour. FAISS reports squared L2 distances for this index type, so
    the returned value is the log of the squared distance (plus 1e-10 to
    keep the log finite for duplicates).

    Parameters
    ----------
    X : array-like, shape (n_points, dim)
        Embedding matrix.
    k : int, default 10
        Neighbour rank to measure.

    Returns
    -------
    numpy.ndarray, shape (n_points,)
        One score per row. When ``n_points <= k`` the k-th neighbour does
        not exist, so an all-NaN array is returned instead of the log of
        the filler distance the search would otherwise report.
    """
    # FAISS needs C-contiguous float32 input.
    X = np.ascontiguousarray(X, dtype="float32")
    n_points = X.shape[0]

    # Fewer than k other points: the requested neighbour rank is undefined.
    if n_points <= k:
        return np.full(n_points, np.nan, dtype="float32")

    index = faiss.IndexFlatL2(X.shape[1])
    index.add(X)
    D, _ = index.search(X, k + 1)
    r_k = D[:, -1]
    return np.log(r_k + 1e-10)


def global_entropy(X, eps=1e-6):
    """Differential entropy of a Gaussian fitted to the rows of ``X``.

    Computes ``0.5 * (d * log(2*pi*e) + log det(cov + eps * I))`` where
    ``cov`` is the sample covariance of the columns of ``X`` and ``d`` is
    the number of columns.

    Parameters
    ----------
    X : array-like, shape (n_points, d)
        Data matrix, one observation per row.
    eps : float, default 1e-6
        Ridge added to the covariance diagonal so the determinant stays
        positive when the covariance is rank-deficient.

    Returns
    -------
    float
        The entropy, or ``np.nan`` when it cannot be computed: fewer than
        two observations, no columns, or a non-positive / non-finite
        determinant.
    """
    X = np.asarray(X, dtype="float64")
    n_points, d = X.shape

    # A sample covariance needs at least two observations.
    if n_points < 2 or d < 1:
        return np.nan

    # np.cov collapses to a 0-d array when d == 1; restore the (d, d) shape
    # before adding the ridge term.
    cov = np.atleast_2d(np.cov(X, rowvar=False))
    cov = cov + eps * np.eye(d)

    sign, logdet = np.linalg.slogdet(cov)
    if not (sign > 0 and np.isfinite(logdet)):
        return np.nan

    return 0.5 * (d * np.log(2 * np.pi * np.e) + logdet)


# ------------------------------------------------------------
# Sentence embedding model
# ------------------------------------------------------------

# Sentence encoder shared by every month of the processing loop.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-mpnet-base-v2"
)

# Number of texts sliced off and handed to model.encode() per call.
batch_size = 8192


# ------------------------------------------------------------
# Monthly processing loop
# ------------------------------------------------------------

# Tunables for the per-group entropy stage.
MIN_GROUP_POSTS = 10   # smallest (country, local hour) group that is scored
KNN_K = 10             # neighbour rank passed to individual_entropy
PLACEHOLDER_TEXTS = ["", "[removed]", "[deleted]"]

# Column order of the two CSVs written per month.
INDIVIDUAL_COLUMNS = [
    "post_idx", "author", "local_hour", "created_utc", "utc_hour", "month",
    "individual_entropy", "loc_city", "loc_country", "timezone",
    "sentiment_compound",
]
GLOBAL_COLUMNS = [
    "month", "loc_country", "local_hour", "global_entropy", "n_posts",
]

for month in ["01", "02", "03", "04", "05", "06",
              "07", "08", "09", "10", "11", "12"]:

    print(f"\n========== Processing month {month} ==========")

    # ------------------------------------------------------------
    # 1. Load & clean data
    # ------------------------------------------------------------

    chunk_paths = sorted(
        glob.glob(f"./2024/RS_2024-{month}/chunk_*.csv")
    )
    if not chunk_paths:
        # pd.concat raises on an empty list; skip the month instead.
        print(f"No chunk files found for month {month}; skipping.")
        continue

    df = pd.concat(
        (pd.read_csv(p) for p in chunk_paths),
        ignore_index=True
    )

    df["selftext"] = df["selftext"].fillna("")
    df["title"] = df["title"].fillna("")
    df["text"] = df["title"] + ". " + df["selftext"]

    df["created_utc"] = pd.to_datetime(
        df["created_utc"], utc=True, errors="coerce"
    )

    # The placeholder test has to look at title/selftext themselves: the
    # combined "title. selftext" string always contains ". ", so comparing
    # it against the placeholders can never match.
    # NOTE(review): posts that have a title but a "[removed]"/"[deleted]"
    # body are still kept (as before) and embed the placeholder text;
    # confirm whether those should be dropped as well.
    no_text = (
        df["title"].str.strip().isin(PLACEHOLDER_TEXTS)
        & df["selftext"].str.strip().isin(PLACEHOLDER_TEXTS)
    )
    bad_author = df["author"].str.strip().isin(PLACEHOLDER_TEXTS)
    has_timestamp = df["created_utc"].notna()

    # .copy() so the column assignments below act on an independent frame.
    df = df[~no_text & ~bad_author & has_timestamp].copy()

    # Preserve UTC metadata ONLY
    df["utc_hour"] = df["created_utc"].dt.hour
    df["month"] = int(month)

    print(f"Posts after cleaning: {len(df)}")
    if df.empty:
        print(f"No usable posts for month {month}; skipping.")
        continue

    # ------------------------------------------------------------
    # 2. Merge timezone lookup
    # ------------------------------------------------------------

    df = df.merge(
        tz_lookup,
        on=["loc_city", "loc_country"],
        how="left"
    )

    # ------------------------------------------------------------
    # 3. Compute LOCAL HOUR (DST-aware)
    # ------------------------------------------------------------

    df["local_hour"] = df.progress_apply(
        lambda r: utc_to_local_hour_dst(
            r["created_utc"],
            r["month"],
            r["timezone"]
        ),
        axis=1
    )

    # Rows whose local hour could not be resolved come back as NaN.
    df = df[df["local_hour"].notna()].copy()
    df["local_hour"] = df["local_hour"].astype(int)

    if df.empty:
        # np.vstack raises on an empty list of batches.
        print(f"No posts with a resolvable local hour for month {month}; skipping.")
        continue

    # ------------------------------------------------------------
    # 4. Embed text
    # ------------------------------------------------------------

    texts = df["text"].tolist()
    embedding_batches = []

    for start in tqdm(
        range(0, len(texts), batch_size),
        desc="Embedding text"
    ):
        embedding_batches.append(
            model.encode(
                texts[start:start + batch_size],
                normalize_embeddings=True,
                device="cuda"
            )
        )

    # Keep one dense matrix (row i belongs to df.iloc[i]) instead of a
    # column of Python lists, which costs far more memory for the same
    # numbers.
    embeddings = np.vstack(embedding_batches)
    del embedding_batches
    df["_emb_row"] = np.arange(len(df))
    print("Embedding complete.")

    # ------------------------------------------------------------
    # 5. ENTROPY BY *LOCAL HOUR* (ONLY)
    # ------------------------------------------------------------

    individual_frames = []
    global_results = []
    has_sentiment = "sentiment_compound" in df.columns

    for (country, h), sub in tqdm(
        df.groupby(["loc_country", "local_hour"]),
        desc="Entropy per country × LOCAL hour"
    ):
        n_posts = len(sub)
        if n_posts < MIN_GROUP_POSTS:   # avoid unstable covariance
            continue

        X = embeddings[sub["_emb_row"].to_numpy()]

        # --- 1. Global entropy
        global_results.append({
            "month": month,
            "loc_country": country,
            "local_hour": h,
            "global_entropy": global_entropy(X),
            "n_posts": n_posts
        })

        # --- 2. Individual post entropy
        # The k-th neighbour only exists when the group holds more than k
        # posts; a group of exactly MIN_GROUP_POSTS would otherwise be
        # scored from a non-existent neighbour.
        if n_posts > KNN_K:
            ind_ent = np.asarray(
                individual_entropy(X, k=KNN_K), dtype="float64"
            )
        else:
            ind_ent = np.full(n_posts, np.nan)

        # Build the per-post rows for the whole group in one step.
        per_post = sub.assign(
            post_idx=sub.index,
            month=month,
            individual_entropy=ind_ent
        )
        if not has_sentiment:
            per_post["sentiment_compound"] = None
        individual_frames.append(per_post[INDIVIDUAL_COLUMNS])

    # ------------------------------------------------------------
    # 6. Save CSVs
    # ------------------------------------------------------------

    if individual_frames:
        individual_df = pd.concat(individual_frames, ignore_index=True)
    else:
        # No group reached the size threshold: still write the header.
        individual_df = pd.DataFrame(columns=INDIVIDUAL_COLUMNS)

    individual_df.to_csv(
        f"individual_post_entropy_LOCAL_DST_{month}.csv",
        index=False
    )

    pd.DataFrame(global_results, columns=GLOBAL_COLUMNS).to_csv(
        f"global_entropy_LOCAL_hour_DST_{month}.csv",
        index=False
    )

    print(f"Saved LOCAL-time entropy CSVs for month {month}")




Posts after cleaning: 366778


100%|██████████| 366778/366778 [00:08<00:00, 42311.16it/s]
Embedding text: 100%|██████████| 42/42 [29:15<00:00, 41.81s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2349/2349 [03:03<00:00, 12.77it/s] 


Saved LOCAL-time entropy CSVs for month 01

Posts after cleaning: 345913


100%|██████████| 345913/345913 [00:07<00:00, 45241.26it/s]
Embedding text: 100%|██████████| 39/39 [29:12<00:00, 44.94s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2340/2340 [02:59<00:00, 13.05it/s] 


Saved LOCAL-time entropy CSVs for month 02

Posts after cleaning: 399769


100%|██████████| 399769/399769 [00:12<00:00, 32599.31it/s]
Embedding text: 100%|██████████| 45/45 [33:41<00:00, 44.93s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2414/2414 [03:07<00:00, 12.89it/s] 


Saved LOCAL-time entropy CSVs for month 03

Posts after cleaning: 395468


100%|██████████| 395468/395468 [00:10<00:00, 36462.57it/s]
Embedding text: 100%|██████████| 46/46 [32:50<00:00, 42.84s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2382/2382 [03:08<00:00, 12.61it/s] 


Saved LOCAL-time entropy CSVs for month 04

Posts after cleaning: 408093


100%|██████████| 408093/408093 [00:11<00:00, 36837.91it/s]
Embedding text: 100%|██████████| 47/47 [32:05<00:00, 40.96s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2383/2383 [03:13<00:00, 12.29it/s]


Saved LOCAL-time entropy CSVs for month 05

Posts after cleaning: 384209


100%|██████████| 384209/384209 [00:10<00:00, 36343.46it/s]
Embedding text: 100%|██████████| 44/44 [29:15<00:00, 39.89s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2439/2439 [03:12<00:00, 12.65it/s] 


Saved LOCAL-time entropy CSVs for month 06

Posts after cleaning: 408035


100%|██████████| 408035/408035 [00:11<00:00, 36976.32it/s]
Embedding text: 100%|██████████| 47/47 [30:57<00:00, 39.52s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2450/2450 [03:24<00:00, 12.00it/s] 


Saved LOCAL-time entropy CSVs for month 07

Posts after cleaning: 419450


100%|██████████| 419450/419450 [00:11<00:00, 36854.15it/s]
Embedding text: 100%|██████████| 48/48 [31:36<00:00, 39.50s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2487/2487 [03:42<00:00, 11.19it/s] 


Saved LOCAL-time entropy CSVs for month 08

Posts after cleaning: 415302


100%|██████████| 415302/415302 [00:11<00:00, 35646.42it/s]
Embedding text: 100%|██████████| 47/47 [30:39<00:00, 39.15s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2443/2443 [03:24<00:00, 11.94it/s] 


Saved LOCAL-time entropy CSVs for month 09

Posts after cleaning: 450065


100%|██████████| 450065/450065 [00:12<00:00, 37155.49it/s]
Embedding text: 100%|██████████| 52/52 [33:23<00:00, 38.52s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2498/2498 [03:29<00:00, 11.90it/s] 


Saved LOCAL-time entropy CSVs for month 10

Posts after cleaning: 407896


100%|██████████| 407896/407896 [00:11<00:00, 35729.28it/s]
Embedding text: 100%|██████████| 47/47 [31:39<00:00, 40.42s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2466/2466 [03:23<00:00, 12.11it/s] 


Saved LOCAL-time entropy CSVs for month 11

Posts after cleaning: 398235


100%|██████████| 398235/398235 [00:10<00:00, 36209.61it/s]
Embedding text: 100%|██████████| 46/46 [31:45<00:00, 41.43s/it]


Embedding complete.


Entropy per country × LOCAL hour: 100%|██████████| 2515/2515 [03:21<00:00, 12.47it/s] 


Saved LOCAL-time entropy CSVs for month 12
