In [1]:
import pandas as pd
import numpy as np
import openpyxl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Cross-sectional GSS data processing
# Load GSS overall data
gss = pd.read_excel("/Users/wooyongjung/WJ_Projects/LLM_POC_Study_2025_v2/data/GSS/GSS.xlsx")

# Create yearid by combining year and id
gss['yearid'] = gss['year'].astype(str) + "_" + gss['id_'].astype(str)

# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS = ["abdefect","abnomore","abany","abhlth","abpoor","abrape","absingle"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    if pd.isna(s): return np.nan
    return 1 if s in YES_VALUES else (0 if s in NO_VALUES else np.nan)

def prepare_binary(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

# Following Rossi Scale

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", binary_items=None):
    """
    Collapse 7 binary GSS abortion items (Rossi Scale) into a single 4-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att4'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Count number of non-missing responses
    out["n_nonmiss"] = out[binary_items].notna().sum(axis=1)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, "year", "n_yes", "n_nonmiss", "abortion_att4"]]


# Construct binary items and collapse into attitude categories
gss_abt_bin = prepare_binary(gss)
gss_abt_bin = collapse_abortion_attitudes(gss_abt_bin, binary_items=[col+"_bin" for col in BINARY_ITEMS])

# Required columns (demographic) from gss_2020
required_columns = ["cohort", "degree", "race", "sex", "polviews", "natenvir", "trust", "homosex", "wtssps"] # exclude relig, marital (no data for GSS 2024)

# Merge demographic columns
gss_abt_cs = gss_abt_bin.merge(gss[required_columns + ["yearid"]], on="yearid", how="left")

# Create generation column based on birth year
def determine_generation(year):
    if pd.isna(year):
        return None
    elif year.startswith('.'):
        return None
    year = int(year)
    if year <= 1945:
        return "Silent Generation"
    elif 1946 <= year <= 1964:
        return "Baby Boomer"
    elif 1965 <= year <= 1980:
        return "Generation X"
    elif 1981 <= year <= 1996:
        return "Millennial"
    elif 1997 <= year <= 2012:
        return "Generation Z"
    else:
        return "Generation Alpha"

gss_abt_cs['generation'] = gss_abt_cs['cohort'].apply(determine_generation)


# Create education level column
def categorize_education(edu):
    if pd.isna(edu):
        return None
    elif edu in ["Less than high school", "High school"]:
        return "Less or equal to high school"
    elif edu in ["Associate/junior college", "Bachelor's"]:
        return "Associate or Bachelor's Degree"
    elif edu in ["Graduate"]:
        return "Graduate Degree"
    else:
        return None 

gss_abt_cs['edu_level'] = gss_abt_cs['degree'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    if pd.isna(gender):
        return None
    elif gender == "MALE":
        return "Male"
    elif gender == "FEMALE":
        return "Female"
    else:
        return "Other"

gss_abt_cs['gender'] = gss_abt_cs['sex'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    if pd.isna(race):
        return None
    elif race == "White":
        return "White"
    elif race == "Black":
        return "Black"
    elif race == "Other":
        return "Other"
    else:
        return None

gss_abt_cs['race'] = gss_abt_cs['race'].apply(categorize_race)

# # Create religion column
# def categorize_religion(relig):
#     if relig in ["Protestant", "Catholic", "None"]:
#         return relig
#     elif pd.isna(relig):
#         return None
#     else:
#         return "Other"

# gss_abt_cs['religion'] = gss_abt_cs['relig'].apply(categorize_religion)

# Create political views column
def categorize_political_views(pv):
    if pd.isna(pv):
        return None
    elif pv in ["Extremely liberal", "Liberal", "Slightly liberal"]:
        return "Liberal"
    elif pv in ["Moderate, middle of the road"]:
        return "Moderate"
    elif pv in ["Slightly conservative", "Conservative", "Extremely conservative"]:
        return "Conservative"
    else:
        return None

gss_abt_cs['political_views'] = gss_abt_cs['polviews'].apply(categorize_political_views)


# Create environmental attitude column
def categorize_environmental_attitude(env):
    if pd.isna(env):
        return None
    if env in ["TOO LITTLE"]:
        return "too_little"
    elif env in ["ABOUT RIGHT"]:
        return "about_right"
    elif env in ["TOO MUCH"]:
        return "too_much"
    else:
        return None
    
gss_abt_cs['natenvir'] = gss_abt_cs['natenvir'].apply(categorize_environmental_attitude)

# Create trust other people column
def categorize_trust(trust):
    if pd.isna(trust):
        return None
    if trust in ["Most people can be trusted"]:
        return "trust"
    elif trust in ["Can't be too careful"]:
        return "distrust"
    elif trust in ["Depends"]:
        return "depends"
    else:
        return None

gss_abt_cs['trust'] = gss_abt_cs['trust'].apply(categorize_trust)

# Create homosexuality attitude column
def categorize_homosexuality_attitude(homosex):
    if pd.isna(homosex):
        return None
    if homosex in ["ALWAYS WRONG"]:
        return "always_wrong"
    elif homosex in ["ALMST ALWAYS WRG"]:
        return "almost_always_wrong"
    elif homosex in ["SOMETIMES WRONG"]:
        return "sometimes_wrong"
    elif homosex in ["NOT WRONG AT ALL"]:
        return "not_wrong_at_all"
    else:
        return None

gss_abt_cs['homosex'] = gss_abt_cs['homosex'].apply(categorize_homosexuality_attitude)

In [3]:
# Panel data processing
# Load panel data (three-wave rollings)
gss_2010 = pd.read_stata('data/GSS/GSS_panel2010w123_R6.dta', convert_categoricals=False)
gss_2008 = pd.read_stata('data/GSS/GSS_panel2008w123_r6 .dta', convert_categoricals=False)
gss_2006 = pd.read_stata('data/GSS/GSS_panel2006w123_r6a.dta', convert_categoricals=False)

# Create a "yearid" column by combining "2010" and index
gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)

# Select relevant columns for analysis
required_columns = ["cohort_1", "degree_1", "degree_2", "degree_3", "race_1", "sex_1", "polviews_1", "polviews_2", "polviews_3"]
abortion_items = ["abdefect_1", "abnomore_1", "abany_1", "abhlth_1", "abpoor_1", "abrape_1", "absingle_1",
                 "abdefect_2", "abnomore_2", "abany_2", "abhlth_2", "abpoor_2", "abrape_2", "absingle_2",
                 "abdefect_3", "abnomore_3", "abany_3", "abhlth_3", "abpoor_3", "abrape_3", "absingle_3"]
trust_columns = ["trust_1", "trust_2", "trust_3"]
natenvir_columns = ["natenvir_1", "natenvir_2", "natenvir_3"]
homosex_columns = ["homosex_1", "homosex_2", "homosex_3"]

gss_2010 = gss_2010[['yearid'] +required_columns + abortion_items + trust_columns + natenvir_columns + homosex_columns]
gss_2008 = gss_2008[['yearid'] + required_columns + abortion_items + trust_columns + natenvir_columns + homosex_columns]
gss_2006 = gss_2006[['yearid'] + required_columns + abortion_items + trust_columns + natenvir_columns + homosex_columns]

# Rename columns for consistency
gss_2010.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)
gss_2008.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)
gss_2006.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)

  gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
  gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
  gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)


In [4]:
# Convert dfs to long format: _1 → 2010, _2 → 2012, _3 → 2014
gss_2010_long = pd.wide_to_long(gss_2010, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree', 'polviews', 'natenvir', 'trust', 'homosex'],
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2008_long = pd.wide_to_long(gss_2008, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree', 'polviews', 'natenvir', 'trust', 'homosex'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2006_long = pd.wide_to_long(gss_2006, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree', 'polviews', 'natenvir', 'trust', 'homosex'],
                                i='yearid', j='wave', suffix='_\\d+').reset_index()

# Create year column based on wave
def map_wave_to_year(wave, base_year):
    if wave == "_1":
        return base_year
    elif wave == "_2":
        return base_year + 2
    elif wave == "_3":
        return base_year + 4
    else:
        return None

gss_2010_long['year'] = gss_2010_long['wave'].apply(lambda w: map_wave_to_year(w, 2010))
gss_2008_long['year'] = gss_2008_long['wave'].apply(lambda w: map_wave_to_year(w, 2008))
gss_2006_long['year'] = gss_2006_long['wave'].apply(lambda w: map_wave_to_year(w, 2006))

In [5]:
# Combine long dfs
gss_abt_panel = pd.concat([gss_2010_long, gss_2008_long, gss_2006_long], ignore_index=True)
gss_abt_panel.shape

(18201, 18)

In [6]:
# Load 2020 panel data
gss_2020 = pd.read_stata('data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)

# Expect one row per respondent per wave, columns listed below:
ITEMS_2016 = ["abdefect_1a","abnomore_1a","abany_1a","abhlth_1a","abpoor_1a","abrape_1a","absingle_1a","natenvir_1a","trust_1a","homosex_1a"]
ITEMS_2018 = ["abdefect_1b","abnomore_1b","abany_1b","abhlth_1b","abpoor_1b","abrape_1b","absingle_1b","natenvir_1b","trust_1b","homosex_1b"]
ITEMS_2020 = ["abdefect_2","abnomore_2","abany_2","abhlth_2","abpoor_2","abrape_2","absingle_2","natenvir_2","trust_2","homosex_2"]

# Split gss_2020 into 2016->2020 and 2018->2020 waves
gss_2016 = gss_2020[gss_2020['samptype'] == 2016]
gss_2018 = gss_2020[gss_2020['samptype'] == 2018]

# Required columns (demographic) from gss_2020
required_columns_2016 = ["cohort_1a", "degree_1a", "race_1a", "gender1_1a", "polviews_1a", "degree_2", "polviews_2"]
required_columns_2018 = ["cohort_1b", "degree_1b", "race_1b", "gender1_1b", "polviews_1b", "degree_2", "polviews_2"]

gss_2016 = gss_2016[['yearid'] + required_columns_2016 + ITEMS_2016 + ITEMS_2020]
gss_2018 = gss_2018[['yearid'] + required_columns_2018 + ITEMS_2018 + ITEMS_2020]

# Rename 1a and 1b
gss_2016.rename(columns={"gender1_1a": "gender", "race_1a": "race", "degree_1a": "degree_1", "cohort_1a": "cohort", "polviews_1a": "polviews_1", "natenvir_1a": "natenvir_1", "trust_1a": "trust_1","homosex_1a": "homosex_1",
"abdefect_1a": "abdefect_1", "abnomore_1a": "abnomore_1", "abany_1a": "abany_1", "abhlth_1a": "abhlth_1", "abpoor_1a": "abpoor_1", "abrape_1a": "abrape_1", "absingle_1a": "absingle_1"}, inplace=True)
gss_2018.rename(columns={"gender1_1b": "gender", "race_1b": "race", "degree_1b": "degree_1", "cohort_1b": "cohort1", "polviews_1b": "polviews_1", "natenvir_1b": "natenvir_1", "trust_1b": "trust_1","homosex_1b": "homosex_1",
"abdefect_1b": "abdefect_1", "abnomore_1b": "abnomore_1", "abany_1b": "abany_1", "abhlth_1b": "abhlth_1", "abpoor_1b": "abpoor_1", "abrape_1b": "abrape_1", "absingle_1b": "absingle_1"}, inplace=True)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  gss_2020 = pd.read_stata('data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)


In [7]:
# Convert to long format for analysis
gss_2016_long = pd.wide_to_long(gss_2016, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree', 'polviews', 'natenvir', 'trust', 'homosex'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2018_long = pd.wide_to_long(gss_2018, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree', 'polviews', 'natenvir', 'trust', 'homosex'],
                                i='yearid', j='wave', suffix='_\\d+').reset_index()

# Create year column based on wave
def map_wave_to_year(wave, last_year, gap):
    if wave == "_1":
        return last_year - gap
    elif wave == "_2":
        return last_year
    else:
        return None

gss_2016_long['year'] = gss_2016_long['wave'].apply(lambda w: map_wave_to_year(w, 2020, 4))
gss_2018_long['year'] = gss_2018_long['wave'].apply(lambda w: map_wave_to_year(w, 2020, 2))


In [8]:
# Combine long dfs
gss_abt_panel = pd.concat([gss_2016_long, gss_2018_long, gss_abt_panel], ignore_index=True)
gss_abt_panel.shape

(28631, 19)

In [9]:
# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS = ["abdefect","abnomore","abany","abhlth","abpoor","abrape","absingle"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    if pd.isna(s): return np.nan
    return 1 if s in YES_VALUES else (0 if s in NO_VALUES else np.nan)

def prepare_binary(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

# Following Rossi Scale

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", binary_items=None):
    """
    Collapse 7 binary GSS abortion items (Rossi Scale) into a single 4-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att4'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Count number of non-missing responses (for reference)
    out["n_nonmiss"] = out[binary_items].notna().sum(axis=1)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, "year", "n_yes", "n_nonmiss", "abortion_att4", "cohort", "degree", "race", "gender", "polviews", "natenvir", "trust", "homosex"]]


# Construct binary items and collapse into attitude categories
gss_abt_panel = prepare_binary(gss_abt_panel)
gss_abt_panel = collapse_abortion_attitudes(gss_abt_panel, binary_items=[col+"_bin" for col in BINARY_ITEMS])

In [10]:
# Create generation column based on birth year
def determine_generation(year):
    if pd.isna(year):
        return None
    year = int(year)
    if year <= 1945:
        return "Silent Generation"
    elif 1946 <= year <= 1964:
        return "Baby Boomer"
    elif 1965 <= year <= 1980:
        return "Generation X"
    elif 1981 <= year <= 1996:
        return "Millennial"
    elif 1997 <= year <= 2012:
        return "Generation Z"
    else:
        return "Generation Alpha"
        
gss_abt_panel['generation'] = gss_abt_panel['cohort'].apply(determine_generation)

# Create education level column
def categorize_education(edu):
    if pd.isna(edu):
        return None
    elif edu <= 1:
        return "Less or equal to high school"
    elif edu == 2 or edu == 3:
        return "Associate or Bachelor's Degree"
    elif edu == 4:
        return "Graduate Degree"
    else:
        return None

gss_abt_panel['edu_level'] = gss_abt_panel['degree'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    if pd.isna(gender):
        return None
    elif gender == 1:
        return "Male"
    elif gender == 2:
        return "Female"
    else:
        return "Other"

gss_abt_panel['gender'] = gss_abt_panel['gender'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    if pd.isna(race):
        return None
    elif race == 1:
        return "White"
    elif race == 2:
        return "Black"
    elif race == 3:
        return "Other"
    else:
        return "Unknown"

gss_abt_panel['race'] = gss_abt_panel['race'].apply(categorize_race)

# Create political views column
def categorize_political_views(pv):
    if pd.isna(pv):
        return None
    elif pv == 1 or pv == 2 or pv == 3:
        return "Liberal"
    elif pv == 4:
        return "Moderate"
    elif pv == 5 or pv == 6 or pv == 7:
        return "Conservative"
    else:
        return None

gss_abt_panel['political_views'] = gss_abt_panel['polviews'].apply(categorize_political_views)

# Create environmental attitude column
def categorize_environmental_attitude(env):
    if pd.isna(env):
        return None
    if env == 1:
        return "too_little"
    elif env == 2:
        return "about_right"
    elif env == 3:
        return "too_much"
    else:
        return None

gss_abt_panel['natenvir'] = gss_abt_panel['natenvir'].apply(categorize_environmental_attitude)

# Create trust other people column
def categorize_trust(trust):
    if pd.isna(trust):
        return None
    elif trust == 1:
        return "trust"
    elif trust == 2:
        return "distrust"
    elif trust == 3:
        return "depends"
    else:
        return None

gss_abt_panel['trust'] = gss_abt_panel['trust'].apply(categorize_trust)

# Create homosexuality attitude column
def categorize_homosexuality_attitude(homosex):
    if pd.isna(homosex):
        return None
    if homosex == 1:
        return "always_wrong"
    elif homosex == 2:
        return "almost_always_wrong"
    elif homosex == 3:
        return "sometimes_wrong"
    elif homosex == 4:
        return "not_wrong_at_all"
    else:
        return None
gss_abt_panel['homosex'] = gss_abt_panel['homosex'].apply(categorize_homosexuality_attitude)

In [14]:
# # Drop rows with n_nonmiss < 7
gss_abt_cs_abortion = gss_abt_cs[gss_abt_cs['n_nonmiss'] == 7]
gss_abt_panel_abortion = gss_abt_panel[gss_abt_panel['n_nonmiss'] == 7]

# Drop rows with missing homosex
gss_abt_cs_homosex = gss_abt_cs.dropna(subset=['homosex'])
gss_abt_panel_homosex = gss_abt_panel.dropna(subset=['homosex'])

# Finally add needed columns and save
df_cs_abo = gss_abt_cs_abortion[['yearid', 'year', 'generation', 'edu_level', 'gender', 'race', 'political_views', 'natenvir', 'trust', 'abortion_att4', 'homosex', 'wtssps']].copy()
df_pl_abo = gss_abt_panel_abortion[['yearid', 'year', 'generation', 'edu_level', 'gender', 'race', 'political_views', 'natenvir', 'trust', 'abortion_att4', 'homosex']].copy()

df_cs_homosex = gss_abt_cs_homosex[['yearid', 'year', 'generation', 'edu_level', 'gender', 'race', 'political_views', 'natenvir', 'trust', 'abortion_att4', 'homosex', 'wtssps']].copy()
df_pl_homosex = gss_abt_panel_homosex[['yearid', 'year', 'generation', 'edu_level', 'gender', 'race', 'political_views', 'natenvir', 'trust', 'abortion_att4', 'homosex']].copy()

In [15]:
df_cs_abo.shape, df_pl_abo.shape, df_cs_homosex.shape, df_pl_homosex.shape

((13351, 12), (12610, 11), (17576, 12), (13647, 11))

In [19]:
# Convert year 2021 to 2020
df_cs_abo.loc[df_cs_abo['year'] == 2021, 'year'] = 2020
df_cs_homosex.loc[df_cs_homosex['year'] == 2021, 'year'] = 2020


In [None]:
# Multinomial logistic AR analysis
import numpy as np, pandas as pd
from sklearn.linear_model import Ridge

# ALR helpers (reference = last category)
def alr(p, eps=1e-8):
    p = np.clip(np.asarray(p, dtype=float), eps, 1.0)
    p = p / p.sum()
    ref = p[-1]
    return np.log(p[:-1] / ref)

def inv_alr(u):
    u = np.asarray(u, dtype=float)
    z = np.r_[np.exp(u), 1.0]
    return z / z.sum()

# Fit AR(1) per subgroup with ridge, forecast next year
def cs_ar_fit_and_forecast(p_cs, l2=1.0, min_years=3):
    """
    p_cs: dict {((group_tuple), year) -> p (K,)}
    Returns dict {((group_tuple), year_next) -> p_hat_AR}
    """
    by_group = {}
    for (g,y), p in p_cs.items():
        by_group.setdefault(g, []).append(int(y))

    forecasts = {}
    for g, ys in by_group.items():
        ys = sorted(ys)
        if len(ys) < min_years:
            # too few years: fallback to persistence p_{t+1} = p_t
            for i in range(1, len(ys)):
                y_prev, y = ys[i-1], ys[i]
                forecasts[(g, y)] = p_cs[(g, y_prev)]
            continue

        # Build (X=u_{t-1}, Y=u_t) in ALR space
        X_list, Y_list, targets = [], [], []
        for i in range(1, len(ys)):
            y_prev, y = ys[i-1], ys[i]
            u_prev = alr(p_cs[(g, y_prev)])
            u_curr = alr(p_cs[(g, y)])
            X_list.append(np.r_[1.0, u_prev])   # intercept + u_{t-1}
            Y_list.append(u_curr)
            targets.append((g, y))              # we’ll also produce in-sample one-step fits if needed
        X = np.vstack(X_list)        # [T-1, 1+(K-1)]
        Y = np.vstack(Y_list)        # [T-1, (K-1)]

        # Fit separate Ridge for each ALR dimension (small T)
        coefs = []
        for k in range(Y.shape[1]):
            reg = Ridge(alpha=l2, fit_intercept=False)  # intercept already in X
            reg.fit(X, Y[:,k])
            coefs.append(reg.coef_)
        B = np.vstack(coefs)  # shape [(K-1) x (K)]

        # Forecast each step from previous observed (one-step ahead backtest)
        for i in range(1, len(ys)):
            y_prev, y = ys[i-1], ys[i]
            u_prev = alr(p_cs[(g, y_prev)])
            x = np.r_[1.0, u_prev]             # [K]
            u_hat = B @ x                      # [K-1]
            p_hat = inv_alr(u_hat)
            forecasts[(g, y)] = p_hat

    return forecasts


