In [1]:
import sys, os
import pandas as pd
import numpy as np

# Ensure project root is on path
sys.path.append("/Users/lei/Documents/Sequenzo_all_folders/glm_plus")

from glm_plus.frequentist.torque import FrequentistOQR

In [2]:
df_seniority = pd.read_csv("/Users/lei/Documents/Sequenzo_all_folders/sequenzo_local/test_data/real_data_my_paper/detailed_senority_levels_for_10_years.csv")
df_seniority

Unnamed: 0,worker_id,gender,country,cohort,highest_educational_degree,whether_bachelor_university_prestigious,internationalization,work_years,company_size,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10
0,dilip-kumar-nath-7538746a,male,india,11-20 total work years,Bachelor,False,Multinational,15.250,"1,001-5,000 employees",Regular,Regular,Regular,Regular,Senior,Senior,Senior,Senior,Senior,Senior
1,vinodkumar-yadav-b7a976107,male,india,11-20 total work years,Bachelor,False,Local,11.500,"5,001-10,000 employees",Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular
2,rajani-kulkarni-72674649,female,india,11-20 total work years,Bachelor,False,Multinational,11.250,"10,001+ employees",Regular,Regular,Regular,Regular,Regular,Regular,Regular,Leader,Leader,Regular
3,josh-slosson-07a1509,male,india,11-20 total work years,Bachelor,False,International,18.417,11-50 employees,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular
4,jonatthan-rodriguez-8a5612121,male,india,11-20 total work years,Bachelor,False,Multinational,11.917,"5,001-10,000 employees",Regular,Regular,Senior,Senior,Senior,Senior,Senior,Senior,Senior,Senior
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32623,edwardgu1915,male,us,0-10 total work years,Master,False,Multinational,10.499,"501-1,000 employees",Leader,Leader,Leader,Leader,Leader,Senior,Senior,Senior,Regular,Senior
32624,terry-ferguson-8060363a,male,us,21-30 total work years,Bachelor,False,Local,21.001,"10,001+ employees",Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular
32625,christy-garner-677894213,female,us,21-30 total work years,Bachelor,False,Multinational,27.250,"501-1,000 employees",Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular
32626,jay-p-119369202,male,us,21-30 total work years,Bachelor,False,Local,23.417,"10,001+ employees",Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular


In [3]:
# Wide-format cleaning: collapse seniority labels into 4 levels
# - Assistant + Junior → Assistant/Junior
# - Regular → Regular
# - Senior/Leader/Lead → Leader
# - Chief or Founder (+ VP/Director/C-level variants) → Chief/Founder

# Detect Y columns
Y_cols = [c for c in df_seniority.columns if str(c).upper().startswith('Y')]
assert len(Y_cols) > 0, 'No Y1..Yk columns found'

collapse_order = ["Assistant/Junior", "Regular", "Leader", "Chief/Founder"]

def _collapse_label(v):
    if pd.isna(v):
        return np.nan
    t = str(v).strip().lower()
    if t in {"assistant", "intern", "entry level"}:
        return "Assistant/Junior"
    if t in {"junior", "jr"}:
        return "Assistant/Junior"
    if t in {"regular", "mid", "middle", "intermediate", "associate"}:
        return "Regular"
    if t in {"senior", "leader", "lead", "sr"}:
        return "Leader"
    if t in {"chief or founder", "chief/founder", "chief", "c-level", "founder", "executive", "vp", "vice president", "director"}:
        return "Chief/Founder"
    # keep original if already one of target labels
    if v in collapse_order:
        return v
    return v

for col in Y_cols:
    df_seniority[col] = df_seniority[col].apply(_collapse_label)

# Quick check of levels after collapsing
level_counts = pd.DataFrame({c: df_seniority[c].value_counts(dropna=False) for c in Y_cols})
print(level_counts.sum(axis=1).sort_index())

df_seniority.head()


Assistant/Junior     11110
Chief/Founder         2504
Leader              114640
Regular             198026
dtype: int64


Unnamed: 0,worker_id,gender,country,cohort,highest_educational_degree,whether_bachelor_university_prestigious,internationalization,work_years,company_size,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10
0,dilip-kumar-nath-7538746a,male,india,11-20 total work years,Bachelor,False,Multinational,15.25,"1,001-5,000 employees",Regular,Regular,Regular,Regular,Leader,Leader,Leader,Leader,Leader,Leader
1,vinodkumar-yadav-b7a976107,male,india,11-20 total work years,Bachelor,False,Local,11.5,"5,001-10,000 employees",Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular
2,rajani-kulkarni-72674649,female,india,11-20 total work years,Bachelor,False,Multinational,11.25,"10,001+ employees",Regular,Regular,Regular,Regular,Regular,Regular,Regular,Leader,Leader,Regular
3,josh-slosson-07a1509,male,india,11-20 total work years,Bachelor,False,International,18.417,11-50 employees,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular,Regular
4,jonatthan-rodriguez-8a5612121,male,india,11-20 total work years,Bachelor,False,Multinational,11.917,"5,001-10,000 employees",Regular,Regular,Leader,Leader,Leader,Leader,Leader,Leader,Leader,Leader


In [4]:
df_seniority.to_csv("df_seniority.csv", index=False)

In [5]:
# 横截面：以 Y10 为因变量（有序），gender 为核心自变量，控制：
# highest_educational_degree, whether_bachelor_university_prestigious,
# internationalization, work_years, company_size

year_col = 'Y10'
order_labels = ["Assistant/Junior", "Regular", "Leader", "Chief/Founder"]

use_cols = [
    'gender',
    'highest_educational_degree',
    'whether_bachelor_university_prestigious',
    'internationalization',
    'work_years',
    'company_size',
    year_col,
]

df_cs_full = df_seniority[use_cols].dropna().copy()

# y: 映射到 1..J
cat_type = pd.CategoricalDtype(categories=order_labels, ordered=True)
df_cs_full['y_cat'] = df_cs_full[year_col].astype(cat_type)
df_cs_full['y'] = df_cs_full['y_cat'].cat.codes + 1

# 性别：female=1, male=0
_df_gender = df_cs_full['gender'].astype(str).str.strip().str.lower()
df_cs_full['female'] = (_df_gender == 'female').astype(int)

# 是否名校本科学位：转为 0/1
col_prestige = 'whether_bachelor_university_prestigious'
if str(df_cs_full[col_prestige].dtype) == 'bool':
    df_cs_full['prestigious_bachelor'] = df_cs_full[col_prestige].astype(int)
else:
    df_cs_full['prestigious_bachelor'] = (
        df_cs_full[col_prestige].astype(str).str.strip().str.lower().isin(['true','1','yes','y','t'])
    ).astype(int)

# 数值特征
df_cs_full['work_years'] = pd.to_numeric(df_cs_full['work_years'], errors='coerce')

# 类别特征 one-hot（drop_first 保留基准类）
cat_cols = ['highest_educational_degree', 'internationalization', 'company_size']
X_cat = pd.get_dummies(df_cs_full[cat_cols], columns=cat_cols, drop_first=True, dtype=int)

# 组合设计矩阵：先放 gender，再放控制
X_df = pd.concat([
    df_cs_full[['female', 'prestigious_bachelor', 'work_years']].reset_index(drop=True),
    X_cat.reset_index(drop=True)
], axis=1)

feature_cols = list(X_df.columns)
X = X_df.to_numpy(dtype=float)
y = df_cs_full['y'].to_numpy(dtype=int)

len(df_cs_full), X.shape, y.shape, np.unique(y)


(32435, (32435, 19), (32435,), array([1, 2, 3, 4]))

In [None]:
df_cs_full.to_csv("df_formatted.csv", index=False)

In [None]:
# 拟合 TORQUE 频率学 OQR（横截面）
model = FrequentistOQR(
    quantiles=(0.2, 0.5, 0.8),
    use_two_index=True,
    auto_select_k=True,
    subsample_n=5000 if len(df_cs_full) > 20000 else (3000 if len(df_cs_full) > 2000 else None),
    random_state=0,
)
model.fit(X, y)

info = model.summary()
info['selected_k'], info['selected_k_full'], info.get('corr_xbeta1_xbeta2', None)


In [None]:
# 稳健性：用不同控制向量的代表点，以及用预测“区间边界”来度量顶部/底部差异
# 方案A：改变代表点（把连续变量换成四分位点、类别换成其它基类），看结论是否稳定
# 方案B：使用预测区间边界的 male-female 差值（lo/hi），与连续分位点的结论是否一致

from itertools import product

control_summaries = {
    'work_years': [X_df['work_years'].quantile(0.25), X_df['work_years'].median(), X_df['work_years'].quantile(0.75)]
}

# 类别基类切换：把 one-hot 的 1 位置挪到别的列上（通过构造一个行向量）
def make_ref_row(female_val: int, base_choices: dict[str, str]):
    row = {c: 0.0 for c in X_df.columns}
    row['female'] = float(female_val)
    row['prestigious_bachelor'] = float(X_df['prestigious_bachelor'].median())
    row['work_years'] = float(base_choices.get('work_years', X_df['work_years'].median()))
    # 对每个类别组，找以该名称开头的列，保持全 0 即表示基准类
    for c in X_df.columns:
        if c.startswith('highest_educational_degree_'):
            pass
        if c.startswith('internationalization_'):
            pass
        if c.startswith('company_size_'):
            pass
    return pd.DataFrame([row])

# 取几个代表点
ref_rows_0 = [make_ref_row(0, {'work_years': wy}) for wy in control_summaries['work_years']]
ref_rows_1 = [make_ref_row(1, {'work_years': wy}) for wy in control_summaries['work_years']]

qs = (0.1, 0.25, 0.5, 0.75, 0.9)
gap_sets = []
for r0, r1 in zip(ref_rows_0, ref_rows_1):
    pred0 = model.predict_quantiles_continuous(r0.to_numpy(dtype=float), quantiles=qs)
    pred1 = model.predict_quantiles_continuous(r1.to_numpy(dtype=float), quantiles=qs)
    delta = {t: float(pred0[t] - pred1[t]) for t in qs}
    gap_sets.append(delta)

robust_bottom = np.mean([np.mean([g[0.1], g[0.25]]) for g in gap_sets])
robust_top = np.mean([np.mean([g[0.75], g[0.9]]) for g in gap_sets])
print({'robust_bottom': robust_bottom, 'robust_top': robust_top})
'consistent with ' + ('sticky floor' if robust_bottom > robust_top else 'glass ceiling' if robust_top > robust_bottom else 'mixed/neutral')


In [None]:
# 按国家分组评估“粘地板/玻璃天花板”
# 仅保留用于构建设计矩阵的函数，其它旧评估函数已由新版覆盖

order_labels = ["Assistant/Junior", "Regular", "Leader", "Chief/Founder"]

def build_design_for_subset(df_sub: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray]:
    cols_needed = [
        'gender', 'highest_educational_degree',
        'whether_bachelor_university_prestigious',
        'internationalization', 'work_years', 'company_size', 'Y10'
    ]
    d = df_sub[cols_needed].dropna().copy()
    # y
    cat_type = pd.CategoricalDtype(categories=order_labels, ordered=True)
    d['y_cat'] = d['Y10'].astype(cat_type)
    d['y'] = d['y_cat'].cat.codes + 1
    # female
    gg = d['gender'].astype(str).str.strip().str.lower()
    d['female'] = (gg == 'female').astype(int)
    # prestige
    col_p = 'whether_bachelor_university_prestigious'
    if str(d[col_p].dtype) == 'bool':
        d['prestigious_bachelor'] = d[col_p].astype(int)
    else:
        d['prestigious_bachelor'] = (
            d[col_p].astype(str).str.strip().str.lower().isin(['true','1','yes','y','t'])
        ).astype(int)
    # numeric
    d['work_years'] = pd.to_numeric(d['work_years'], errors='coerce')
    d = d.dropna(subset=['work_years']).copy()
    # one-hot
    cat_cols = ['highest_educational_degree', 'internationalization', 'company_size']
    X_cat = pd.get_dummies(d[cat_cols], columns=cat_cols, drop_first=True, dtype=int)
    X_df = pd.concat([
        d[['female', 'prestigious_bachelor', 'work_years']].reset_index(drop=True),
        X_cat.reset_index(drop=True)
    ], axis=1)
    X = X_df.to_numpy(dtype=float)
    y = d['y'].to_numpy(dtype=int)
    return d, X_df, X, y


In [None]:
# ==================== 常量与配置（全局唯一） ====================
from typing import Dict, Tuple

# 分位点集合
QUANTILES_MAIN = (0.2, 0.5, 0.8)
QUANTILES_ROBUST = (0.1, 0.25, 0.5, 0.75, 0.9)
# 位置映射（用于标注 bottom/middle/top）
POS_MAP = {0.2: 'bottom', 0.5: 'middle', 0.8: 'top'}

# 门槛设定（对应 1..4 的有序类别）
LEVEL_THRESHOLDS = {1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0}

# 是否对门槛距离做非负截断（负值表示已跨过门槛）
TRUNCATE_NEGATIVE_DISTANCE = True

# subsample 规则
SUBSAMPLE_RULE = lambda n: (5000 if n > 20000 else (3000 if n > 2000 else None))

# 输出目录
import os
OUTPUT_DIR = "output_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# 统一拟合函数：量化配置在一处管理
def fit_model(X: np.ndarray, y: np.ndarray, taus=QUANTILES_MAIN, random_state: int = 0, use_two_index: bool = True) -> FrequentistOQR:
    m = FrequentistOQR(
        quantiles=taus,
        use_two_index=use_two_index,
        auto_select_k=True,
        subsample_n=SUBSAMPLE_RULE(len(y)),
        random_state=random_state,
    )
    m.fit(X, y)
    return m


In [None]:
# ==================== 核心分析1：底/中/顶三处的性别差（固定协变量）====================

def compute_gender_gaps_at_key_quantiles(model: FrequentistOQR, X_df_in: pd.DataFrame, 
                                        quantiles=QUANTILES_MAIN) -> Dict:
    """
    固定协变量在典型值，只切换性别，计算各分位点的female-male差异
    返回详细结果字典
    """
    # 构造代表性个体：连续变量取中位数，类别变量取基准类(全0)
    ref = {c: 0.0 for c in X_df_in.columns}
    ref['female'] = 0.0  # 男性基准
    if 'prestigious_bachelor' in X_df_in:
        ref['prestigious_bachelor'] = float(X_df_in['prestigious_bachelor'].median())
    if 'work_years' in X_df_in:
        ref['work_years'] = float(X_df_in['work_years'].median())
    
    # 男性和女性的预测
    X_male = pd.DataFrame([ref])
    X_female = pd.DataFrame([ref]); X_female['female'] = 1.0
    
    pred_male = model.predict_quantiles_continuous(X_male.to_numpy(dtype=float), quantiles=quantiles)
    pred_female = model.predict_quantiles_continuous(X_female.to_numpy(dtype=float), quantiles=quantiles)
    
    # 计算差异：female - male（负值表示女性预测更低，即处境更差）
    gaps = {}
    for tau in quantiles:
        gap = float(pred_female[tau] - pred_male[tau])
        gaps[tau] = {
            'quantile': tau,
            'male_pred': float(pred_male[tau]),
            'female_pred': float(pred_female[tau]), 
            'gap_female_minus_male': gap,
            'position': POS_MAP.get(tau, 'other')
        }
    
    # 判别粘地板vs玻璃天花板
    bottom_gap = float(gaps[0.2]['gap_female_minus_male']) if 0.2 in gaps else np.nan
    top_gap = float(gaps[0.8]['gap_female_minus_male']) if 0.8 in gaps else np.nan
    
    conclusion = 'glass ceiling' if abs(top_gap) > abs(bottom_gap) else 'sticky floor'
    gap_diff = bottom_gap - top_gap  # 正值倾向sticky floor
    
    return {
        'gaps_by_quantile': gaps,
        'bottom_gap_avg': bottom_gap,
        'top_gap_avg': top_gap, 
        'gap_diff_bottom_minus_top': gap_diff,
        'conclusion': conclusion,
        'reference_profile': dict(ref)
    }

# 应用到全样本
print("=== 全样本性别差分析 ===")
overall_gaps = compute_gender_gaps_at_key_quantiles(model, X_df)
for tau in ALL_QUANTILES:
    gap_info = overall_gaps['gaps_by_quantile'][tau]
    print(f"τ={tau:.1f} ({gap_info['position']}): Male={gap_info['male_pred']:.3f}, Female={gap_info['female_pred']:.3f}, Gap={gap_info['gap_female_minus_male']:.3f}")

print(f"\nBottom avg gap: {overall_gaps['bottom_gap_avg']:.3f}")
print(f"Top avg gap: {overall_gaps['top_gap_avg']:.3f}")
print(f"Conclusion: {overall_gaps['conclusion']} (diff={overall_gaps['gap_diff_bottom_minus_top']:.3f})")


NameError: name 'Dict' is not defined

In [None]:
# ==================== 核心分析2：离下一门槛的距离分析 ====================

def compute_threshold_distances(model: FrequentistOQR, X: np.ndarray, y: np.ndarray, 
                               X_df: pd.DataFrame, quantiles=ALL_QUANTILES, non_negative: bool = False) -> Dict:
    """
    计算每个个体在各分位点离下一门槛的距离，并分析性别差异
    - 当前级别使用观测 y_true 而非预测floor
    - non_negative=True 则对距离做非负截断
    """
    pred_cont = model.predict_quantiles_continuous(X, quantiles=quantiles)
    results = {}
    for tau in quantiles:
        y_pred_cont = pred_cont[tau]
        distances, current_levels, target_levels = [], [], []
        for y_pred, y_true_i in zip(y_pred_cont, y):
            current_level = int(np.clip(y_true_i, 1, 5))
            if current_level >= 5:
                distances.append(np.nan)
                current_levels.append(current_level)
                target_levels.append(np.nan)
                continue
            target_level = current_level + 1
            d_raw = float(target_level - y_pred)
            d_use = max(0.0, d_raw) if non_negative else d_raw
            distances.append(d_use)
            current_levels.append(current_level)
            target_levels.append(target_level)
        df_dist = pd.DataFrame({
            'y_pred_cont': y_pred_cont,
            'y_true': y,
            'current_level': current_levels,
            'target_level': target_levels,
            'distance_to_next': distances,
            'female': X_df['female'].values if 'female' in X_df.columns else 0
        })
        df_valid = df_dist.dropna(subset=['distance_to_next'])
        if len(df_valid) > 0:
            gender_distances = df_valid.groupby('female')['distance_to_next'].agg(['mean', 'std', 'count']).reset_index()
            gender_distances['gender'] = gender_distances['female'].map({0: 'male', 1: 'female'})
            if len(gender_distances) == 2:
                male_dist = gender_distances.loc[gender_distances['gender']=='male','mean'].iloc[0]
                female_dist = gender_distances.loc[gender_distances['gender']=='female','mean'].iloc[0]
                gap = float(female_dist - male_dist)
            else:
                male_dist = female_dist = gap = np.nan
        else:
            gender_distances = pd.DataFrame()
            male_dist = female_dist = gap = np.nan
        results[tau] = {
            'quantile': tau,
            'position': 'bottom' if tau in BOTTOM_QUANTILES else ('top' if tau in TOP_QUANTILES else 'middle'),
            'data': df_dist,
            'gender_summary': gender_distances,
            'male_avg_distance': male_dist,
            'female_avg_distance': female_dist,
            'gap_female_minus_male': gap
        }
    return results

# 计算全样本的门槛距离
print("=== 全样本门槛距离分析 ===")
threshold_results = compute_threshold_distances(model, X, y, X_df)

for tau in ALL_QUANTILES:
    result = threshold_results[tau]
    print(f"\nτ={tau:.1f} ({result['position']}):")
    print(f"  Male avg distance to next level: {result['male_avg_distance']:.3f}")
    print(f"  Female avg distance to next level: {result['female_avg_distance']:.3f}")
    print(f"  Gap (Female - Male): {result['gap_female_minus_male']:.3f}")
    if result['gap_female_minus_male'] > 0:
        print(f"  → 女性平均离晋升门槛更远 {result['gap_female_minus_male']:.3f} 个等级单位")
    elif result['gap_female_minus_male'] < 0:
        print(f"  → 男性平均离晋升门槛更远 {abs(result['gap_female_minus_male']):.3f} 个等级单位")


In [None]:
# 稳健性补充：真正切换“基类代表”的参考行（示例）
# 思路：在 drop_first=True 下，默认全0代表基类；若要用非基类当“代表”，把对应 one-hot 置1，其余同组置0

def make_ref_row_with_baselines(female_val: int,
                                base_work_years: float | None,
                                base_categories: dict[str, str]) -> pd.DataFrame:
    row = {c: 0.0 for c in X_df.columns}
    row['female'] = float(female_val)
    if 'prestigious_bachelor' in X_df.columns:
        row['prestigious_bachelor'] = float(X_df['prestigious_bachelor'].median())
    if 'work_years' in X_df.columns:
        row['work_years'] = float(base_work_years if base_work_years is not None else X_df['work_years'].median())
    # 为每个类别前缀设置代表类
    prefix_map = {
        'highest_educational_degree_': base_categories.get('highest_educational_degree'),
        'internationalization_': base_categories.get('internationalization'),
        'company_size_': base_categories.get('company_size'),
    }
    for prefix, want_suffix in prefix_map.items():
        if not want_suffix:
            continue
        for c in X_df.columns:
            if c.startswith(prefix):
                row[c] = 1.0 if c.endswith(want_suffix) else 0.0
    return pd.DataFrame([row])

# 最小示例（仅示意，按你实际 one-hot 列后缀填写）
# r0 = make_ref_row_with_baselines(0, X_df['work_years'].quantile(0.5), {
#     'highest_educational_degree': 'Master',
#     'internationalization': 'Multinational',
#     'company_size': '10,001+ employees',
# })
# r1 = r0.copy(); r1['female'] = 1.0
# pred0 = model.predict_quantiles_continuous(r0.to_numpy(dtype=float), quantiles=ALL_QUANTILES)
# pred1 = model.predict_quantiles_continuous(r1.to_numpy(dtype=float), quantiles=ALL_QUANTILES)
# print({t: float(pred1[t] - pred0[t]) for t in ALL_QUANTILES})


In [None]:
# ==================== 核心分析3：按国家分组的完整分析 ====================

def analyze_by_country_detailed(min_n: int = 500) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    按国家进行完整的性别差和门槛距离分析
    返回两个汇总表：gender_gap_summary, threshold_distance_summary
    """
    gap_rows = []
    threshold_rows = []
    
    print("=== 按国家分析进度 ===")
    for ctry, df_g in df_seniority.groupby('country'):
        d, Xdf, Xg, yg = build_design_for_subset(df_g)
        n = len(yg)
        if n < min_n:
            print(f"{ctry}: 跳过 (n={n} < {min_n})")
            continue
        
        print(f"{ctry}: 分析中 (n={n})...")
        
        # 拟合模型
        m = FrequentistOQR(
            quantiles=(0.2, 0.5, 0.8),
            use_two_index=True,
            auto_select_k=True,
            subsample_n=5000 if n > 20000 else (3000 if n > 2000 else None),
            random_state=0,
        )
        m.fit(Xg, yg)
        
        # 性别差分析
        gaps = compute_gender_gaps_at_key_quantiles(m, Xdf)
        for tau in ALL_QUANTILES:
            gap_info = gaps['gaps_by_quantile'][tau]
            gap_rows.append({
                'country': str(ctry),
                'n': n,
                'quantile': tau,
                'position': gap_info['position'],
                'male_pred': gap_info['male_pred'],
                'female_pred': gap_info['female_pred'],
                'gap_female_minus_male': gap_info['gap_female_minus_male']
            })
        
        # 门槛距离分析
        thresh_results = compute_threshold_distances(m, Xg, yg, Xdf)
        for tau in ALL_QUANTILES:
            thresh_info = thresh_results[tau]
            threshold_rows.append({
                'country': str(ctry),
                'n': n,
                'quantile': tau,
                'position': thresh_info['position'],
                'male_avg_distance': thresh_info['male_avg_distance'],
                'female_avg_distance': thresh_info['female_avg_distance'],
                'gap_female_minus_male': thresh_info['gap_female_minus_male']
            })
    
    gap_df = pd.DataFrame(gap_rows)
    threshold_df = pd.DataFrame(threshold_rows)
    
    return gap_df, threshold_df

# 执行分国分析
print("开始按国家的详细分析...")
country_gaps_df, country_thresholds_df = analyze_by_country_detailed(min_n=500)

print(f"\n完成！涵盖 {country_gaps_df['country'].nunique()} 个国家")
print("\n=== 性别差汇总（前10行）===")
print(country_gaps_df.head(10))

print("\n=== 门槛距离汇总（前10行）===")
print(country_thresholds_df.head(10))


In [None]:
# ==================== 可视化：三合一图表 ====================

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Gender Gap Analysis: Sticky Floor vs Glass Ceiling Effects', fontsize=16)

# 图1：全样本性别差曲线（左上）
ax1 = axes[0, 0]
gap_data = [(tau, overall_gaps['gaps_by_quantile'][tau]['gap_female_minus_male']) for tau in ALL_QUANTILES]
taus, gaps = zip(*gap_data)
ax1.plot(taus, gaps, 'o-', linewidth=2, markersize=8)
ax1.axhline(0, color='gray', linestyle='--', alpha=0.7)
ax1.axvspan(0.0, 0.3, color='blue', alpha=0.1, label='Bottom zone')
ax1.axvspan(0.7, 1.0, color='red', alpha=0.1, label='Top zone')
ax1.set_xlabel('Quantile (τ)')
ax1.set_ylabel('Gap (Female - Male)')
ax1.set_title('A. Overall Gender Gap Profile')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 图2：门槛距离对比（右上）
ax2 = axes[0, 1]
threshold_data = [(tau, threshold_results[tau]['gap_female_minus_male']) for tau in ALL_QUANTILES]
taus_t, gaps_t = zip(*threshold_data)
colors = ['blue' if tau in BOTTOM_QUANTILES else ('red' if tau in TOP_QUANTILES else 'green') for tau in taus_t]
bars = ax2.bar(range(len(taus_t)), gaps_t, color=colors, alpha=0.7)
ax2.axhline(0, color='gray', linestyle='--', alpha=0.7)
ax2.set_xlabel('Position')
ax2.set_ylabel('Gap in Distance to Next Level\n(Female - Male)')
ax2.set_title('B. Threshold Distance Gender Gap')
ax2.set_xticks(range(len(taus_t)))
ax2.set_xticklabels([f'τ={tau:.1f}' for tau in taus_t])
ax2.grid(True, alpha=0.3)

# 图3：按国家的性别差热图（左下）
ax3 = axes[1, 0]
if len(country_gaps_df) > 0:
    pivot_gaps = country_gaps_df.pivot(index='country', columns='quantile', values='gap_female_minus_male')
    im1 = ax3.imshow(pivot_gaps.values, aspect='auto', cmap='RdBu_r', vmin=-0.5, vmax=0.5)
    ax3.set_xticks(range(len(pivot_gaps.columns)))
    ax3.set_xticklabels([f'τ={tau:.1f}' for tau in pivot_gaps.columns])
    ax3.set_yticks(range(len(pivot_gaps.index)))
    ax3.set_yticklabels(pivot_gaps.index, fontsize=8)
    ax3.set_title('C. Gender Gap by Country')
    ax3.set_xlabel('Quantile')
    plt.colorbar(im1, ax=ax3, label='Gap (Female - Male)')
else:
    ax3.text(0.5, 0.5, 'No country data available', ha='center', va='center')
    ax3.set_title('C. Gender Gap by Country')

# 图4：按国家的门槛距离热图（右下）
ax4 = axes[1, 1]
if len(country_thresholds_df) > 0:
    pivot_thresholds = country_thresholds_df.pivot(index='country', columns='quantile', values='gap_female_minus_male')
    im2 = ax4.imshow(pivot_thresholds.values, aspect='auto', cmap='RdBu_r', vmin=-0.3, vmax=0.3)
    ax4.set_xticks(range(len(pivot_thresholds.columns)))
    ax4.set_xticklabels([f'τ={tau:.1f}' for tau in pivot_thresholds.columns])
    ax4.set_yticks(range(len(pivot_thresholds.index)))
    ax4.set_yticklabels(pivot_thresholds.index, fontsize=8)
    ax4.set_title('D. Threshold Distance Gap by Country')
    ax4.set_xlabel('Quantile')
    plt.colorbar(im2, ax=ax4, label='Gap (Female - Male)')
else:
    ax4.text(0.5, 0.5, 'No threshold data available', ha='center', va='center')
    ax4.set_title('D. Threshold Distance Gap by Country')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/gender_gap_analysis.pdf', dpi=300, bbox_inches='tight')
plt.show()

# 打印关键结论
print("=" * 60)
print("关键研究结论")
print("=" * 60)
print(f"全样本结论: {overall_gaps['conclusion'].upper()}")
print(f"  - 底部差值: {overall_gaps['bottom_gap_avg']:.3f}")
print(f"  - 顶部差值: {overall_gaps['top_gap_avg']:.3f}")
print(f"  - 差值差异: {overall_gaps['gap_diff_bottom_minus_top']:.3f}")

if len(country_gaps_df) > 0:
    # 按国家统计结论分布
    country_conclusions = []
    for country in country_gaps_df['country'].unique():
        country_data = country_gaps_df[country_gaps_df['country'] == country]
        bottom_avg = country_data[country_data['position'] == 'bottom']['gap_female_minus_male'].mean()
        top_avg = country_data[country_data['position'] == 'top']['gap_female_minus_male'].mean()
        conclusion = 'glass ceiling' if abs(top_avg) > abs(bottom_avg) else 'sticky floor'
        country_conclusions.append({'country': country, 'conclusion': conclusion, 
                                   'bottom_gap': bottom_avg, 'top_gap': top_avg})
    
    


In [None]:
# ==================== 结果保存：三个核心输出文件 ====================

# 1. 性别差分位表
gender_gap_export = []
for tau in ALL_QUANTILES:
    gap_info = overall_gaps['gaps_by_quantile'][tau]
    gender_gap_export.append({
        'analysis_level': 'overall',
        'country': 'ALL',
        'quantile': tau,
        'position': gap_info['position'],
        'male_pred': gap_info['male_pred'],
        'female_pred': gap_info['female_pred'],
        'gap_female_minus_male': gap_info['gap_female_minus_male']
    })

# 合并国家级数据
if len(country_gaps_df) > 0:
    country_gap_export = country_gaps_df.copy()
    country_gap_export['analysis_level'] = 'country'
    country_gap_export = country_gap_export.rename(columns={
        'gap_female_minus_male': 'gap_female_minus_male'
    })
    
    gap_combined = pd.concat([
        pd.DataFrame(gender_gap_export),
        country_gap_export[['analysis_level', 'country', 'quantile', 'position', 
                          'male_pred', 'female_pred', 'gap_female_minus_male']]
    ], ignore_index=True)
else:
    gap_combined = pd.DataFrame(gender_gap_export)

gap_combined.to_csv(f'{OUTPUT_DIR}/gender_gap_by_quantile.csv', index=False)
print(f"✓ 性别差分位表已保存: {OUTPUT_DIR}/gender_gap_by_quantile.csv")

# 2. 门槛距离表  
threshold_export = []
for tau in ALL_QUANTILES:
    thresh_info = threshold_results[tau]
    threshold_export.append({
        'analysis_level': 'overall',
        'country': 'ALL',
        'quantile': tau,
        'position': thresh_info['position'],
        'male_avg_distance': thresh_info['male_avg_distance'],
        'female_avg_distance': thresh_info['female_avg_distance'],
        'gap_female_minus_male': thresh_info['gap_female_minus_male']
    })

# 合并国家级门槛数据
if len(country_thresholds_df) > 0:
    country_thresh_export = country_thresholds_df.copy()
    country_thresh_export['analysis_level'] = 'country'
    
    thresh_combined = pd.concat([
        pd.DataFrame(threshold_export),
        country_thresh_export[['analysis_level', 'country', 'quantile', 'position',
                             'male_avg_distance', 'female_avg_distance', 'gap_female_minus_male']]
    ], ignore_index=True)
else:
    thresh_combined = pd.DataFrame(threshold_export)

thresh_combined.to_csv(f'{OUTPUT_DIR}/threshold_distance.csv', index=False)
print(f"✓ 门槛距离表已保存: {OUTPUT_DIR}/threshold_distance.csv")

# 3. 汇总结论表
summary_export = [{
    'analysis_level': 'overall',
    'country': 'ALL',
    'bottom_gap_avg': overall_gaps['bottom_gap_avg'],
    'top_gap_avg': overall_gaps['top_gap_avg'],
    'gap_diff_bottom_minus_top': overall_gaps['gap_diff_bottom_minus_top'],
    'conclusion': overall_gaps['conclusion'],
    'n_observations': len(y)
}]

if len(country_gaps_df) > 0 and 'country_summary' in locals():
    for _, row in country_summary.iterrows():
        summary_export.append({
            'analysis_level': 'country',
            'country': row['country'],
            'bottom_gap_avg': row['bottom_gap'],
            'top_gap_avg': row['top_gap'],
            'gap_diff_bottom_minus_top': row['bottom_gap'] - row['top_gap'],
            'conclusion': row['conclusion'],
            'n_observations': country_gaps_df[country_gaps_df['country'] == row['country']].iloc[0]['n']
        })

summary_df = pd.DataFrame(summary_export)
summary_df.to_csv(f'{OUTPUT_DIR}/analysis_summary.csv', index=False)
print(f"✓ 汇总结论表已保存: {OUTPUT_DIR}/analysis_summary.csv")

# 保存模型配置信息
config_info = {
    'model_config': model.get_config(),
    'quantiles_analyzed': ALL_QUANTILES,
    'bottom_quantiles': BOTTOM_QUANTILES,
    'top_quantiles': TOP_QUANTILES,
    'level_thresholds': LEVEL_THRESHOLDS,
    'control_variables': feature_cols,
    'sample_size_total': len(y),
    'countries_analyzed': country_gaps_df['country'].nunique() if len(country_gaps_df) > 0 else 0,
    'analysis_timestamp': pd.Timestamp.now().isoformat()
}

import json
with open(f'{OUTPUT_DIR}/analysis_config.json', 'w') as f:
    json.dump(config_info, f, indent=2, default=str)
print(f"✓ 分析配置已保存: {OUTPUT_DIR}/analysis_config.json")

print(f"\n🎉 分析完成！所有结果已保存到 '{OUTPUT_DIR}/' 目录")
print("\n核心文件清单:")
print(f"  📊 gender_gap_by_quantile.csv - 性别差分位详表")
print(f"  📏 threshold_distance.csv - 门槛距离分析详表") 
print(f"  📋 analysis_summary.csv - 粘地板/玻璃天花板结论汇总")
print(f"  📈 gender_gap_analysis.pdf - 四合一可视化图表")
print(f"  ⚙️  analysis_config.json - 模型配置与元数据")

# 最终研究结论摘要
print("\n" + "=" * 80)
print("🔍 最终研究结论摘要")
print("=" * 80)
print(f"研究问题: 分析 Y10 职级中的性别差异模式")
print(f"分析方法: FrequentistOQR (TORQUE) 有序分位数回归")
print(f"控制变量: {', '.join(feature_cols)}")
print(f"样本规模: {len(y):,} 个观测")
if len(country_gaps_df) > 0:
    print(f"国家覆盖: {country_gaps_df['country'].nunique()} 个国家")

print(f"\n全样本主要发现:")
print(f"  🎯 主要模式: {overall_gaps['conclusion'].upper()}")
if overall_gaps['conclusion'] == 'sticky floor':
    print(f"  📍 底部分位性别差更大 ({overall_gaps['bottom_gap_avg']:.3f} vs {overall_gaps['top_gap_avg']:.3f})")
    print(f"     → 女性在低职级阶段面临更大晋升障碍")
else:
    print(f"  🏢 顶部分位性别差更大 ({overall_gaps['top_gap_avg']:.3f} vs {overall_gaps['bottom_gap_avg']:.3f})")
    print(f"     → 女性在高职级阶段面临更大晋升障碍")

print(f"\n可直接用于报告的关键数据:")
for tau in ALL_QUANTILES:
    gap_info = overall_gaps['gaps_by_quantile'][tau]
    thresh_info = threshold_results[tau]
    print(f"  τ={tau:.1f} ({gap_info['position']}): 性别差={gap_info['gap_female_minus_male']:.3f}, 门槛距离差={thresh_info['gap_female_minus_male']:.3f}")


### 如何用 OQR 读五级职级的性别差异（白话版）

把 5 个职级想成一段楼梯：1→2→3→4→5。每两个级之间都有一道“门槛”。每个人心里其实有个“晋升准备度”的分数（看不见的尺子），分数越高，就越往楼梯上走。我们只看到他现在站在第几级，看不到尺子上的精细刻度。

OQR 在做什么：不是算“上不上某一级的概率”，而是看“同样条件的人里，处在底部/中部/顶部那群人的‘准备度’位置”，男女之间差多少。我们把“底/中/顶”设为三个参照位置（例如 τ=20%/50%/80%）。这三个位置是“看不见的尺子”上的段位，不等于 1..5 的类别本身。

我们怎么操作：
- 固定协变量（教育、工龄、行业、公司规模等）
- 在底/中/顶三个位置，计算“女性 − 男性”的准备度差（连续刻度，不做离散化）
- 同时，计算“离下一道门槛的距离”distance_up = (j+1) − ŷ（例如在第3级，离升到第4级还差多少）

怎么读“粘地板/玻璃天花板”：
- 粘地板 sticky floor：女性在“底部位置”的劣势更明显（底部差更负）
- 玻璃天花板 glass ceiling：女性在“顶部位置”的劣势更明显（顶部差更负）

政策解读：若底部女性更吃亏，优先做“入门能力/基础培训”；若顶部更吃亏，重点审视“高位晋升的评审流程、赞助人/导师制度”等。
