In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Colab 進行matplotlib繪圖時顯示繁體中文
# 下載台北思源黑體並命名taipei_sans_tc_beta.ttf，移至指定路徑
!wget -O TaipeiSansTCBeta-Regular.ttf https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download

import matplotlib

# 改style要在改font之前
# plt.style.use('seaborn')

matplotlib.font_manager.fontManager.addfont('TaipeiSansTCBeta-Regular.ttf')
matplotlib.rc('font', family='Taipei Sans TC Beta')

--2025-07-22 16:28:06--  https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_
Resolving drive.google.com (drive.google.com)... 172.217.203.100, 172.217.203.101, 172.217.203.138, ...
Connecting to drive.google.com (drive.google.com)|172.217.203.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_ [following]
--2025-07-22 16:28:06--  https://drive.usercontent.google.com/download?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 173.194.210.132, 2607:f8b0:400c:c0f::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|173.194.210.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20659344 (20M) [application/octet-stream]
Saving to: ‘TaipeiSansTCBeta-Regular.ttf’


2025-07-22 16:28:12 (55.5 MB/s) - ‘TaipeiSansTCBeta-Regular.ttf’ saved [20659344/20659344]


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import math

In [4]:
# ✅ 法官姓名 → 匿名代號 對照表
judge_mapping = {
    '鄭琬薇': 'PT01', '張嘉芬': 'SLE01', '許瑜容': 'CT01', '楊博欽': 'CT02', '蔡書瑜': 'KS01',
    '蔡英雌': 'KS02', '黎錦福': 'PC01', '呂佩珊': 'KS03', '洪毓良': 'KS04', '洪韻婷': 'KS05',
    '張俊文': 'CT03', '吳俞玲': 'KS06', '傅曉瑄': 'CPE01', '張瀞文': 'KS07', '李爭春': 'KS08',
    '林于心': 'KS09', '王俊彥': 'KS10', '黃致毅': 'TY01', '胡家瑋': 'KS11', '蘇品蓁': 'TY02',
    '洪韻筑': 'KS12', '郭鍵融': 'TY03', '林柏壽': 'KS13', '黃三友': 'KS14', '朱慧真': 'KS15',
    '朱盈吉': 'CT04', '高俊珊': 'TN01', '李宜穎': 'KS16', '詹尚晃': 'KS17', '黃傳堯': 'KS18',
    '莊維澤': 'KS19', '劉容妤': 'PT02', '方錦源': 'KS20', '王秀慧': 'TY04', '曾正龍': 'TP01',
    '陳嘉臨': 'CY01', '施添寶': 'KL01', '高如宜': 'TN02', '王雪君': 'KS22', '陳怡君': 'TC01',
    '楊甯伃': 'KS23', '郭德進': 'TC02', '李蕙伶': 'KS24', '曾雨明': 'TY05', '李岳': 'IL01',
    '孫于淦': 'NT01', '陳華媚': 'TY06', '鍾邦久': 'TN03', '陳美芳': 'KS25', '羅郁棣': 'TN04',
    '梁義順': 'CH01','呂明燕' : 'KS21'
}

In [5]:
pd.set_option("display.max_rows", None)

In [6]:
judgemnt_df = pd.read_csv("/content/gdrive/MyDrive/NCHU/論文/酒駕資料/2016to22_判決結構化資料_清洗整理後_有序資料編碼修正後v2_含法官_含酒駕前案紀錄.csv", encoding="utf-8", index_col=0)

In [7]:
judgemnt_df.head()

Unnamed: 0,案號,刑期,呼氣酒精濃度,是否累犯,是否坦承犯行,有無出現交通事故,有無乘客,是否為職業駕駛,交通工具,家庭經濟,教育程度,除酒駕以外的交通違規,易科罰金,併科罰金,年份,法官,法院,酒駕前案紀錄
0,"CHDM,105,交簡,156,20160130,1",3,0.36,0,1,1,0,0,4,3,3,0,1,0.0,105,林于捷,CHDM,1.0
1,"CHDM,105,交簡,6,20160114,1",6,1.22,1,1,0,0,0,2,1,2,0,1,0.0,105,張佳燉,CHDM,2.0
2,"CHDM,105,審交簡,4,20160126,1",4,0.53,0,1,0,0,0,2,0,2,2,1,20.0,105,呂美玲,CHDM,2.0
3,"CPEM,105,竹北交簡,2,20160127,1",2,0.865,0,1,1,0,0,2,0,4,0,1,0.0,105,傅曉瑄,CPEM,1.0
4,"CPEM,105,竹北交簡,30,20160127,1",4,0.27,1,1,0,2,0,2,0,3,0,1,0.0,105,傅曉瑄,CPEM,2.0


In [8]:
judgemnt_df['酒駕前案紀錄'].value_counts()

Unnamed: 0_level_0,count
酒駕前案紀錄,Unnamed: 1_level_1
1.0,14111
0.0,8532
2.0,4477
3.0,1661
4.0,510
5.0,121
6.0,38
7.0,10
8.0,5
10.0,1


In [9]:
import numpy as np
import pandas as pd

# 假設 s 是你的 Series
s = judgemnt_df['酒駕前案紀錄']

# 1. 找出最小值與最大值
vmin, vmax = s.min(), s.max()

# 2. 用 np.linspace 把 [vmin, vmax] 平均切成 4 段，需要 5 個點
bins = np.linspace(vmin, vmax, num=5)
print("等距分箱邊界：", bins)

# 3. 分箱
labels = ['Bin1','Bin2','Bin3','Bin4']
value_bins = pd.cut(s, bins=bins, labels=labels, include_lowest=True)

# 4. 看每個 bin 的數值範圍和筆數（筆數只是檢查，不影響分割邏輯）
print("\n各 Bin 範圍與筆數：")
for lab in labels:
    idx = value_bins == lab
    print(f"{lab:5s} ({bins[labels.index(lab)]:.1f}–{bins[labels.index(lab)+1]:.1f}):",
          idx.sum(), "筆")

# 5. 若只想看每個 bin 的數值區間，不看筆數：
intervals = value_bins.cat.categories
print("\n各 Bin 的區間：")
for lab, interval in zip(labels, intervals):
    print(f"{lab}: {interval}")


等距分箱邊界： [ 0.   2.5  5.   7.5 10. ]

各 Bin 範圍與筆數：
Bin1  (0.0–2.5): 27120 筆
Bin2  (2.5–5.0): 2292 筆
Bin3  (5.0–7.5): 48 筆
Bin4  (7.5–10.0): 6 筆

各 Bin 的區間：
Bin1: Bin1
Bin2: Bin2
Bin3: Bin3
Bin4: Bin4


In [10]:
data = judgemnt_df.copy()

In [11]:
#data['刑期'] >=2 <=6
data = data[(data['刑期'] >= 2) & (data['刑期'] <= 6)]

In [12]:
bins = [-float('inf'), 0.24, 0.49, 0.74, 0.99, 1.49, 1.99, 3.49, float('inf')]
labels = [1, 2, 3, 4, 5, 6, 7, 8]  # 數值對應類別

# 假設 df 中有一個欄位名稱為 '呼氣酒精濃度'
data['酒精濃度類別'] = pd.cut(data['呼氣酒精濃度'], bins=bins, labels=labels, right=True).astype(int)

In [13]:
data.head()

Unnamed: 0,案號,刑期,呼氣酒精濃度,是否累犯,是否坦承犯行,有無出現交通事故,有無乘客,是否為職業駕駛,交通工具,家庭經濟,教育程度,除酒駕以外的交通違規,易科罰金,併科罰金,年份,法官,法院,酒駕前案紀錄,酒精濃度類別
0,"CHDM,105,交簡,156,20160130,1",3,0.36,0,1,1,0,0,4,3,3,0,1,0.0,105,林于捷,CHDM,1.0,2
1,"CHDM,105,交簡,6,20160114,1",6,1.22,1,1,0,0,0,2,1,2,0,1,0.0,105,張佳燉,CHDM,2.0,5
2,"CHDM,105,審交簡,4,20160126,1",4,0.53,0,1,0,0,0,2,0,2,2,1,20.0,105,呂美玲,CHDM,2.0,3
3,"CPEM,105,竹北交簡,2,20160127,1",2,0.865,0,1,1,0,0,2,0,4,0,1,0.0,105,傅曉瑄,CPEM,1.0,4
4,"CPEM,105,竹北交簡,30,20160127,1",4,0.27,1,1,0,2,0,2,0,3,0,1,0.0,105,傅曉瑄,CPEM,2.0,2


####年份實驗(目前不做)

In [None]:
# 命名統一（建議你先 rename）
'''
df = df.rename(columns={
    '是否累犯': 'recidivism',
    '酒駕前案紀錄':'preDUI_record',
    '酒精濃度類別': 'bac_level',
    '是否坦承犯行': 'admit',
    '有無出現交通事故': 'accident',
    '有無乘客': 'passenger',
    '是否為職業駕駛': 'professional_driver',
    '交通工具': 'vehicle',
    '家庭經濟': 'income',
    '教育程度': 'education',
    '除酒駕以外的交通違規': 'other_violation',
    '刑期': 'sentence_months'
})
'''

In [None]:
df_pre = df[(df['年份'] >= 105) & (df['年份'] <= 107)]
df_post = df[(df['年份'] >= 109) & (df['年份'] <= 111)]

In [None]:
df_pre.drop(columns=['案號', '呼氣酒精濃度','易科罰金','併科罰金','年份'], inplace=True)
df_post.drop(columns=['案號', '呼氣酒精濃度','易科罰金','併科罰金','年份'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pre.drop(columns=['案號', '呼氣酒精濃度','易科罰金','併科罰金','年份'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_post.drop(columns=['案號', '呼氣酒精濃度','易科罰金','併科罰金','年份'], inplace=True)


In [None]:
df_pre.head()


Unnamed: 0,sentence_months,recidivism,admit,accident,passenger,professional_driver,vehicle,income,education,other_violation,bac_level
11,3.0,0,1,1,0,0,3,3,3,2,2
14,6.0,1,1,0,0,0,4,1,2,2,5
15,4.0,0,1,0,0,0,4,0,2,5,3
28,2.0,0,1,1,0,0,4,0,4,0,4
29,4.0,1,1,0,2,0,4,0,3,2,2


In [None]:
df_post.head()

Unnamed: 0,sentence_months,recidivism,admit,accident,passenger,professional_driver,vehicle,income,education,other_violation,bac_level
2,4.0,1,1,0,0,0,5,0,1,2,2
3,3.0,1,1,0,0,0,4,1,1,2,2
8,3.0,0,2,1,2,0,4,3,3,1,3
12,5.0,1,1,0,0,0,3,2,3,2,3
14,5.0,0,1,0,0,0,4,2,3,5,4


In [None]:
import statsmodels.formula.api as smf


# 建立回歸模型（控制變數請依你資料量調整）
formula = 'sentence_months ~ recidivism + bac_level + accident + admit + education + income'

model_pre = smf.ols(formula=formula, data=df_pre).fit()
model_post = smf.ols(formula=formula, data=df_post).fit()

print("=== 修法前（2016–2018）回歸結果 ===")
print(model_pre.summary())

print("\n=== 修法後（2020–2022）回歸結果 ===")
print(model_post.summary())


=== 修法前（2016–2018）回歸結果 ===
                            OLS Regression Results                            
Dep. Variable:        sentence_months   R-squared:                       0.431
Model:                            OLS   Adj. R-squared:                  0.431
Method:                 Least Squares   F-statistic:                     2319.
Date:                Tue, 01 Apr 2025   Prob (F-statistic):               0.00
Time:                        22:26:53   Log-Likelihood:                -25060.
No. Observations:               18361   AIC:                         5.013e+04
Df Residuals:                   18354   BIC:                         5.019e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.3781    

In [None]:

from scipy.stats import ttest_ind

In [None]:
def ttest_recidivism(group_df, label):
    rec = group_df[group_df['recidivism'] == 1]['sentence_months']
    nonrec = group_df[group_df['recidivism'] == 0]['sentence_months']
    t_stat, p_val = ttest_ind(rec, nonrec, equal_var=False)

    print(f"\n📊 {label}：累犯 vs 非累犯 T-test")
    print(f"累犯者平均刑期：{rec.mean():.2f} 月")
    print(f"非累犯者平均刑期：{nonrec.mean():.2f} 月")
    print(f"差值：{rec.mean() - nonrec.mean():.2f} 月")
    print(f"p-value：{p_val:.4f}")

ttest_recidivism(df_pre, "修法前")
ttest_recidivism(df_post, "修法後")



📊 修法前：累犯 vs 非累犯 T-test
累犯者平均刑期：4.49 月
非累犯者平均刑期：2.79 月
差值：1.69 月
p-value：0.0000

📊 修法後：累犯 vs 非累犯 T-test
累犯者平均刑期：4.54 月
非累犯者平均刑期：2.97 月
差值：1.57 月
p-value：0.0000


####2-1

##### 酒駕前按原始紀錄


法官

In [17]:
import numpy as np
import pandas as pd

df = data.copy()

# 2. 重建 case_code（用分箱後的 prior_record_bin 取代數值列）
feature_cols = [
    '酒精濃度類別', '是否累犯', '酒駕前案紀錄', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)

# 重命名、保留必要欄位
df = df.rename(columns={'刑期': 'sentence', '法官': 'judge', '法院': 'court'})

# 最小樣本門檻（每個法院對同一組特徵組合 ≥ 20 筆）
MIN_N = 20
records_court_test = []

# ✅ 對每個法院的每個案件組合進行檢查
for (court, case_code), sub in df.groupby(['court', 'case_code']):
    if len(sub) < MIN_N:
        continue

    sentences = sub['sentence'].values
    n_unique = len(np.unique(sentences))
    std = np.std(sentences)
    q75, q25 = np.percentile(sentences, 75), np.percentile(sentences, 25)
    iqr = q75 - q25

    records_court_test.append({
        'court': court,
        'case_code': case_code,
        'n_total': len(sub),
        'n_unique_sentences': n_unique,
        'std_dev': std,
        'iqr': iqr,
        'fully_consistent': (n_unique == 1)
    })

# 建立結果表
df_court_test = pd.DataFrame(records_court_test)
df_court_test.to_csv('酒精原始紀錄_court_consistency_summary.csv', index=False, encoding='UTF-8-Sig')

# ✅ 最終輸出統計摘要
print(f"實驗2-1（酒精原始紀錄_法院版本）：共 {len(df_court_test)} 筆檢定結果")
print(f"完全一致（所有刑期相同）：{df_court_test['fully_consistent'].sum()} 筆")
print("刑期唯一值數量分布：")
print(df_court_test['n_unique_sentences'].value_counts().sort_index())
print("\n標準差與 IQR 描述統計：")
print(df_court_test[['std_dev', 'iqr']].describe())


實驗2-1（酒精原始紀錄_法院版本）：共 96 筆檢定結果
完全一致（所有刑期相同）：11 筆
刑期唯一值數量分布：
n_unique_sentences
1    11
2    25
3    23
4    31
5     6
Name: count, dtype: int64

標準差與 IQR 描述統計：
         std_dev        iqr
count  96.000000  96.000000
mean    0.549903   0.643229
std     0.308483   0.583484
min     0.000000   0.000000
25%     0.292028   0.000000
50%     0.606199   1.000000
75%     0.800422   1.000000
max     1.047664   2.000000


法官

In [18]:
import numpy as np
import pandas as pd

df = data.copy()

# 2. case_code
feature_cols = [
    '酒精濃度類別', '是否累犯', '酒駕前案紀錄', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)

# 重命名、保留必要欄位
df = df.rename(columns={'刑期': 'sentence', '法官': 'judge', '法院': 'court'})


# 最小樣本門檻（每位法官對同一組特徵組合 ≥ 20 筆）
MIN_N = 20
records_self_test = []

# ✅ 對每位法官的每個案件組合進行檢查
for (judge, case_code), sub in df.groupby(['judge', 'case_code']):
    if len(sub) < MIN_N:
        continue

    sentences = sub['sentence'].values
    n_unique = len(np.unique(sentences))
    std = np.std(sentences)
    q75, q25 = np.percentile(sentences, 75), np.percentile(sentences, 25)
    iqr = q75 - q25

    records_self_test.append({
        'judge': judge,
        'case_code': case_code,
        'n_total': len(sub),
        'n_unique_sentences': n_unique,
        'std_dev': std,
        'iqr': iqr,
        'fully_consistent': (n_unique == 1)
    })

# 建立結果表
df_self_test = pd.DataFrame(records_self_test)
df_self_test.to_csv('酒精原始紀錄_self_consistency_summary.csv', index=False, encoding='UTF-8-Sig')

# ✅ 最終輸出統計摘要
print(f"酒精原始紀錄_實驗2-1：共 {len(df_self_test)} 筆檢定結果")
print(f"完全一致（酒精原始紀錄_所有刑期相同）：{df_self_test['fully_consistent'].sum()} 筆")
print("刑期唯一值數量分布：")
print(df_self_test['n_unique_sentences'].value_counts().sort_index())
print("\n標準差與 IQR 描述統計：")
print(df_self_test[['std_dev', 'iqr']].describe())


酒精原始紀錄_實驗2-1：共 22 筆檢定結果
完全一致（酒精原始紀錄_所有刑期相同）：9 筆
刑期唯一值數量分布：
n_unique_sentences
1    9
2    8
3    3
4    1
5    1
Name: count, dtype: int64

標準差與 IQR 描述統計：
         std_dev        iqr
count  22.000000  22.000000
mean    0.254054   0.147727
std     0.277869   0.350672
min     0.000000   0.000000
25%     0.000000   0.000000
50%     0.223762   0.000000
75%     0.426882   0.000000
max     1.005653   1.000000


In [20]:
import pandas as pd
from scipy.stats import ks_2samp, ttest_ind
from itertools import combinations

# 假設 df 已載入，case_code 已建好，且已 rename
# 保證 df 有 ['sentence','judge','court','case_code']

MIN_N = 20
pair_records = []

for code, group in df.groupby('case_code'):
    # 僅留每位法官在此 code 下判決 >= MIN_N 次
    judges_ok = [j for j, g in group.groupby('judge') if len(g) >= MIN_N]
    if len(judges_ok) < 2:
        continue

    for j1, j2 in combinations(judges_ok, 2):
        g1 = group.loc[group['judge']==j1, 'sentence']
        g2 = group.loc[group['judge']==j2, 'sentence']
        # 再次確認
        if len(g1) < MIN_N or len(g2) < MIN_N:
            continue

        ks_p = ks_2samp(g1, g2).pvalue
        t_p = ttest_ind(g1, g2, equal_var=False).pvalue

        # 取出法院
        c1 = group.loc[group['judge']==j1, 'court'].iat[0]
        c2 = group.loc[group['judge']==j2, 'court'].iat[0]

        pair_records.append({
            'case_code': code,
            'judge1': j1, 'n1': len(g1), 'court1': c1,
            'judge2': j2, 'n2': len(g2), 'court2': c2,
            'same_court': (c1 == c2),
            'ks_p': ks_p,
            't_p': t_p
        })

df_pair = pd.DataFrame(pair_records)
df_same_court = df_pair[df_pair['same_court']].copy()
df_diff_court = df_pair[~df_pair['same_court']].copy()

df_same_court.to_csv('judge_vs_same_court.csv', index=False)
df_diff_court.to_csv('judge_vs_diff_court.csv', index=False)

print("同院比較：{} 筆；異院比較：{} 筆".format(len(df_same_court), len(df_diff_court)))


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


同院比較：10 筆；異院比較：6 筆


In [21]:
df_same_court

Unnamed: 0,case_code,judge1,n1,court1,judge2,n2,court2,same_court,ks_p,t_p
0,2-0-0.0-1-0-0-0-2-0-0-0,呂明燕,22,KSDM,李爭春,22,KSDM,True,1.0,
1,2-0-0.0-1-0-0-0-2-0-0-0,呂明燕,22,KSDM,蔡書瑜,27,KSDM,True,0.184775,0.002748
2,2-0-0.0-1-0-0-0-2-0-0-0,呂明燕,22,KSDM,陳中和,32,KSDM,True,1.0,
3,2-0-0.0-1-0-0-0-2-0-0-0,李爭春,22,KSDM,蔡書瑜,27,KSDM,True,0.184775,0.002748
4,2-0-0.0-1-0-0-0-2-0-0-0,李爭春,22,KSDM,陳中和,32,KSDM,True,1.0,
5,2-0-0.0-1-0-0-0-2-0-0-0,蔡書瑜,27,KSDM,陳中和,32,KSDM,True,0.118013,0.002748
8,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,陳嘉瑜,24,PTDM,True,1.0,
9,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,黃紀錄,35,PTDM,True,0.999984,0.160287
10,2-0-1.0-1-0-2-0-2-0-0-0,莊維澤,28,KSDM,陳中和,27,KSDM,True,0.890268,0.283791
15,2-0-1.0-1-0-2-0-2-0-0-0,陳嘉瑜,24,PTDM,黃紀錄,35,PTDM,True,1.0,0.160287


In [22]:
df_diff_court

Unnamed: 0,case_code,judge1,n1,court1,judge2,n2,court2,same_court,ks_p,t_p
6,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,莊維澤,28,KSDM,False,2.68081e-15,1.345558e-08
7,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,陳中和,27,KSDM,False,1.924817e-16,1.171059e-11
11,2-0-1.0-1-0-2-0-2-0-0-0,莊維澤,28,KSDM,陳嘉瑜,24,PTDM,False,1.843358e-10,1.345558e-08
12,2-0-1.0-1-0-2-0-2-0-0-0,莊維澤,28,KSDM,黃紀錄,35,PTDM,False,1.907513e-10,2.945241e-08
13,2-0-1.0-1-0-2-0-2-0-0-0,陳中和,27,KSDM,陳嘉瑜,24,PTDM,False,3.352906e-11,1.171059e-11
14,2-0-1.0-1-0-2-0-2-0-0-0,陳中和,27,KSDM,黃紀錄,35,PTDM,False,3.315906e-11,8.146846e-12


In [23]:
alpha = 0.05
df_pair['consistent'] = (
    (df_pair['ks_p'] >= alpha) &
    ((df_pair['t_p'] >= alpha) | df_pair['t_p'].isna())
)


In [24]:
df_pair

Unnamed: 0,case_code,judge1,n1,court1,judge2,n2,court2,same_court,ks_p,t_p,consistent
0,2-0-0.0-1-0-0-0-2-0-0-0,呂明燕,22,KSDM,李爭春,22,KSDM,True,1.0,,True
1,2-0-0.0-1-0-0-0-2-0-0-0,呂明燕,22,KSDM,蔡書瑜,27,KSDM,True,0.1847746,0.002748245,False
2,2-0-0.0-1-0-0-0-2-0-0-0,呂明燕,22,KSDM,陳中和,32,KSDM,True,1.0,,True
3,2-0-0.0-1-0-0-0-2-0-0-0,李爭春,22,KSDM,蔡書瑜,27,KSDM,True,0.1847746,0.002748245,False
4,2-0-0.0-1-0-0-0-2-0-0-0,李爭春,22,KSDM,陳中和,32,KSDM,True,1.0,,True
5,2-0-0.0-1-0-0-0-2-0-0-0,蔡書瑜,27,KSDM,陳中和,32,KSDM,True,0.1180126,0.002748245,False
6,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,莊維澤,28,KSDM,False,2.68081e-15,1.345558e-08,False
7,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,陳中和,27,KSDM,False,1.924817e-16,1.171059e-11,False
8,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,陳嘉瑜,24,PTDM,True,1.0,,True
9,2-0-1.0-1-0-2-0-2-0-0-0,涂裕洪,61,PTDM,黃紀錄,35,PTDM,True,0.9999839,0.1602866,True


##### 酒駕前案紀錄做分箱

法院

In [25]:
import numpy as np
import pandas as pd

df = data.copy()

# 1. 對「酒駕前案紀錄」做等距 4 段分箱
s = df['酒駕前案紀錄']
vmin, vmax = s.min(), s.max()
bins = np.linspace(vmin, vmax, num=5)  # 5 個邊界，4 段
labels = ['Bin1', 'Bin2', 'Bin3', 'Bin4']
df['prior_record_bin'] = pd.cut(s, bins=bins, labels=labels, include_lowest=True)

print("分箱邊界：", bins)
print("各 Bin 範圍：", df['prior_record_bin'].cat.categories)

# 2. 重建 case_code（用分箱後的 prior_record_bin 取代數值列）
feature_cols = [
    '酒精濃度類別', '是否累犯', 'prior_record_bin', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)

# 重命名、保留必要欄位
df = df.rename(columns={'刑期': 'sentence', '法官': 'judge', '法院': 'court'})
df = df[['sentence', 'judge', 'court', 'case_code'] + feature_cols]

# 最小樣本門檻（每個法院對同一組特徵組合 ≥ 20 筆）
MIN_N = 20
records_court_test = []

# ✅ 對每個法院的每個案件組合進行檢查
for (court, case_code), sub in df.groupby(['court', 'case_code']):
    if len(sub) < MIN_N:
        continue

    sentences = sub['sentence'].values
    n_unique = len(np.unique(sentences))
    std = np.std(sentences)
    q75, q25 = np.percentile(sentences, 75), np.percentile(sentences, 25)
    iqr = q75 - q25

    records_court_test.append({
        'court': court,
        'case_code': case_code,
        'n_total': len(sub),
        'n_unique_sentences': n_unique,
        'std_dev': std,
        'iqr': iqr,
        'fully_consistent': (n_unique == 1)
    })

# 建立結果表
df_court_test = pd.DataFrame(records_court_test)
df_court_test.to_csv('court_consistency_summary.csv', index=False, encoding='UTF-8-Sig')

# ✅ 最終輸出統計摘要
print(f"實驗2-1（法院版本）：共 {len(df_court_test)} 筆檢定結果")
print(f"完全一致（所有刑期相同）：{df_court_test['fully_consistent'].sum()} 筆")
print("刑期唯一值數量分布：")
print(df_court_test['n_unique_sentences'].value_counts().sort_index())
print("\n標準差與 IQR 描述統計：")
print(df_court_test[['std_dev', 'iqr']].describe())


分箱邊界： [ 0.   2.5  5.   7.5 10. ]
各 Bin 範圍： Index(['Bin1', 'Bin2', 'Bin3', 'Bin4'], dtype='object')
實驗2-1（法院版本）：共 147 筆檢定結果
完全一致（所有刑期相同）：0 筆
刑期唯一值數量分布：
n_unique_sentences
2    14
3    49
4    57
5    27
Name: count, dtype: int64

標準差與 IQR 描述統計：
          std_dev         iqr
count  147.000000  147.000000
mean     0.763303    0.967687
std      0.224906    0.589776
min      0.141865    0.000000
25%      0.592066    1.000000
50%      0.808245    1.000000
75%      0.952589    1.000000
max      1.200000    2.000000


法官

In [26]:
import numpy as np
import pandas as pd

df = data.copy()

# 1. 對「酒駕前案紀錄」做等距 4 段分箱
s = df['酒駕前案紀錄']
vmin, vmax = s.min(), s.max()
bins = np.linspace(vmin, vmax, num=5)  # 5 個邊界，4 段
labels = ['Bin1', 'Bin2', 'Bin3', 'Bin4']
df['prior_record_bin'] = pd.cut(s, bins=bins, labels=labels, include_lowest=True)

print("分箱邊界：", bins)
print("各 Bin 範圍：", df['prior_record_bin'].cat.categories)

# 2. 重建 case_code（用分箱後的 prior_record_bin 取代數值列）
feature_cols = [
    '酒精濃度類別', '是否累犯', 'prior_record_bin', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)

# 重命名、保留必要欄位
df = df.rename(columns={'刑期': 'sentence', '法官': 'judge', '法院': 'court'})
df = df[['sentence', 'judge', 'court', 'case_code'] + feature_cols]

# 最小樣本門檻（每位法官對同一組特徵組合 ≥ 20 筆）
MIN_N = 20
records_self_test = []

# ✅ 對每位法官的每個案件組合進行檢查
for (judge, case_code), sub in df.groupby(['judge', 'case_code']):
    if len(sub) < MIN_N:
        continue

    sentences = sub['sentence'].values
    n_unique = len(np.unique(sentences))
    std = np.std(sentences)
    q75, q25 = np.percentile(sentences, 75), np.percentile(sentences, 25)
    iqr = q75 - q25

    records_self_test.append({
        'judge': judge,
        'case_code': case_code,
        'n_total': len(sub),
        'n_unique_sentences': n_unique,
        'std_dev': std,
        'iqr': iqr,
        'fully_consistent': (n_unique == 1)
    })

# 建立結果表
df_self_test = pd.DataFrame(records_self_test)
df_self_test.to_csv('self_consistency_summary.csv', index=False, encoding='UTF-8-Sig')

# ✅ 最終輸出統計摘要
print(f"實驗2-1：共 {len(df_self_test)} 筆檢定結果")
print(f"完全一致（所有刑期相同）：{df_self_test['fully_consistent'].sum()} 筆")
print("刑期唯一值數量分布：")
print(df_self_test['n_unique_sentences'].value_counts().sort_index())
print("\n標準差與 IQR 描述統計：")
print(df_self_test[['std_dev', 'iqr']].describe())


分箱邊界： [ 0.   2.5  5.   7.5 10. ]
各 Bin 範圍： Index(['Bin1', 'Bin2', 'Bin3', 'Bin4'], dtype='object')
實驗2-1：共 44 筆檢定結果
完全一致（所有刑期相同）：5 筆
刑期唯一值數量分布：
n_unique_sentences
1     5
2    11
3    15
4    11
5     2
Name: count, dtype: int64

標準差與 IQR 描述統計：
         std_dev        iqr
count  44.000000  44.000000
mean    0.529953   0.482955
std     0.300160   0.509781
min     0.000000   0.000000
25%     0.393169   0.000000
50%     0.538462   0.250000
75%     0.706143   1.000000
max     1.223901   1.750000


####2-2

##### 酒駕前案原始

法院

In [36]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, ttest_ind
from itertools import combinations

# 假設 df 已經載入
df = data.copy()

# 2. 建立特徵組合碼
feature_cols = [
    '酒精濃度類別', '是否累犯', '酒駕前案紀錄', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)

# 重命名欄位
df = df.rename(columns={'刑期': 'sentence', '法官': 'judge', '法院': 'court'})

# 3. 法院層級實驗 2-2：相同案件特徵下，不同法院的量刑差異檢定
MIN_N_COURT = 20
court_records = []

for code, group in df.groupby('case_code'):
    courts_ok = [c for c, g in group.groupby('court') if len(g) >= MIN_N_COURT]
    if len(courts_ok) < 2:
        continue

    for c1, c2 in combinations(courts_ok, 2):
        s1 = group.loc[group['court'] == c1, 'sentence']
        s2 = group.loc[group['court'] == c2, 'sentence']
        if len(s1) < MIN_N_COURT or len(s2) < MIN_N_COURT:
            continue

        ks_p = ks_2samp(s1, s2).pvalue
        t_p = ttest_ind(s1, s2, equal_var=False).pvalue
        ks_pass = ks_p < 0.05
        t_pass = t_p < 0.05
        both_pass = ks_pass and t_pass

        court_records.append({
            'case_code': code,
            'court1': c1, 'n1': len(s1),
            'court2': c2, 'n2': len(s2),
            'ks_p': ks_p,
            't_p': t_p,
            'ks_pass': ks_pass,
            't_pass': t_pass,
            'both_pass': both_pass
        })

df_court = pd.DataFrame(court_records)
df_court.to_csv('酒駕前案原始_court_vs_court_casecode.csv', index=False, encoding='utf-8-Sig')

# 統計輸出
print(f"實驗2-2（法院版）：共 {len(df_court)} 筆檢定結果")
print("p 值描述統計（KS檢定 與 t檢定）：")
print(df_court[['ks_p', 't_p']].describe())


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


實驗2-2（法院版）：共 60 筆檢定結果
p 值描述統計（KS檢定 與 t檢定）：
               ks_p           t_p
count  6.000000e+01  5.600000e+01
mean   6.853476e-01  3.403078e-01
std    4.074396e-01  2.995103e-01
min    1.757913e-31  7.783381e-35
25%    3.034371e-01  5.848572e-02
50%    9.592389e-01  3.107153e-01
75%    9.999985e-01  5.207373e-01
max    1.000000e+00  9.669650e-01


法官

In [None]:
def plot_judge_pair(
    s1, s2,
    j1_code, j2_code,  # 匿名代號
    c1, c2, case_code,
    ks_p, t_p,
    plot_dir="/content/gdrive/MyDrive/NCHU/論文/實驗/實驗2-2可視化的圖/",
    bw_adjust=0.3,
    xlim=(0, 7),       # ✅ 固定 x 軸範圍
    ylim=(0, 10)      # ✅ 固定 y 軸高度上限
):
    os.makedirs(plot_dir, exist_ok=True)
    plt.figure(figsize=(8, 5))

    def safe_kdeplot(data, label, color):
        if len(np.unique(data)) >= 2:
            sns.kdeplot(data, fill=True, color=color, alpha=0.3,
                        linewidth=2, bw_adjust=bw_adjust, label=label)
        else:
            sns.histplot(data, bins=5, stat="density", color=color,
                         alpha=0.3, label=label + " (hist)")

    label1 = f"{c1} - {j1_code} (n={len(s1)})"
    label2 = f"{c2} - {j2_code} (n={len(s2)})"

    safe_kdeplot(s1, label1, color='blue')
    plt.axvline(s1.mean(), color='blue', linestyle='--', linewidth=1.5, label=f"Mean {j1_code}: {s1.mean():.2f}")

    safe_kdeplot(s2, label2, color='orange')
    plt.axvline(s2.mean(), color='orange', linestyle='--', linewidth=1.5, label=f"Mean {j2_code}: {s2.mean():.2f}")

    plt.title(f"案件特徵： {case_code}\nKS p={ks_p:.3f}, T-test p={t_p:.3f}")
    plt.xlabel("量刑刑期 (月份)")
    plt.ylabel("機率密度")
    plt.legend(loc='upper right')
    plt.tight_layout()

    # ✅ 固定軸範圍
    plt.xlim(*xlim)
    plt.ylim(*ylim)

    plot_filename = f"{plot_dir}/{case_code}_{j1_code}_vs_{j2_code}.png"
    plt.savefig(plot_filename)
    plt.close()
    return plot_filename


In [None]:
import numpy as np
from scipy.stats import ks_2samp, ttest_ind
from itertools import combinations
import matplotlib.pyplot as plt
import os


# === 主體邏輯開始 ===
MIN_N_PAIR = 20
df = data.copy()

feature_cols = [
    '酒精濃度類別', '是否累犯', '酒駕前案紀錄', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)
df = df.rename(columns={'刑期':'sentence','法官':'judge','法院':'court'})

pair_records = []
for code, group in df.groupby('case_code'):
    judges_ok = [j for j in group['judge'].unique() if j in judge_mapping and (group['judge'] == j).sum() >= MIN_N_PAIR]
    if len(judges_ok) < 2:
        continue
    for j1, j2 in combinations(judges_ok, 2):
        if j1 not in judge_mapping or j2 not in judge_mapping:
            continue  # ❗ 只處理有對照表的法官

        g1 = group[group['judge'] == j1]
        g2 = group[group['judge'] == j2]
        s1 = g1['sentence']
        s2 = g2['sentence']
        if len(s1) < MIN_N_PAIR or len(s2) < MIN_N_PAIR:
            continue

        ks_p = ks_2samp(s1, s2).pvalue
        t_p = ttest_ind(s1, s2, equal_var=False).pvalue
        ks_pass = ks_p < 0.05
        t_pass = t_p < 0.05
        both_pass = ks_pass and t_pass
        c1 = g1['court'].iloc[0]
        c2 = g2['court'].iloc[0]

        j1_code = judge_mapping[j1]
        j2_code = judge_mapping[j2]

        plot_file = plot_judge_pair(
            s1, s2,
            j1_code=j1_code, j2_code=j2_code,
            c1=c1, c2=c2,
            case_code=code,
            ks_p=ks_p, t_p=t_p,
        )

        pair_records.append({
            'case_code': code,
            'judge1': j1, 'n1': len(s1), 'court1': c1,  # ✅ 保留真名
            'judge2': j2, 'n2': len(s2), 'court2': c2,
            'judge1_code': j1_code, 'judge2_code': j2_code,  # ✅ 匿名欄位
            'same_court': (c1 == c2),
            'ks_p': ks_p, 't_p': t_p,
            'ks_pass': ks_pass, 't_pass': t_pass,
            'both_pass': both_pass,
            'plot_file': plot_file
        })


# === 輸出結果表格 ===
df_pair = pd.DataFrame(pair_records)
df_pair.to_csv('/content/gdrive/MyDrive/NCHU/論文/實驗/實驗2-2/實驗2-2_法官組對檢定結果.csv', index=False, encoding='utf-8-Sig')
print(f"繪圖完成，共產出 {len(df_pair)} 張圖")


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


繪圖完成，共產出 3 張圖


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import levene, ks_2samp, ttest_ind
from itertools import combinations

# 假設 df 已經載入，並含有欄位：
# '刑期','法官','法院','酒駕前案紀錄',以及其他那些分類特徵

df = data.copy()


# 2. 重建 case_code（用分箱後的 prior_record_bin 取代數值列）
feature_cols = [
    '酒精濃度類別', '是否累犯', '酒駕前案紀錄', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)

# 重命名、保留必要欄位
df = df.rename(columns={'刑期':'sentence','法官':'judge','法院':'court'})

# 4. 實驗2-2：法官間同院／異院比較
MIN_N_PAIR = 20
pair_records = []
for code, group in df.groupby('case_code'):
    judges_ok = [j for j, g in group.groupby('judge') if len(g) >= MIN_N_PAIR]
    if len(judges_ok) < 2:
        continue
    for j1, j2 in combinations(judges_ok, 2):
        s1 = group.loc[group['judge']==j1, 'sentence']
        s2 = group.loc[group['judge']==j2, 'sentence']
        if len(s1)<MIN_N_PAIR or len(s2)<MIN_N_PAIR:
            continue
        ks_p = ks_2samp(s1, s2).pvalue
        t_p = ttest_ind(s1, s2, equal_var=False).pvalue
        ks_pass = ks_p < 0.05
        t_pass = t_p < 0.05
        both_pass = ks_pass and t_pass
        c1 = group.loc[group['judge']==j1, 'court'].iat[0]
        c2 = group.loc[group['judge']==j2, 'court'].iat[0]
        pair_records.append({
            'case_code': code,
            'judge1': j1, 'n1': len(s1), 'court1': c1,
            'judge2': j2, 'n2': len(s2), 'court2': c2,
            'same_court': (c1==c2),
            'ks_p': ks_p, 't_p': t_p,
            'ks_pass': ks_pass, 't_pass': t_pass, 'both_pass': both_pass
        })
df_pair = pd.DataFrame(pair_records)
df_same = df_pair[df_pair['same_court']]
df_diff = df_pair[~df_pair['same_court']]
df_same.to_csv('酒駕前案原始_judge_vs_same_court.csv', index=False, encoding='utf-8-Sig')
df_diff.to_csv('酒駕前案原始_judge_vs_diff_court.csv', index=False, encoding='utf-8-Sig')
print(f"酒駕前案原始紀錄_實驗2-2：同院 {len(df_same)} 筆，異院 {len(df_diff)} 筆")


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


酒駕前案原始紀錄_實驗2-2：同院 10 筆，異院 6 筆


#####分箱以後

法院

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, ttest_ind
from itertools import combinations

# 假設 df 已經載入
df = data.copy()

# 1. 酒駕前案紀錄分箱
s = df['酒駕前案紀錄']
vmin, vmax = s.min(), s.max()
bins = np.linspace(vmin, vmax, num=5)
labels = ['Bin1', 'Bin2', 'Bin3', 'Bin4']
df['prior_record_bin'] = pd.cut(s, bins=bins, labels=labels, include_lowest=True)

print("分箱邊界：", bins)
print("各 Bin 範圍：", df['prior_record_bin'].cat.categories)

# 2. 建立特徵組合碼
feature_cols = [
    '酒精濃度類別', '是否累犯', 'prior_record_bin', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)

# 重命名欄位
df = df.rename(columns={'刑期': 'sentence', '法官': 'judge', '法院': 'court'})
df = df[['sentence', 'judge', 'court', 'case_code'] + feature_cols]

# 3. 法院層級實驗 2-2：相同案件特徵下，不同法院的量刑差異檢定
MIN_N_COURT = 20
court_records = []

for code, group in df.groupby('case_code'):
    courts_ok = [c for c, g in group.groupby('court') if len(g) >= MIN_N_COURT]
    if len(courts_ok) < 2:
        continue

    for c1, c2 in combinations(courts_ok, 2):
        s1 = group.loc[group['court'] == c1, 'sentence']
        s2 = group.loc[group['court'] == c2, 'sentence']
        if len(s1) < MIN_N_COURT or len(s2) < MIN_N_COURT:
            continue

        ks_p = ks_2samp(s1, s2).pvalue
        t_p = ttest_ind(s1, s2, equal_var=False).pvalue

        ks_pass = ks_p < 0.05
        t_pass = t_p < 0.05
        both_pass = ks_pass and t_pass

        court_records.append({
            'case_code': code,
            'court1': c1, 'n1': len(s1),
            'court2': c2, 'n2': len(s2),
            'ks_p': ks_p,
            't_p': t_p,
            'ks_pass': ks_pass,
            't_pass': t_pass,
            'both_pass': both_pass
        })

df_court = pd.DataFrame(court_records)
df_court.to_csv('酒駕前案分箱後_court_vs_court_casecode.csv', index=False, encoding='utf-8-Sig')

# 統計輸出
print(f"實驗2-2（法院版）：共 {len(df_court)} 筆檢定結果")
print("p 值描述統計（KS檢定 與 t檢定）：")
print(df_court[['ks_p', 't_p']].describe())


分箱邊界： [ 0.   2.5  5.   7.5 10. ]
各 Bin 範圍： Index(['Bin1', 'Bin2', 'Bin3', 'Bin4'], dtype='object')
實驗2-2（法院版）：共 150 筆檢定結果
p 值描述統計（KS檢定 與 t檢定）：
               ks_p           t_p
count  1.500000e+02  1.500000e+02
mean   5.014342e-01  3.168061e-01
std    4.071889e-01  3.211430e-01
min    1.308046e-20  4.721006e-34
25%    1.012949e-01  2.149796e-02
50%    4.570524e-01  1.884128e-01
75%    9.700485e-01  6.004178e-01
max    1.000000e+00  1.000000e+00


法官

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import levene, ks_2samp, ttest_ind
from itertools import combinations

# 假設 df 已經載入，並含有欄位：
# '刑期','法官','法院','酒駕前案紀錄',以及其他那些分類特徵

df = data.copy()

# 分箱
s = df['酒駕前案紀錄']
bins = np.linspace(s.min(), s.max(), num=5)
labels = ['Bin1','Bin2','Bin3','Bin4']
df['prior_record_bin'] = pd.cut(s, bins=bins, labels=labels, include_lowest=True)

# 重建 case_code
feature_cols = [
    '酒精濃度類別', '是否累犯', 'prior_record_bin', '是否坦承犯行',
    '有無出現交通事故', '有無乘客', '是否為職業駕駛',
    '交通工具', '家庭經濟', '教育程度', '除酒駕以外的交通違規'
]
df['case_code'] = df[feature_cols].astype(str).agg('-'.join, axis=1)
df = df.rename(columns={'刑期':'sentence','法官':'judge','法院':'court'})
df = df[['sentence','judge','court','case_code'] + feature_cols]

# 開始逐案檢定
MIN_N_PAIR = 20
pair_records = []
for code, group in df.groupby('case_code'):
    judges_ok = [j for j in group['judge'].unique() if j in judge_mapping and (group['judge'] == j).sum() >= MIN_N_PAIR]
    if len(judges_ok) < 2:
        continue
    for j1, j2 in combinations(judges_ok, 2):
        g1 = group[group['judge'] == j1]
        g2 = group[group['judge'] == j2]
        s1 = g1['sentence']
        s2 = g2['sentence']
        ks_p = ks_2samp(s1, s2).pvalue
        t_p = ttest_ind(s1, s2, equal_var=False).pvalue
        ks_pass = ks_p < 0.05
        t_pass = t_p < 0.05
        both_pass = ks_pass and t_pass
        c1 = g1['court'].iloc[0]
        c2 = g2['court'].iloc[0]
        j1_code = judge_mapping[j1]
        j2_code = judge_mapping[j2]
        plot_file = plot_judge_pair(
            s1, s2,
            j1_code=j1_code, j2_code=j2_code,
            c1=c1, c2=c2,
            case_code=code,
            ks_p=ks_p, t_p=t_p
        )
        pair_records.append({
            'case_code': code,
            'judge1': j1, 'n1': len(s1), 'court1': c1,
            'judge2': j2, 'n2': len(s2), 'court2': c2,
            'judge1_code': j1_code, 'judge2_code': j2_code,
            'same_court': (c1 == c2),
            'ks_p': ks_p, 't_p': t_p,
            'ks_pass': ks_pass, 't_pass': t_pass, 'both_pass': both_pass,
            'plot_file': plot_file
        })

df_pair = pd.DataFrame(pair_records)
df_same = df_pair[df_pair['same_court']]
df_diff = df_pair[~df_pair['same_court']]
df_same.to_csv('/content/gdrive/MyDrive/NCHU/論文/實驗/實驗2-2/酒駕前案分箱後_judge_vs_same_court.csv', index=False, encoding='utf-8-Sig')
df_diff.to_csv('/content/gdrive/MyDrive/NCHU/論文/實驗/實驗2-2/酒駕前案分箱後_judge_vs_diff_court.csv', index=False, encoding='utf-8-Sig')
print(f"實驗2-2：同院 {len(df_same)} 筆，異院 {len(df_diff)} 筆")


實驗2-2：同院 17 筆，異院 11 筆


In [None]:
df_self

Unnamed: 0,judge,case_code,n,n_unique,consistent_self
0,吳俞玲,2-0-Bin1-1-0-0-0-2-2-3-0,20,2,False
1,呂明燕,2-0-Bin1-1-0-0-0-2-0-0-0,30,3,False
2,張嘉芬,2-0-Bin1-1-0-0-0-2-2-2-0,25,2,False
3,張嘉芬,2-0-Bin1-1-0-0-0-2-2-3-0,36,2,False
4,張嘉芬,2-0-Bin1-1-0-0-0-2-3-3-0,24,2,False
5,徐蘭萍,2-0-Bin1-1-0-0-0-0-0-0-0,22,5,False
6,施添寶,2-0-Bin1-1-0-0-0-2-1-0-0,21,1,True
7,曾正龍,2-0-Bin1-1-0-2-2-0-0-0-0,24,4,False
8,曾正龍,2-0-Bin1-1-0-2-2-2-0-0-0,46,3,False
9,曾正龍,3-0-Bin1-1-0-2-2-2-0-0-0,20,3,False


In [None]:
df_same

Unnamed: 0,case_code,judge1,n1,court1,judge2,n2,court2,same_court,ks_p,t_p
0,2-0-Bin1-1-0-0-0-2-0-0-0,呂明燕,30,KSDM,李爭春,46,KSDM,True,0.383213,0.0269649
1,2-0-Bin1-1-0-0-0-2-0-0-0,呂明燕,30,KSDM,胡家瑋,25,KSDM,True,0.008054,0.003096756
2,2-0-Bin1-1-0-0-0-2-0-0-0,呂明燕,30,KSDM,莊珮君,21,KSDM,True,2.4e-05,0.0007908306
3,2-0-Bin1-1-0-0-0-2-0-0-0,呂明燕,30,KSDM,蔡書瑜,59,KSDM,True,3.1e-05,7.73942e-07
4,2-0-Bin1-1-0-0-0-2-0-0-0,呂明燕,30,KSDM,陳中和,48,KSDM,True,0.99795,0.3757495
6,2-0-Bin1-1-0-0-0-2-0-0-0,李爭春,46,KSDM,胡家瑋,25,KSDM,True,0.298149,0.8141112
7,2-0-Bin1-1-0-0-0-2-0-0-0,李爭春,46,KSDM,莊珮君,21,KSDM,True,0.004874,0.0810251
8,2-0-Bin1-1-0-0-0-2-0-0-0,李爭春,46,KSDM,蔡書瑜,59,KSDM,True,0.010896,0.1020294
9,2-0-Bin1-1-0-0-0-2-0-0-0,李爭春,46,KSDM,陳中和,48,KSDM,True,0.833053,0.1005396
11,2-0-Bin1-1-0-0-0-2-0-0-0,胡家瑋,25,KSDM,莊珮君,21,KSDM,True,0.607859,0.08889719


In [None]:
df_diff

Unnamed: 0,case_code,judge1,n1,court1,judge2,n2,court2,same_court,ks_p,t_p
5,2-0-Bin1-1-0-0-0-2-0-0-0,呂明燕,30,KSDM,陳嘉瑜,26,PTDM,False,0.7568024,0.03135169
10,2-0-Bin1-1-0-0-0-2-0-0-0,李爭春,46,KSDM,陳嘉瑜,26,PTDM,False,0.01509836,0.0005133846
14,2-0-Bin1-1-0-0-0-2-0-0-0,胡家瑋,25,KSDM,陳嘉瑜,26,PTDM,False,0.000103063,1.436163e-05
17,2-0-Bin1-1-0-0-0-2-0-0-0,莊珮君,21,KSDM,陳嘉瑜,26,PTDM,False,2.842072e-08,8.076443e-05
19,2-0-Bin1-1-0-0-0-2-0-0-0,蔡書瑜,59,KSDM,陳嘉瑜,26,PTDM,False,1.572999e-08,8.079946e-12
20,2-0-Bin1-1-0-0-0-2-0-0-0,陳中和,48,KSDM,陳嘉瑜,26,PTDM,False,0.1996635,0.001130507
21,2-0-Bin1-1-0-0-0-2-2-3-0,吳俞玲,20,KSDM,張嘉芬,36,SLEM,False,0.0007012944,0.0001240274
22,2-0-Bin1-1-0-0-0-2-2-3-0,吳俞玲,20,KSDM,楊博欽,29,CTDM,False,1.0,0.5532491
24,2-0-Bin1-1-0-0-0-2-2-3-0,吳俞玲,20,KSDM,鄭琬薇,22,PTDM,False,0.004469314,0.003859668
25,2-0-Bin1-1-0-0-0-2-2-3-0,張嘉芬,36,SLEM,楊博欽,29,CTDM,False,3.680521e-05,2.015854e-05
