In [None]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings

# --- Plot and warning configuration ------------------------------------------
# Use the Malgun Gothic font (the figure labels in this notebook are Korean) and
# draw minus signs as plain hyphens rather than the Unicode minus glyph.
plt.rcParams.update({'font.family': 'Malgun Gothic', 'axes.unicode_minus': False})
# NOTE(review): this silences every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# --- Input locations ----------------------------------------------------------
# All inputs live in ../final_data relative to the working directory.
BASE_DIR = os.pardir
DATA_DIR = os.path.join(BASE_DIR, 'final_data')

REG_PATH, MET_PATH, TUNE_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        'regression_dataset_final.csv',
        'performance_metrics_final.csv',
        'tuning_results_final.csv',
    )
)

# --- Load the three CSV inputs ------------------------------------------------
print("데이터 로드 중")
df_reg = pd.read_csv(filepath_or_buffer=REG_PATH)
df_met = pd.read_csv(filepath_or_buffer=MET_PATH)
df_tune = pd.read_csv(filepath_or_buffer=TUNE_PATH)

print(f"회귀 데이터셋 크기: {len(df_reg)}")
print(f"성능 메트릭 크기: {len(df_met)}")

# Regressor columns used by the logistic regression, in model-matrix order.
feature_cols = [
    'doc_length',
    'query_length',
    'query_avg_token_len',
    'query_unique_ratio',
    'query_match_count',
    'query_match_ratio',
    'dominant_topic',
    'dominant_prob',
    'search_score',
]

In [None]:
print("\n[기본 통계량]")
# Descriptive statistics for every regressor column plus the 'relevance' column.
print(df_reg.loc[:, [*feature_cols, 'relevance']].describe())

def run_logit_weighted(model_name, subset_df, w_doc=1.0, w_query=1.0, w_topic=1.0):
    """Fit a logistic regression of ``relevance`` on re-scaled feature columns.

    The columns named in the module-level ``feature_cols`` are copied out of
    ``subset_df``. Three groups of columns are multiplied by a scalar weight
    (``search_score`` is left unscaled), an intercept column is added with
    ``sm.add_constant`` and ``sm.Logit`` is fitted with at most 100 iterations.

    NOTE(review): an unpenalised maximum-likelihood logit is invariant to
    multiplying a regressor by a non-zero constant: the coefficient is divided
    by that constant and the likelihood is unchanged. Apart from optimiser
    effects, ``pseudo_r2`` and ``aic`` should therefore not depend on the
    weights; only the reported coefficients and odds ratios do. Confirm that
    this is what the weight experiment is meant to measure.

    Parameters
    ----------
    model_name : str
        Label written to the ``Model`` column and used in messages.
    subset_df : pandas.DataFrame
        Rows to fit on; must contain every column in ``feature_cols`` plus
        ``relevance``.
    w_doc : float
        Multiplier for ``doc_length``.
    w_query : float
        Multiplier for the five ``query_*`` columns.
    w_topic : float
        Multiplier for ``dominant_topic`` and ``dominant_prob``.

    Returns
    -------
    tuple
        ``(df_res, pseudo_r2, aic)``. ``df_res`` has one row per fitted term
        (including ``const``) with the coefficient, its exponential (odds
        ratio), the p-value and the exponentiated confidence bounds.
        ``pseudo_r2`` is ``result.prsquared`` and ``aic`` is ``result.aic``.
        If fitting raises, the error is printed and the function returns an
        empty frame with the same columns together with ``NaN, NaN``, so a
        failed fit can never be mistaken for a real score of 0.
    """
    result_columns = ['Model', 'Feature', 'Coefficient', 'Odds_Ratio',
                      'P_value', 'Lower_CI', 'Upper_CI']
    weighted_groups = (
        (w_doc, ['doc_length']),
        (w_query, ['query_length', 'query_avg_token_len', 'query_unique_ratio',
                   'query_match_count', 'query_match_ratio']),
        (w_topic, ['dominant_topic', 'dominant_prob']),
    )
    max_iterations = 100

    # Work on a copy so the caller's frame is never modified.
    X = subset_df[feature_cols].copy()
    for weight, columns in weighted_groups:
        X[columns] = X[columns] * weight

    y = subset_df['relevance']
    X = sm.add_constant(X)

    try:
        result = sm.Logit(y, X).fit(disp=0, maxiter=max_iterations)

        # The notebook ignores all warnings, so report non-convergence here
        # instead of relying on a convergence warning being shown.
        fit_info = getattr(result, 'mle_retvals', None) or {}
        if not fit_info.get('converged', True):
            print(f"경고: {model_name} 모델이 수렴하지 않음 (maxiter={max_iterations})")

        # Compute the confidence intervals once; columns 0 / 1 are lower / upper.
        conf_bounds = result.conf_int()

        df_res = pd.DataFrame({
            'Model': model_name,
            'Feature': result.params.index,
            'Coefficient': result.params.values,
            'Odds_Ratio': np.exp(result.params.values),
            'P_value': result.pvalues.values,
            'Lower_CI': np.exp(conf_bounds[0].values),
            'Upper_CI': np.exp(conf_bounds[1].values)
        }, columns=result_columns)

        return df_res, result.prsquared, result.aic
    except Exception as e:
        print(f"오류 발생 in {model_name}: {e}")
        # Keep the column layout so downstream filters on 'Feature' / 'Model'
        # still work, and use NaN so a failed fit is never ranked as a result.
        return pd.DataFrame(columns=result_columns), np.nan, np.nan

In [None]:
print("\n[1단계: 기본 회귀 분석 (가중치 1.0, 1.0, 1.0)]")

# One unweighted fit per model subset. The rows tagged 'BM25_Best' in df_reg
# are labelled "BM25" in the regression output.
res_bim, r2_bim, aic_bim = run_logit_weighted(
    "BIM",
    df_reg.loc[df_reg['model'].eq('BIM')],
)
res_bm25, r2_bm25, aic_bm25 = run_logit_weighted(
    "BM25",
    df_reg.loc[df_reg['model'].eq('BM25_Best')],
)

print(f"BIM - Pseudo R2: {r2_bim:.4f}, AIC: {aic_bim:.2f}")
print(f"BM25 - Pseudo R2: {r2_bm25:.4f}, AIC: {aic_bm25:.2f}")

# Stack both coefficient tables and drop the intercept rows.
baseline_results = (
    pd.concat([res_bim, res_bm25], ignore_index=True)
    .loc[lambda stacked: stacked['Feature'].ne('const')]
)

In [None]:
print("\n[2단계: 가중치 튜닝 실험]")

# Candidate weight triples; each is forwarded to run_logit_weighted as
# w_doc / w_query / w_topic.
weight_configs = [
    dict(name='baseline', w_doc=1.0, w_query=1.0, w_topic=1.0),
    dict(name='doc_focus', w_doc=2.0, w_query=1.0, w_topic=0.5),
    dict(name='query_focus', w_doc=0.5, w_query=2.0, w_topic=0.5),
    dict(name='topic_focus', w_doc=0.5, w_query=0.5, w_topic=2.0),
    dict(name='doc_query', w_doc=1.5, w_query=1.5, w_topic=0.5),
]

weight_tuning_results = []

for config in tqdm(weight_configs, desc="가중치 튜닝"):
    # The three weights of this configuration, keyed by keyword-argument name.
    weights = {key: config[key] for key in ('w_doc', 'w_query', 'w_topic')}
    for model_type in ['BIM', 'BM25_Best']:
        subset = df_reg[df_reg['model'] == model_type]
        # Only the fit statistics are kept; the coefficient table is discarded.
        _, r2, aic = run_logit_weighted(f"{model_type}_{config['name']}", subset, **weights)

        weight_tuning_results.append({
            'model': model_type,
            'config': config['name'],
            **weights,
            'pseudo_r2': r2,
            'aic': aic,
        })

df_weight_tuning = pd.DataFrame(weight_tuning_results)
print("\n[가중치 튜닝 결과]")
print(df_weight_tuning.to_string(index=False))

# For each model, the configuration row with the largest pseudo R-squared.
best_config_bim, best_config_bm25 = (
    df_weight_tuning[df_weight_tuning['model'] == model_key].nlargest(1, 'pseudo_r2').iloc[0]
    for model_key in ('BIM', 'BM25_Best')
)

print(f"\nBIM 최적 가중치: doc={best_config_bim['w_doc']}, query={best_config_bim['w_query']}, topic={best_config_bim['w_topic']}")
print(f"BM25 최적 가중치: doc={best_config_bm25['w_doc']}, query={best_config_bm25['w_query']}, topic={best_config_bm25['w_topic']}")

In [None]:
print("\n[3단계: 종합 성능 평가]")


def _mean_metric(metrics, model_name, column):
    """Return the mean of ``column`` over the rows of ``metrics`` whose 'model' equals ``model_name``."""
    return metrics.loc[metrics['model'] == model_name, column].mean()


def _odds_ratio(results, model_label, feature):
    """Return the 'Odds_Ratio' of ``feature`` for ``model_label``, or NaN when no such row exists.

    A regression that failed contributes no rows, so a missing row yields NaN
    instead of an IndexError that would abort the whole cell.
    """
    mask = (results['Model'] == model_label) & (results['Feature'] == feature)
    matches = results.loc[mask, 'Odds_Ratio']
    return matches.iloc[0] if len(matches) else np.nan


# Per-model means of the metric columns in df_met.
map_bim = _mean_metric(df_met, 'BIM', 'AP')
map_bm25 = _mean_metric(df_met, 'BM25_Best', 'AP')

p10_bim = _mean_metric(df_met, 'BIM', 'P@10')
p10_bm25 = _mean_metric(df_met, 'BM25_Best', 'P@10')

r10_bim = _mean_metric(df_met, 'BIM', 'R@10')
r10_bm25 = _mean_metric(df_met, 'BM25_Best', 'R@10')

# Baseline odds ratios of the three features. The regression tables label the
# 'BM25_Best' subset as "BM25".
or_doc_bim = _odds_ratio(baseline_results, 'BIM', 'doc_length')
or_doc_bm25 = _odds_ratio(baseline_results, 'BM25', 'doc_length')

or_query_bim = _odds_ratio(baseline_results, 'BIM', 'query_length')
or_query_bm25 = _odds_ratio(baseline_results, 'BM25', 'query_length')

or_topic_bim = _odds_ratio(baseline_results, 'BIM', 'dominant_topic')
or_topic_bm25 = _odds_ratio(baseline_results, 'BM25', 'dominant_topic')

# One row per model: the metric means, the baseline fit statistics, the
# baseline odds ratios and the weights selected in the tuning step.
comprehensive_results = pd.DataFrame({
    'Model': ['BIM', 'BM25_Best'],
    'MAP': [map_bim, map_bm25],
    'P@10': [p10_bim, p10_bm25],
    'R@10': [r10_bim, r10_bm25],
    'Pseudo_R2': [r2_bim, r2_bm25],
    'AIC': [aic_bim, aic_bm25],
    'doc_length_OR': [or_doc_bim, or_doc_bm25],
    'query_length_OR': [or_query_bim, or_query_bm25],
    'dominant_topic_OR': [or_topic_bim, or_topic_bm25],
    'best_w_doc': [best_config_bim['w_doc'], best_config_bm25['w_doc']],
    'best_w_query': [best_config_bim['w_query'], best_config_bm25['w_query']],
    'best_w_topic': [best_config_bim['w_topic'], best_config_bm25['w_topic']]
})

print("\n[종합 결과 테이블]")
print(comprehensive_results.to_string(index=False))

In [None]:
print("\n[4단계: 결과 저장]")
# Write each result table to DATA_DIR as CSV, without the index column.
for _frame, _filename in (
    (baseline_results, 'odds_ratio_results_final.csv'),
    (df_weight_tuning, 'weight_tuning_results_final.csv'),
    (comprehensive_results, 'comprehensive_results_final.csv'),
):
    _frame.to_csv(os.path.join(DATA_DIR, _filename), index=False)

In [None]:
print("\n[5단계: 시각화]")

def plot_forest_comparison(data, feature_name, title, xlabel):
    """Draw, save and show a forest plot of one feature's odds ratios.

    Rows of ``data`` whose 'Feature' equals ``feature_name`` are plotted one
    per horizontal line: the odds ratio as a marker (blue for model 'BIM', red
    otherwise) with error bars reaching 'Lower_CI' and 'Upper_CI', and a dashed
    reference line at 1.0. The figure is written to
    ``DATA_DIR/<feature_name>_forest_plot_final.png`` at 300 dpi.

    Prints a warning and returns without plotting when no row matches.
    """
    rows = data.loc[data['Feature'] == feature_name]
    if rows.shape[0] == 0:
        print(f"경고: {feature_name}에 대한 데이터 없음")
        return

    fig, ax = plt.subplots(figsize=(10, 3))
    positions = range(len(rows))
    odds = rows['Odds_Ratio']

    # Asymmetric error bars: distance from the point estimate to each CI bound.
    ci_distances = [odds - rows['Lower_CI'], rows['Upper_CI'] - odds]
    marker_colors = ['blue' if label == 'BIM' else 'red' for label in rows['Model']]

    ax.errorbar(odds, positions, xerr=ci_distances, fmt='o', capsize=5, color='black')
    # Coloured markers are drawn above the black error-bar markers.
    ax.scatter(odds, positions, c=marker_colors, s=100, zorder=3)

    # An odds ratio of 1.0 means no effect.
    ax.axvline(x=1.0, color='gray', linestyle='--', linewidth=1)

    ax.set_yticks(positions)
    ax.set_yticklabels(rows['Model'])
    ax.set_xlabel(xlabel)
    ax.set_title(title, fontsize=14, fontweight='bold')

    # Annotate each point with its odds ratio and p-value, just below the marker.
    for pos, (odds_ratio, p_value) in enumerate(zip(odds, rows['P_value'])):
        ax.text(odds_ratio, pos - 0.3, f"OR: {odds_ratio:.4f}\n(p={p_value:.3f})",
                ha='center', va='top', fontsize=9)

    fig.tight_layout()
    fig.savefig(os.path.join(DATA_DIR, f'{feature_name}_forest_plot_final.png'), dpi=300)
    plt.show()

# One entry per forest plot: (progress message, feature, figure title, x-axis label).
forest_plots = [
    ("a1: 문서 길이 영향력 시각화",
     'doc_length',
     'a1: 문서 길이 Odds Ratio (BIM vs BM25)',
     'Odds Ratio (1.0보다 작으면 길이가 길수록 불리함)'),
    ("a2: 쿼리 길이 영향력 시각화",
     'query_length',
     'a2: 쿼리 길이 Odds Ratio (질의어가 길수록 유리한가)',
     'Odds Ratio (1.0보다 크면 쿼리가 길수록 성공 확률 증가)'),
    ("a3: 도메인 토픽 영향력 시각화",
     'dominant_topic',
     'a3: 도미넌트 토픽 Odds Ratio (주제별 검색 성공 확률)',
     'Odds Ratio (영향력)'),
]

for progress_msg, *forest_args in forest_plots:
    print(progress_msg)
    plot_forest_comparison(baseline_results, *forest_args)

# Two panels side by side: the MAP / P@10 columns (left) and the Pseudo_R2 column (right).
fig, (ax_scores, ax_fit) = plt.subplots(1, 2, figsize=(12, 6))
x = np.arange(len(comprehensive_results))
width = 0.35
model_labels = comprehensive_results['Model']

# Left panel: MAP and P@10 as grouped bars, one group per model.
ax_scores.bar(x - width/2, comprehensive_results['MAP'], width, label='MAP', alpha=0.8)
ax_scores.bar(x + width/2, comprehensive_results['P@10'], width, label='P@10', alpha=0.8)
ax_scores.set_xlabel('모델')
ax_scores.set_ylabel('점수')
ax_scores.set_title('검색 성능 비교 (MAP vs P@10)')
ax_scores.set_xticks(x)
ax_scores.set_xticklabels(model_labels)
ax_scores.legend()
ax_scores.grid(axis='y', linestyle=':', alpha=0.7)

# Right panel: pseudo R-squared of the baseline regression per model.
ax_fit.bar(x, comprehensive_results['Pseudo_R2'], alpha=0.8, color=['blue', 'red'])
ax_fit.set_xlabel('모델')
ax_fit.set_ylabel('Pseudo R-squared')
ax_fit.set_title('회귀 모델 설명력')
ax_fit.set_xticks(x)
ax_fit.set_xticklabels(model_labels)
ax_fit.grid(axis='y', linestyle=':', alpha=0.7)

fig.tight_layout()
fig.savefig(os.path.join(DATA_DIR, 'performance_comparison_final.png'), dpi=300)
plt.show()

# Heatmap of pseudo R-squared: one row per weight configuration, one column per model.
fig, ax = plt.subplots(figsize=(10, 6))
pivot_weight = df_weight_tuning.pivot_table(
    values='pseudo_r2',
    index='config',
    columns='model',
)
sns.heatmap(pivot_weight, annot=True, fmt=".4f", cmap="YlGnBu", linewidths=.5, ax=ax)
ax.set_title('가중치 설정별 Pseudo R-squared 히트맵')
ax.set_xlabel('모델')
ax.set_ylabel('가중치 설정')
fig.tight_layout()
fig.savefig(os.path.join(DATA_DIR, 'weight_tuning_heatmap_final.png'), dpi=300)
plt.show()

# Grouped bar chart of the baseline odds ratios for the three features of interest.
important_features = baseline_results.loc[
    baseline_results['Feature'].isin(['doc_length', 'query_length', 'dominant_topic'])
]
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=important_features,
    x='Feature',
    y='Odds_Ratio',
    hue='Model',
    palette=['#4c72b0', '#c44e52'],
    ax=ax,
)
# Reference line at an odds ratio of 1.0.
ax.axhline(y=1.0, color='black', linestyle='--', linewidth=1)
ax.set_title('주요 변수별 Odds Ratio 비교 (a1, a2, a3)')
ax.set_xlabel('변수')
ax.set_ylabel('Odds Ratio')
ax.legend(title='모델')
ax.grid(axis='y', linestyle=':', alpha=0.7)
fig.tight_layout()
fig.savefig(os.path.join(DATA_DIR, 'main_features_comparison_final.png'), dpi=300)
plt.show()

In [None]:
# Closing banner listing the artefacts written by the cells above.
print("\n".join([
    "\n작업 완료",
    "저장된 파일:",
    "  - odds_ratio_results_final.csv",
    "  - weight_tuning_results_final.csv",
    "  - comprehensive_results_final.csv",
    "  - 시각화 PNG 파일들",
]))