In [2]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings

plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
warnings.filterwarnings('ignore')

BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'data_final')

REG_PATH = os.path.join(DATA_DIR, 'regression_dataset_final.csv')
MET_PATH = os.path.join(DATA_DIR, 'performance_metrics_final.csv')
TUNE_PATH = os.path.join(DATA_DIR, 'tuning_extended_final.csv')

print("데이터 로드 중")
df_reg = pd.read_csv(REG_PATH)
df_met = pd.read_csv(MET_PATH)
df_tune = pd.read_csv(TUNE_PATH)

print(f"회귀 데이터셋 크기: {len(df_reg)}")
print(f"성능 메트릭 크기: {len(df_met)}")

feature_cols = ['doc_length', 'query_length', 'query_avg_token_len',
                'query_unique_ratio', 'query_match_count', 'query_match_ratio',
                'dominant_topic', 'dominant_prob', 'search_score']

데이터 로드 중
회귀 데이터셋 크기: 33018
성능 메트릭 크기: 2908


In [3]:
print("\n[기본 통계량]")
print(df_reg[feature_cols + ['relevance']].describe())

def run_logit_weighted(model_name, subset_df, w_doc=1.0, w_query=1.0, w_topic=1.0):
    X = subset_df[feature_cols].copy()

    X['doc_length'] *= w_doc
    X['query_length'] *= w_query
    X['query_avg_token_len'] *= w_query
    X['query_unique_ratio'] *= w_query
    X['query_match_count'] *= w_query
    X['query_match_ratio'] *= w_query
    X['dominant_topic'] *= w_topic
    X['dominant_prob'] *= w_topic

    y = subset_df['relevance']
    X = sm.add_constant(X)

    try:
        model = sm.Logit(y, X)
        result = model.fit(disp=0, maxiter=100)

        df_res = pd.DataFrame({
            'Model': model_name,
            'Feature': result.params.index,
            'Coefficient': result.params.values,
            'Odds_Ratio': np.exp(result.params.values),
            'P_value': result.pvalues.values,
            'Lower_CI': np.exp(result.conf_int()[0].values),
            'Upper_CI': np.exp(result.conf_int()[1].values)
        })

        pseudo_r2 = result.prsquared
        aic = result.aic

        return df_res, pseudo_r2, aic
    except Exception as e:
        print(f"오류 발생 in {model_name}: {e}")
        return pd.DataFrame(), 0, 0


[기본 통계량]
          doc_length  query_length  query_avg_token_len  query_unique_ratio  \
count   33018.000000  33018.000000         33018.000000        33018.000000   
mean    17045.838997     13.782361             2.090508            0.965778   
std     19572.926121      7.231495             0.456305            0.054532   
min       272.000000      1.000000             1.000000            0.611111   
25%      2883.000000      8.000000             1.800000            0.937500   
50%      9929.000000     13.000000             2.000000            1.000000   
75%     24135.000000     18.000000             2.230769            1.000000   
max    102419.000000     39.000000             5.500000            1.000000   

       query_match_count  query_match_ratio  dominant_topic  dominant_prob  \
count       33018.000000       33018.000000    33018.000000   33018.000000   
mean            8.711036           0.637061        3.818614       0.707661   
std             5.486535           0.207237 

In [4]:
print("\n[1단계: 기본 회귀 분석 (가중치 1.0, 1.0, 1.0)]")
res_bim, r2_bim, aic_bim = run_logit_weighted("BIM", df_reg[df_reg['model'] == 'BIM'])
res_bm25, r2_bm25, aic_bm25 = run_logit_weighted("BM25", df_reg[df_reg['model'] == 'BM25_Best'])

print(f"BIM - Pseudo R2: {r2_bim:.4f}, AIC: {aic_bim:.2f}")
print(f"BM25 - Pseudo R2: {r2_bm25:.4f}, AIC: {aic_bm25:.2f}")

baseline_results = pd.concat([res_bim, res_bm25], ignore_index=True)
baseline_results = baseline_results[baseline_results['Feature'] != 'const']


[1단계: 기본 회귀 분석 (가중치 1.0, 1.0, 1.0)]
BIM - Pseudo R2: 0.4442, AIC: 11640.70
BM25 - Pseudo R2: 0.1708, AIC: 16457.92


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score

print("\n[2단계: 가중치 튜닝 실험 - 수정]")

feature_groups = {
    'doc': ['doc_length'],
    'query': ['query_length', 'query_avg_token_len', 'query_unique_ratio',
              'query_match_count', 'query_match_ratio'],
    'topic': ['dominant_topic', 'dominant_prob'],
    'score': ['search_score']
}

weight_configs = [
    {'name': 'baseline', 'w_doc': 1.0, 'w_query': 1.0, 'w_topic': 1.0, 'w_score': 1.0},

    {'name': 'doc_strong', 'w_doc': 3.0, 'w_query': 1.0, 'w_topic': 0.5, 'w_score': 1.0},
    {'name': 'doc_focus', 'w_doc': 2.0, 'w_query': 1.0, 'w_topic': 0.5, 'w_score': 1.0},
    {'name': 'doc_moderate', 'w_doc': 1.5, 'w_query': 1.0, 'w_topic': 0.8, 'w_score': 1.0},

    {'name': 'query_strong', 'w_doc': 0.5, 'w_query': 3.0, 'w_topic': 0.5, 'w_score': 1.0},
    {'name': 'query_focus', 'w_doc': 0.5, 'w_query': 2.0, 'w_topic': 0.5, 'w_score': 1.0},
    {'name': 'query_moderate', 'w_doc': 0.8, 'w_query': 1.5, 'w_topic': 0.8, 'w_score': 1.0},

    {'name': 'topic_strong', 'w_doc': 0.5, 'w_query': 0.5, 'w_topic': 3.0, 'w_score': 1.0},
    {'name': 'topic_focus', 'w_doc': 0.5, 'w_query': 0.5, 'w_topic': 2.0, 'w_score': 1.0},
    {'name': 'topic_moderate', 'w_doc': 0.8, 'w_query': 0.8, 'w_topic': 1.5, 'w_score': 1.0},

    {'name': 'score_strong', 'w_doc': 0.3, 'w_query': 0.3, 'w_topic': 0.3, 'w_score': 3.0},
    {'name': 'score_focus', 'w_doc': 0.5, 'w_query': 0.5, 'w_topic': 0.5, 'w_score': 2.0},

    {'name': 'doc_query_strong', 'w_doc': 2.0, 'w_query': 2.0, 'w_topic': 0.5, 'w_score': 1.0},
    {'name': 'doc_query', 'w_doc': 1.5, 'w_query': 1.5, 'w_topic': 0.5, 'w_score': 1.0},

    {'name': 'doc_topic', 'w_doc': 1.5, 'w_query': 0.5, 'w_topic': 1.5, 'w_score': 1.0},
    {'name': 'query_topic', 'w_doc': 0.5, 'w_query': 1.5, 'w_topic': 1.5, 'w_score': 1.0},

    {'name': 'all_boost', 'w_doc': 1.5, 'w_query': 1.5, 'w_topic': 1.5, 'w_score': 1.5},
]

print(f"총 {len(weight_configs)}개 가중치 조합 테스트")

def apply_feature_weights(X, weights):
    X_weighted = X.copy()
    for group, cols in feature_groups.items():
        weight_key = f'w_{group}'
        if weight_key in weights:
            for col in cols:
                if col in X_weighted.columns:
                    X_weighted[col] = X_weighted[col] * weights[weight_key]
    return X_weighted

weight_tuning_results = []

for config in tqdm(weight_configs, desc="가중치 튜닝"):
    for model_type in ['BIM', 'BM25_Best']:
        subset = df_reg[df_reg['model'] == model_type].copy()

        X = subset[feature_cols].copy()
        y = subset['relevance']

        X_weighted = apply_feature_weights(X, config)

        model = LogisticRegression(max_iter=200, random_state=42, solver='lbfgs')
        model.fit(X_weighted, y)

        y_pred_proba = model.predict_proba(X_weighted)[:, 1]

        ll = log_loss(y, y_pred_proba)

        try:
            auc = roc_auc_score(y, y_pred_proba)
        except:
            auc = 0.0

        y_pred = model.predict(X_weighted)
        acc = accuracy_score(y, y_pred)

        weight_tuning_results.append({
            'model': model_type,
            'config': config['name'],
            'w_doc': config['w_doc'],
            'w_query': config['w_query'],
            'w_topic': config['w_topic'],
            'w_score': config.get('w_score', 1.0),
            'log_loss': ll,
            'auc': auc,
            'accuracy': acc
        })

df_weight_tuning = pd.DataFrame(weight_tuning_results)
print("\n[가중치 튜닝 결과 - 상위 10개 (AUC 기준)]")
print(df_weight_tuning.nlargest(10, 'auc')[['model', 'config', 'w_doc', 'w_query', 'w_topic', 'w_score', 'auc', 'accuracy']].to_string(index=False))

print("\n[BIM 최적 설정 (AUC 기준)]")
best_bim = df_weight_tuning[df_weight_tuning['model'] == 'BIM'].nlargest(1, 'auc').iloc[0]
print(f"설정: {best_bim['config']}")
print(f"  w_doc={best_bim['w_doc']}, w_query={best_bim['w_query']}, w_topic={best_bim['w_topic']}, w_score={best_bim['w_score']}")
print(f"  AUC={best_bim['auc']:.4f}, Accuracy={best_bim['accuracy']:.4f}, Log Loss={best_bim['log_loss']:.4f}")

print("\n[BM25 최적 설정 (AUC 기준)]")
best_bm25 = df_weight_tuning[df_weight_tuning['model'] == 'BM25_Best'].nlargest(1, 'auc').iloc[0]
print(f"설정: {best_bm25['config']}")
print(f"  w_doc={best_bm25['w_doc']}, w_query={best_bm25['w_query']}, w_topic={best_bm25['w_topic']}, w_score={best_bm25['w_score']}")
print(f"  AUC={best_bm25['auc']:.4f}, Accuracy={best_bm25['accuracy']:.4f}, Log Loss={best_bm25['log_loss']:.4f}")


[2단계: 가중치 튜닝 실험 - 수정]
총 17개 가중치 조합 테스트


가중치 튜닝: 100%|██████████| 17/17 [00:05<00:00,  3.13it/s]


[가중치 튜닝 결과 - 상위 10개 (AUC 기준)]
model         config  w_doc  w_query  w_topic  w_score      auc  accuracy
  BIM    query_focus    0.5      2.0      0.5      1.0 0.915028  0.865634
  BIM    score_focus    0.5      0.5      0.5      2.0 0.914950  0.864417
  BIM    query_topic    0.5      1.5      1.5      1.0 0.914642  0.864533
  BIM topic_moderate    0.8      0.8      1.5      1.0 0.914363  0.865402
  BIM   query_strong    0.5      3.0      0.5      1.0 0.914346  0.865750
  BIM    topic_focus    0.5      0.5      2.0      1.0 0.913967  0.864069
  BIM      doc_query    1.5      1.5      0.5      1.0 0.913723  0.866446
  BIM   topic_strong    0.5      0.5      3.0      1.0 0.913708  0.864649
  BIM   score_strong    0.3      0.3      0.3      3.0 0.913555  0.865113
  BIM      all_boost    1.5      1.5      1.5      1.5 0.913521  0.865170

[BIM 최적 설정 (AUC 기준)]
설정: query_focus
  w_doc=0.5, w_query=2.0, w_topic=0.5, w_score=1.0
  AUC=0.9150, Accuracy=0.8656, Log Loss=0.3373

[BM25 최적 설정 (AUC 기




In [8]:
print("\n[3단계: 종합 성능 평가]")
map_bim = df_met[df_met['model'] == 'BIM']['AP'].mean()
map_bm25 = df_met[df_met['model'] == 'BM25_Best']['AP'].mean()

p10_bim = df_met[df_met['model'] == 'BIM']['P@10'].mean()
p10_bm25 = df_met[df_met['model'] == 'BM25_Best']['P@10'].mean()

r10_bim = df_met[df_met['model'] == 'BIM']['R@10'].mean()
r10_bm25 = df_met[df_met['model'] == 'BM25_Best']['R@10'].mean()

or_doc_bim = baseline_results[(baseline_results['Model'] == 'BIM') &
                              (baseline_results['Feature'] == 'doc_length')]['Odds_Ratio'].values[0]
or_doc_bm25 = baseline_results[(baseline_results['Model'] == 'BM25') &
                               (baseline_results['Feature'] == 'doc_length')]['Odds_Ratio'].values[0]

or_query_bim = baseline_results[(baseline_results['Model'] == 'BIM') &
                                (baseline_results['Feature'] == 'query_length')]['Odds_Ratio'].values[0]
or_query_bm25 = baseline_results[(baseline_results['Model'] == 'BM25') &
                                 (baseline_results['Feature'] == 'query_length')]['Odds_Ratio'].values[0]

or_topic_bim = baseline_results[(baseline_results['Model'] == 'BIM') &
                                (baseline_results['Feature'] == 'dominant_topic')]['Odds_Ratio'].values[0]
or_topic_bm25 = baseline_results[(baseline_results['Model'] == 'BM25') &
                                 (baseline_results['Feature'] == 'dominant_topic')]['Odds_Ratio'].values[0]

best_config_bim = df_weight_tuning[df_weight_tuning['model'] == 'BIM'].nlargest(1, 'auc').iloc[0]
best_config_bm25 = df_weight_tuning[df_weight_tuning['model'] == 'BM25_Best'].nlargest(1, 'auc').iloc[0]

comprehensive_results = pd.DataFrame({
    'Model': ['BIM', 'BM25_Best'],
    'MAP': [map_bim, map_bm25],
    'P@10': [p10_bim, p10_bm25],
    'R@10': [r10_bim, r10_bm25],
    'Pseudo_R2': [r2_bim, r2_bm25],
    'AIC': [aic_bim, aic_bm25],
    'best_AUC': [best_config_bim['auc'], best_config_bm25['auc']],
    'doc_length_OR': [or_doc_bim, or_doc_bm25],
    'query_length_OR': [or_query_bim, or_query_bm25],
    'dominant_topic_OR': [or_topic_bim, or_topic_bm25],
    'best_w_doc': [best_config_bim['w_doc'], best_config_bm25['w_doc']],
    'best_w_query': [best_config_bim['w_query'], best_config_bm25['w_query']],
    'best_w_topic': [best_config_bim['w_topic'], best_config_bm25['w_topic']],
    'best_w_score': [best_config_bim['w_score'], best_config_bm25['w_score']],
    'best_config': [best_config_bim['config'], best_config_bm25['config']]
})

print("\n[종합 결과 테이블]")
print(comprehensive_results.to_string(index=False))


[3단계: 종합 성능 평가]

[종합 결과 테이블]
    Model      MAP     P@10     R@10  Pseudo_R2          AIC  best_AUC  doc_length_OR  query_length_OR  dominant_topic_OR  best_w_doc  best_w_query  best_w_topic  best_w_score best_config
      BIM 0.356822 0.163411 0.461161   0.444176 11640.697645  0.915028       0.999855         1.081038           1.008743         0.5           2.0           0.5           1.0 query_focus
BM25_Best 0.623864 0.264512 0.663011   0.170845 16457.918606  0.741106       1.000008         0.930547           0.996718         0.5           1.5           1.5           1.0 query_topic


In [None]:
print("\n[4단계: 결과 저장]")
baseline_results.to_csv(os.path.join(DATA_DIR, 'odds_ratio_results_final.csv'), index=False)
df_weight_tuning.to_csv(os.path.join(DATA_DIR, 'weight_tuning_results_final.csv'), index=False)
comprehensive_results.to_csv(os.path.join(DATA_DIR, 'comprehensive_results_final.csv'), index=False)

In [None]:
print("\n[5단계: 시각화]")

def plot_forest_comparison(data, feature_name, title, xlabel):
    subset = data[data['Feature'] == feature_name]
    if len(subset) == 0:
        print(f"경고: {feature_name}에 대한 데이터 없음")
        return

    plt.figure(figsize=(10, 3))
    y_pos = range(len(subset))

    err = [subset['Odds_Ratio'] - subset['Lower_CI'], subset['Upper_CI'] - subset['Odds_Ratio']]

    colors = ['blue' if m == 'BIM' else 'red' for m in subset['Model']]

    plt.errorbar(subset['Odds_Ratio'], y_pos, xerr=err, fmt='o', capsize=5, color='black')
    plt.scatter(subset['Odds_Ratio'], y_pos, c=colors, s=100, zorder=3)

    plt.axvline(x=1.0, color='gray', linestyle='--', linewidth=1)

    plt.yticks(y_pos, subset['Model'])
    plt.xlabel(xlabel)
    plt.title(title, fontsize=14, fontweight='bold')

    for i, (_, row) in enumerate(subset.iterrows()):
        plt.text(row['Odds_Ratio'], i - 0.3, f"OR: {row['Odds_Ratio']:.4f}\n(p={row['P_value']:.3f})",
                 ha='center', va='top', fontsize=9)

    plt.tight_layout()
    plt.savefig(os.path.join(DATA_DIR, f'{feature_name}_forest_plot_final.png'), dpi=300)
    plt.show()

print("a1: 문서 길이 영향력 시각화")
plot_forest_comparison(baseline_results, 'doc_length',
                       'a1: 문서 길이 Odds Ratio (BIM vs BM25)',
                       'Odds Ratio (1.0보다 작으면 길이가 길수록 불리함)')

print("a2: 쿼리 길이 영향력 시각화")
plot_forest_comparison(baseline_results, 'query_length',
                       'a2: 쿼리 길이 Odds Ratio (질의어가 길수록 유리한가)',
                       'Odds Ratio (1.0보다 크면 쿼리가 길수록 성공 확률 증가)')

print("a3: 도메인 토픽 영향력 시각화")
plot_forest_comparison(baseline_results, 'dominant_topic',
                       'a3: 도미넌트 토픽 Odds Ratio (주제별 검색 성공 확률)',
                       'Odds Ratio (영향력)')

plt.figure(figsize=(12, 6))
x = np.arange(len(comprehensive_results))
width = 0.35

plt.subplot(1, 2, 1)
plt.bar(x - width/2, comprehensive_results['MAP'], width, label='MAP', alpha=0.8)
plt.bar(x + width/2, comprehensive_results['P@10'], width, label='P@10', alpha=0.8)
plt.xlabel('모델')
plt.ylabel('점수')
plt.title('검색 성능 비교 (MAP vs P@10)')
plt.xticks(x, comprehensive_results['Model'])
plt.legend()
plt.grid(axis='y', linestyle=':', alpha=0.7)

plt.subplot(1, 2, 2)
plt.bar(x, comprehensive_results['Pseudo_R2'], alpha=0.8, color=['blue', 'red'])
plt.xlabel('모델')
plt.ylabel('Pseudo R-squared')
plt.title('회귀 모델 설명력')
plt.xticks(x, comprehensive_results['Model'])
plt.grid(axis='y', linestyle=':', alpha=0.7)

plt.tight_layout()
plt.savefig(os.path.join(DATA_DIR, 'performance_comparison_final.png'), dpi=300)
plt.show()

plt.figure(figsize=(10, 6))
pivot_weight = df_weight_tuning.pivot_table(
    index='config',
    columns='model',
    values='pseudo_r2'
)
sns.heatmap(pivot_weight, annot=True, fmt=".4f", cmap="YlGnBu", linewidths=.5)
plt.title('가중치 설정별 Pseudo R-squared 히트맵')
plt.xlabel('모델')
plt.ylabel('가중치 설정')
plt.tight_layout()
plt.savefig(os.path.join(DATA_DIR, 'weight_tuning_heatmap_final.png'), dpi=300)
plt.show()

important_features = baseline_results[baseline_results['Feature'].isin(['doc_length', 'query_length', 'dominant_topic'])]
plt.figure(figsize=(10, 6))
sns.barplot(x='Feature', y='Odds_Ratio', hue='Model', data=important_features, palette=['#4c72b0', '#c44e52'])
plt.axhline(y=1.0, color='black', linestyle='--', linewidth=1)
plt.title('주요 변수별 Odds Ratio 비교 (a1, a2, a3)')
plt.xlabel('변수')
plt.ylabel('Odds Ratio')
plt.legend(title='모델')
plt.grid(axis='y', linestyle=':', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(DATA_DIR, 'main_features_comparison_final.png'), dpi=300)
plt.show()

In [None]:
print("\n작업 완료")
print(f"저장된 파일:")
print(f"  - odds_ratio_results_final.csv")
print(f"  - weight_tuning_results_final.csv")
print(f"  - comprehensive_results_final.csv")
print(f"  - 시각화 PNG 파일들")

---

In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings

plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
warnings.filterwarnings('ignore')

BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'data_final')

REG_PATH = os.path.join(DATA_DIR, 'regression_dataset_final.csv')
MET_PATH = os.path.join(DATA_DIR, 'performance_metrics_final.csv')
TUNE_PATH = os.path.join(DATA_DIR, 'tuning_extended_final.csv')

print("데이터 로드")
df_reg = pd.read_csv(REG_PATH)
df_met = pd.read_csv(MET_PATH)
df_tune = pd.read_csv(TUNE_PATH)

print(f"회귀 데이터셋 크기: {len(df_reg)}")
print(f"성능 메트릭 크기: {len(df_met)}")

feature_cols = ['doc_length', 'query_length', 'query_avg_token_len',
                'query_unique_ratio', 'query_match_count', 'query_match_ratio',
                'dominant_topic', 'dominant_prob']

데이터 로드
회귀 데이터셋 크기: 33016
성능 메트릭 크기: 2908


In [2]:
print("\n[기본 통계량]")
print(df_reg[feature_cols + ['relevance']].describe())

def run_logit_weighted(model_name, subset_df, w_doc=1.0, w_query=1.0, w_topic=1.0):
    X = subset_df[feature_cols].copy()

    X['doc_length'] *= w_doc
    X['query_length'] *= w_query
    X['query_avg_token_len'] *= w_query
    X['query_unique_ratio'] *= w_query
    X['query_match_count'] *= w_query
    X['query_match_ratio'] *= w_query
    X['dominant_topic'] *= w_topic
    X['dominant_prob'] *= w_topic

    y = subset_df['relevance']
    X = sm.add_constant(X)

    try:
        model = sm.Logit(y, X)
        result = model.fit(disp=0, maxiter=100)

        df_res = pd.DataFrame({
            'Model': model_name,
            'Feature': result.params.index,
            'Coefficient': result.params.values,
            'Odds_Ratio': np.exp(result.params.values),
            'P_value': result.pvalues.values,
            'Lower_CI': np.exp(result.conf_int()[0].values),
            'Upper_CI': np.exp(result.conf_int()[1].values)
        })

        pseudo_r2 = result.prsquared
        aic = result.aic

        return df_res, pseudo_r2, aic
    except Exception as e:
        print(f"오류 발생 in {model_name}: {e}")
        return pd.DataFrame(), 0, 0


[기본 통계량]
          doc_length  query_length  query_avg_token_len  query_unique_ratio  \
count   33016.000000  33016.000000         33016.000000        33016.000000   
mean    17010.184062     13.781500             2.090540            0.965777   
std     19579.955658      7.231487             0.456304            0.054532   
min       272.000000      1.000000             1.000000            0.611111   
25%      2856.250000      8.000000             1.800000            0.937500   
50%      9887.000000     13.000000             2.000000            1.000000   
75%     24092.000000     18.000000             2.230769            1.000000   
max    102419.000000     39.000000             5.500000            1.000000   

       query_match_count  query_match_ratio  dominant_topic  dominant_prob  \
count       33016.000000       33016.000000    33016.000000   33016.000000   
mean            8.695693           0.635985        3.820935       0.707676   
std             5.487826           0.207944 

In [3]:
print("\n[1단계: 기본 회귀 분석 (가중치 1.0, 1.0, 1.0)]")
res_bim, r2_bim, aic_bim = run_logit_weighted("BIM", df_reg[df_reg['model'] == 'BIM'])
res_bm25, r2_bm25, aic_bm25 = run_logit_weighted("BM25", df_reg[df_reg['model'] == 'BM25_Best'])

print(f"BIM - Pseudo R2: {r2_bim:.4f}, AIC: {aic_bim:.2f}")
print(f"BM25 - Pseudo R2: {r2_bm25:.4f}, AIC: {aic_bm25:.2f}")

baseline_results = pd.concat([res_bim, res_bm25], ignore_index=True)
baseline_results = baseline_results[baseline_results['Feature'] != 'const']

print("\n[주요 변수 Odds Ratio]")
for feature in ['doc_length', 'query_length', 'dominant_topic']:
    print(f"\n{feature}:")
    for model in ['BIM', 'BM25']:
        row = baseline_results[(baseline_results['Model'] == model) &
                               (baseline_results['Feature'] == feature)]
        if len(row) > 0:
            or_val = row['Odds_Ratio'].values[0]
            p_val = row['P_value'].values[0]
            print(f"  {model}: OR={or_val:.6f}, p={p_val:.4f}")


[1단계: 기본 회귀 분석 (가중치 1.0, 1.0, 1.0)]
BIM - Pseudo R2: 0.4343, AIC: 11844.26
BM25 - Pseudo R2: 0.0437, AIC: 18975.00

[주요 변수 Odds Ratio]

doc_length:
  BIM: OR=0.999850, p=0.0000
  BM25: OR=0.999975, p=0.0000

query_length:
  BIM: OR=1.097917, p=0.0000
  BM25: OR=0.982297, p=0.0233

dominant_topic:
  BIM: OR=1.007168, p=0.4508
  BM25: OR=0.987435, p=0.0823


In [4]:
print("\n[2단계: 가중치 튜닝 실험]")

feature_groups = {
    'doc': ['doc_length'],
    'query': ['query_length', 'query_avg_token_len', 'query_unique_ratio',
              'query_match_count', 'query_match_ratio'],
    'topic': ['dominant_topic', 'dominant_prob']
}

weight_configs = [
    {'name': 'baseline', 'w_doc': 1.0, 'w_query': 1.0, 'w_topic': 1.0},

    {'name': 'doc_strong', 'w_doc': 3.0, 'w_query': 1.0, 'w_topic': 0.5},
    {'name': 'doc_focus', 'w_doc': 2.0, 'w_query': 1.0, 'w_topic': 0.5},
    {'name': 'doc_moderate', 'w_doc': 1.5, 'w_query': 1.0, 'w_topic': 0.8},

    {'name': 'query_strong', 'w_doc': 0.5, 'w_query': 3.0, 'w_topic': 0.5},
    {'name': 'query_focus', 'w_doc': 0.5, 'w_query': 2.0, 'w_topic': 0.5},
    {'name': 'query_moderate', 'w_doc': 0.8, 'w_query': 1.5, 'w_topic': 0.8},

    {'name': 'topic_strong', 'w_doc': 0.5, 'w_query': 0.5, 'w_topic': 3.0},
    {'name': 'topic_focus', 'w_doc': 0.5, 'w_query': 0.5, 'w_topic': 2.0},
    {'name': 'topic_moderate', 'w_doc': 0.8, 'w_query': 0.8, 'w_topic': 1.5},

    {'name': 'doc_query_strong', 'w_doc': 2.0, 'w_query': 2.0, 'w_topic': 0.5},
    {'name': 'doc_query', 'w_doc': 1.5, 'w_query': 1.5, 'w_topic': 0.5},

    {'name': 'doc_topic', 'w_doc': 1.5, 'w_query': 0.5, 'w_topic': 1.5},
    {'name': 'query_topic', 'w_doc': 0.5, 'w_query': 1.5, 'w_topic': 1.5},

    {'name': 'all_boost', 'w_doc': 1.5, 'w_query': 1.5, 'w_topic': 1.5},
]

print(f"총 {len(weight_configs)}개 가중치 조합 테스트")

def apply_feature_weights(X, weights):
    X_weighted = X.copy()
    for group, cols in feature_groups.items():
        weight_key = f'w_{group}'
        if weight_key in weights:
            for col in cols:
                if col in X_weighted.columns:
                    X_weighted[col] = X_weighted[col] * weights[weight_key]
    return X_weighted

weight_tuning_results = []

for config in tqdm(weight_configs, desc="가중치 튜닝"):
    for model_type in ['BIM', 'BM25_Best']:
        subset = df_reg[df_reg['model'] == model_type].copy()

        X = subset[feature_cols].copy()
        y = subset['relevance']

        X_weighted = apply_feature_weights(X, config)

        model = LogisticRegression(max_iter=200, random_state=42, solver='lbfgs')
        model.fit(X_weighted, y)

        y_pred_proba = model.predict_proba(X_weighted)[:, 1]

        ll = log_loss(y, y_pred_proba)

        try:
            auc = roc_auc_score(y, y_pred_proba)
        except:
            auc = 0.0

        y_pred = model.predict(X_weighted)
        acc = accuracy_score(y, y_pred)

        weight_tuning_results.append({
            'model': model_type,
            'config': config['name'],
            'w_doc': config['w_doc'],
            'w_query': config['w_query'],
            'w_topic': config['w_topic'],
            'log_loss': ll,
            'auc': auc,
            'accuracy': acc
        })

df_weight_tuning = pd.DataFrame(weight_tuning_results)
print("\n[가중치 튜닝 결과 - 상위 10개 (AUC 기준)]")
print(df_weight_tuning.nlargest(10, 'auc')[['model', 'config', 'w_doc', 'w_query', 'w_topic', 'auc', 'accuracy']].to_string(index=False))

best_config_bim = df_weight_tuning[df_weight_tuning['model'] == 'BIM'].nlargest(1, 'auc').iloc[0]
best_config_bm25 = df_weight_tuning[df_weight_tuning['model'] == 'BM25_Best'].nlargest(1, 'auc').iloc[0]

print(f"\nBIM 최적 가중치: doc={best_config_bim['w_doc']}, query={best_config_bim['w_query']}, topic={best_config_bim['w_topic']}")
print(f"BM25 최적 가중치: doc={best_config_bm25['w_doc']}, query={best_config_bm25['w_query']}, topic={best_config_bm25['w_topic']}")


[2단계: 가중치 튜닝 실험]
총 15개 가중치 조합 테스트


가중치 튜닝: 100%|██████████| 15/15 [00:04<00:00,  3.33it/s]


[가중치 튜닝 결과 - 상위 10개 (AUC 기준)]
model         config  w_doc  w_query  w_topic      auc  accuracy
  BIM query_moderate    0.8      1.5      0.8 0.911978  0.864359
  BIM   topic_strong    0.5      0.5      3.0 0.911887  0.863953
  BIM      all_boost    1.5      1.5      1.5 0.911835  0.862851
  BIM       baseline    1.0      1.0      1.0 0.911830  0.862909
  BIM   doc_moderate    1.5      1.0      0.8 0.911830  0.863605
  BIM    query_topic    0.5      1.5      1.5 0.911605  0.865228
  BIM    topic_focus    0.5      0.5      2.0 0.911600  0.863605
  BIM   query_strong    0.5      3.0      0.5 0.911593  0.864765
  BIM      doc_focus    2.0      1.0      0.5 0.911569  0.864243
  BIM     doc_strong    3.0      1.0      0.5 0.911469  0.864533

BIM 최적 가중치: doc=0.8, query=1.5, topic=0.8
BM25 최적 가중치: doc=0.5, query=2.0, topic=0.5





In [5]:
print("\n[3단계: 종합 성능 평가]")
map_bim = df_met[df_met['model'] == 'BIM']['AP'].mean()
map_bm25 = df_met[df_met['model'] == 'BM25_Best']['AP'].mean()

p10_bim = df_met[df_met['model'] == 'BIM']['P@10'].mean()
p10_bm25 = df_met[df_met['model'] == 'BM25_Best']['P@10'].mean()

r10_bim = df_met[df_met['model'] == 'BIM']['R@10'].mean()
r10_bm25 = df_met[df_met['model'] == 'BM25_Best']['R@10'].mean()

or_doc_bim = baseline_results[(baseline_results['Model'] == 'BIM') &
                              (baseline_results['Feature'] == 'doc_length')]['Odds_Ratio'].values[0]
or_doc_bm25 = baseline_results[(baseline_results['Model'] == 'BM25') &
                               (baseline_results['Feature'] == 'doc_length')]['Odds_Ratio'].values[0]

or_query_bim = baseline_results[(baseline_results['Model'] == 'BIM') &
                                (baseline_results['Feature'] == 'query_length')]['Odds_Ratio'].values[0]
or_query_bm25 = baseline_results[(baseline_results['Model'] == 'BM25') &
                                 (baseline_results['Feature'] == 'query_length')]['Odds_Ratio'].values[0]

or_topic_bim = baseline_results[(baseline_results['Model'] == 'BIM') &
                                (baseline_results['Feature'] == 'dominant_topic')]['Odds_Ratio'].values[0]
or_topic_bm25 = baseline_results[(baseline_results['Model'] == 'BM25') &
                                 (baseline_results['Feature'] == 'dominant_topic')]['Odds_Ratio'].values[0]

comprehensive_results = pd.DataFrame({
    'Model': ['BIM', 'BM25_Best'],
    'MAP': [map_bim, map_bm25],
    'P@10': [p10_bim, p10_bm25],
    'R@10': [r10_bim, r10_bm25],
    'Pseudo_R2': [r2_bim, r2_bm25],
    'AIC': [aic_bim, aic_bm25],
    'best_AUC': [best_config_bim['auc'], best_config_bm25['auc']],
    'doc_length_OR': [or_doc_bim, or_doc_bm25],
    'query_length_OR': [or_query_bim, or_query_bm25],
    'dominant_topic_OR': [or_topic_bim, or_topic_bm25],
    'best_w_doc': [best_config_bim['w_doc'], best_config_bm25['w_doc']],
    'best_w_query': [best_config_bim['w_query'], best_config_bm25['w_query']],
    'best_w_topic': [best_config_bim['w_topic'], best_config_bm25['w_topic']],
    'best_config': [best_config_bim['config'], best_config_bm25['config']]
})

print("\n[종합 결과 테이블]")
print(comprehensive_results.to_string(index=False))


[3단계: 종합 성능 평가]

[종합 결과 테이블]
    Model      MAP     P@10     R@10  Pseudo_R2          AIC  best_AUC  doc_length_OR  query_length_OR  dominant_topic_OR  best_w_doc  best_w_query  best_w_topic    best_config
      BIM 0.356822 0.163411 0.461161   0.434344 11844.260894  0.911978       0.999850         1.097917           1.007168         0.8           1.5           0.8 query_moderate
BM25_Best 0.624658 0.264649 0.663221   0.043703 18974.997944  0.636501       0.999975         0.982297           0.987435         0.5           2.0           0.5    query_focus


In [None]:
print("\n[4단계: 결과 저장]")
baseline_results.to_csv(os.path.join(DATA_DIR, 'odds_ratio_results_final.csv'), index=False)
df_weight_tuning.to_csv(os.path.join(DATA_DIR, 'weight_tuning_results_final.csv'), index=False)
comprehensive_results.to_csv(os.path.join(DATA_DIR, 'comprehensive_results_final.csv'), index=False)

In [None]:
print("\n[5단계: 시각화]")

def plot_forest_comparison(data, feature_name, title, xlabel):
    subset = data[data['Feature'] == feature_name]
    if len(subset) == 0:
        print(f"경고: {feature_name}에 대한 데이터 없음")
        return

    plt.figure(figsize=(10, 3))
    y_pos = range(len(subset))

    err = [subset['Odds_Ratio'] - subset['Lower_CI'], subset['Upper_CI'] - subset['Odds_Ratio']]

    colors = ['blue' if m == 'BIM' else 'red' for m in subset['Model']]

    plt.errorbar(subset['Odds_Ratio'], y_pos, xerr=err, fmt='o', capsize=5, color='black')
    plt.scatter(subset['Odds_Ratio'], y_pos, c=colors, s=100, zorder=3)

    plt.axvline(x=1.0, color='gray', linestyle='--', linewidth=1)

    plt.yticks(y_pos, subset['Model'])
    plt.xlabel(xlabel)
    plt.title(title, fontsize=14, fontweight='bold')

    for i, (_, row) in enumerate(subset.iterrows()):
        plt.text(row['Odds_Ratio'], i - 0.3, f"OR: {row['Odds_Ratio']:.4f}\n(p={row['P_value']:.3f})",
                 ha='center', va='top', fontsize=9)

    plt.tight_layout()
    plt.savefig(os.path.join(DATA_DIR, f'{feature_name}_forest_plot_final.png'), dpi=300)
    plt.show()

print("a1: 문서 길이 영향력 시각화")
plot_forest_comparison(baseline_results, 'doc_length',
                       'a1: 문서 길이 Odds Ratio (BIM vs BM25)',
                       'Odds Ratio (1.0보다 작으면 길이가 길수록 불리함)')

print("a2: 쿼리 길이 영향력 시각화")
plot_forest_comparison(baseline_results, 'query_length',
                       'a2: 쿼리 길이 Odds Ratio (질의어가 길수록 유리한가)',
                       'Odds Ratio (1.0보다 크면 쿼리가 길수록 성공 확률 증가)')

print("a3: 도메인 토픽 영향력 시각화")
plot_forest_comparison(baseline_results, 'dominant_topic',
                       'a3: 도미넌트 토픽 Odds Ratio (주제별 검색 성공 확률)',
                       'Odds Ratio (영향력)')

plt.figure(figsize=(12, 6))
x = np.arange(len(comprehensive_results))
width = 0.35

plt.subplot(1, 2, 1)
plt.bar(x - width/2, comprehensive_results['MAP'], width, label='MAP', alpha=0.8)
plt.bar(x + width/2, comprehensive_results['P@10'], width, label='P@10', alpha=0.8)
plt.xlabel('모델')
plt.ylabel('점수')
plt.title('검색 성능 비교 (MAP vs P@10)')
plt.xticks(x, comprehensive_results['Model'])
plt.legend()
plt.grid(axis='y', linestyle=':', alpha=0.7)

plt.subplot(1, 2, 2)
plt.bar(x, comprehensive_results['Pseudo_R2'], alpha=0.8, color=['blue', 'red'])
plt.xlabel('모델')
plt.ylabel('Pseudo R-squared')
plt.title('회귀 모델 설명력')
plt.xticks(x, comprehensive_results['Model'])
plt.grid(axis='y', linestyle=':', alpha=0.7)

plt.tight_layout()
plt.savefig(os.path.join(DATA_DIR, 'performance_comparison_final.png'), dpi=300)
plt.show()

plt.figure(figsize=(10, 6))
pivot_weight = df_weight_tuning.pivot_table(
    index='config',
    columns='model',
    values='auc'
)
sns.heatmap(pivot_weight, annot=True, fmt=".4f", cmap="YlGnBu", linewidths=.5)
plt.title('가중치 설정별 AUC 히트맵')
plt.xlabel('모델')
plt.ylabel('가중치 설정')
plt.tight_layout()
plt.savefig(os.path.join(DATA_DIR, 'weight_tuning_heatmap_final.png'), dpi=300)
plt.show()

important_features = baseline_results[baseline_results['Feature'].isin(['doc_length', 'query_length', 'dominant_topic'])]
plt.figure(figsize=(10, 6))
sns.barplot(x='Feature', y='Odds_Ratio', hue='Model', data=important_features, palette=['#4c72b0', '#c44e52'])
plt.axhline(y=1.0, color='black', linestyle='--', linewidth=1)
plt.title('주요 변수별 Odds Ratio 비교 (a1, a2, a3)')
plt.xlabel('변수')
plt.ylabel('Odds Ratio')
plt.legend(title='모델')
plt.grid(axis='y', linestyle=':', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(DATA_DIR, 'main_features_comparison_final.png'), dpi=300)
plt.show()

print("\n작업 완료")
print(f"저장된 파일:")
print(f"  - odds_ratio_results_final.csv")
print(f"  - weight_tuning_results_final.csv")
print(f"  - comprehensive_results_final.csv")
print(f"  - 시각화 PNG 파일 4개")

In [7]:
print("\n[다중공선성 진단]")
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_test = df_reg[df_reg['model'] == 'BIM'][feature_cols].copy()
X_test = X_test.dropna()

vif_data = pd.DataFrame()
vif_data["Feature"] = feature_cols
vif_data["VIF"] = [variance_inflation_factor(X_test.values, i) for i in range(len(feature_cols))]
vif_data = vif_data.sort_values('VIF', ascending=False)

print(vif_data)
print("\nVIF > 10이면 심각한 다중공선성")


[다중공선성 진단]
               Feature         VIF
3   query_unique_ratio  101.640988
4    query_match_count   85.764727
1         query_length   81.861196
5    query_match_ratio   62.290909
2  query_avg_token_len   25.764601
7        dominant_prob   13.945969
6       dominant_topic    3.478408
0           doc_length    2.990859

VIF > 10이면 심각한 다중공선성
