# In [None]:
# ライブラリのインポート
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Load the training and test sets from the Kaggle input directory.
train_file_path = '/kaggle/input/lmsysdataset/train.csv'
test_file_path = '/kaggle/input/lmsysdataset/test.csv'

# Read both CSVs in order: training frame first, then the test frame.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Build the binary target: 1 where `winner_model_a` equals 1, else 0.
# A vectorized comparison replaces the row-wise `apply`, which invoked a
# Python lambda once per row; the resulting 0/1 values are the same.
train_df['target'] = (train_df['winner_model_a'] == 1).astype('int64')

# Build one combined text column per response: "<prompt> <response>".
# The same two columns are added to the training frame first, then the test
# frame, so the loop just removes the four copy-pasted assignments.
for frame in (train_df, test_df):
    for text_col, response_col in (('text_a', 'response_a'),
                                   ('text_b', 'response_b')):
        frame[text_col] = frame['prompt'] + ' ' + frame[response_col]

# Configure the TF-IDF vectorizer (max_features=5000 limits the vocabulary).
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer exactly once, on both training text columns.
# Calling fit_transform a second time re-learns the vocabulary and IDF
# weights, so matrices produced before the second fit (X_a) would no longer
# share a feature space with matrices produced after it (X_test_a). A model
# trained on the former and applied to the latter would read the wrong
# columns.
vectorizer.fit(list(train_df['text_a']) + list(train_df['text_b']))

# Every matrix below comes from the same fitted vocabulary, so column j means
# the same term in the training and test matrices for both responses.
X_a = vectorizer.transform(train_df['text_a'])
X_b = vectorizer.transform(train_df['text_b'])
X_test_a = vectorizer.transform(test_df['text_a'])
X_test_b = vectorizer.transform(test_df['text_b'])

# Split into training and validation sets (80/20, fixed seed).
# model_a learns "response A won" from the A-side text.
X_train_a, X_valid_a, y_train, y_valid = train_test_split(
    X_a, train_df['target'], test_size=0.2, random_state=42)

# model_b's output is submitted as `winner_model_b`, so it must be trained and
# scored against the "response B won" label. Previously it reused the
# "response A won" target, which made its probability mean the opposite of
# the column it was written to.
# NOTE(review): assumes train.csv carries a 0/1 `winner_model_b` column
# alongside `winner_model_a` — confirm against the dataset.
X_train_b, X_valid_b, y_train_b, y_valid_b = train_test_split(
    X_b, train_df['winner_model_b'], test_size=0.2, random_state=42)

# Logistic regression models, one per response side.
model_a = LogisticRegression(max_iter=1000)
model_b = LogisticRegression(max_iter=1000)

# Train each model on its own features and its own label.
model_a.fit(X_train_a, y_train)
model_b.fit(X_train_b, y_train_b)

# Validation predictions: probability of the positive class (column 1).
valid_preds_a = model_a.predict_proba(X_valid_a)[:, 1]
valid_preds_b = model_b.predict_proba(X_valid_b)[:, 1]

# Binary log loss of each model against the label it was trained on.
loss_a = log_loss(y_valid, valid_preds_a)
loss_b = log_loss(y_valid_b, valid_preds_b)

print(f'Log Loss for model_a: {loss_a}')
print(f'Log Loss for model_b: {loss_b}')

# Test-set predictions: positive-class probability from each model.
test_preds_a = model_a.predict_proba(X_test_a)[:, 1]
test_preds_b = model_b.predict_proba(X_test_b)[:, 1]

# Previously `winner_tie` was hard-coded to 0.0, so a tie could never receive
# any probability mass and the three columns did not form a distribution.
# Here the tie probability is the mass left over by the two models, floored so
# it is never zero, and each row is then rescaled to sum to 1.
# NOTE(review): assumes the submission is scored with a multi-class log loss
# over these three columns — confirm against the competition rules. The floor
# value is a tunable guess, not a fitted quantity.
TIE_FLOOR = 0.01
test_preds_tie = 1.0 - test_preds_a - test_preds_b
test_preds_tie[test_preds_tie < TIE_FLOOR] = TIE_FLOOR

# Strictly positive because the tie term is at least TIE_FLOOR.
row_totals = test_preds_a + test_preds_b + test_preds_tie

# Assemble the submission frame.
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': test_preds_a / row_totals,
    'winner_model_b': test_preds_b / row_totals,
    'winner_tie': test_preds_tie / row_totals,
})

# Write the submission file to the Kaggle working directory.
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission file created successfully!")
