In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from xgboost import XGBClassifier
from scipy.optimize import linear_sum_assignment
from sklearn.pipeline import Pipeline


In [2]:
# Load training data
train = pd.read_csv('/kaggle/input/problem-3-republic-olymp-tst-homework/train.csv')
train['label'] = 1
# Generate incorrect conclusions
incorrect_conclusions = train['conclusion'].sample(frac=1, random_state=42).reset_index(drop=True)
incorrect_data = train.copy()
incorrect_data['conclusion'] = incorrect_conclusions
incorrect_data['label'] = 0
# Combine datasets
train = pd.concat([train, incorrect_data], ignore_index=True)

In [3]:
# Combine description and conclusion
train['combined'] = '[DESCRIPTION] ' + train['description'] + ' [CONCLUSION] ' + train['conclusion']
X_texts = train['combined']
y = train['label']

In [4]:
# Build pipeline with TF-IDF and XGBoost
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])
X_train_texts, X_test_texts, y_train, y_test = train_test_split(X_texts, y, test_size=0.2, random_state=42)
pipeline.fit(X_train_texts, y_train)
# Predict and evaluate
probs = pipeline.predict_proba(X_test_texts)[:, 1]
preds = (probs > 0.5).astype(int)
print('AUC:', roc_auc_score(y_test, probs))
print('Accuracy:', accuracy_score(y_test, preds))
print('F1 Score:', f1_score(y_test, preds))

AUC: 0.9568725910969886
Accuracy: 0.8966101694915254
F1 Score: 0.9010061668289515


In [5]:
def predict_relationship_batch(descriptions, conclusions):
    combined = [f"[DESCRIPTION] {desc} [CONCLUSION] {conc}" for desc in descriptions for conc in conclusions]
    probs = pipeline.predict_proba(combined)[:, 1]
    return probs.reshape((len(descriptions), len(conclusions)))

In [6]:
# Load test data
testconc = pd.read_csv('/kaggle/input/problem-3-republic-olymp-tst-homework/test_conclusions.csv')
testdesc = pd.read_csv('/kaggle/input/problem-3-republic-olymp-tst-homework/test_descriptions.csv')
anses = []
for num in range(len(testconc) // 10):
    st, en = num * 10, num * 10 + 10
    conc = testconc['conclusion'].iloc[st:en].tolist()
    desc = testdesc['description'].iloc[st:en].tolist()
    score_matrix = predict_relationship_batch(desc, conc)
    cost_matrix = -score_matrix
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    for j in range(10):
        anses.append(col_ind[j] + st + 1)
    print(f'Batch {num} complete')
anses

Batch 0 complete
Batch 1 complete
Batch 2 complete
Batch 3 complete
Batch 4 complete
Batch 5 complete
Batch 6 complete
Batch 7 complete
Batch 8 complete
Batch 9 complete
Batch 10 complete
Batch 11 complete
Batch 12 complete
Batch 13 complete
Batch 14 complete
Batch 15 complete
Batch 16 complete
Batch 17 complete
Batch 18 complete
Batch 19 complete
Batch 20 complete
Batch 21 complete
Batch 22 complete
Batch 23 complete
Batch 24 complete
Batch 25 complete
Batch 26 complete
Batch 27 complete
Batch 28 complete
Batch 29 complete
Batch 30 complete
Batch 31 complete
Batch 32 complete
Batch 33 complete
Batch 34 complete
Batch 35 complete
Batch 36 complete
Batch 37 complete
Batch 38 complete
Batch 39 complete
Batch 40 complete
Batch 41 complete
Batch 42 complete
Batch 43 complete
Batch 44 complete
Batch 45 complete
Batch 46 complete
Batch 47 complete
Batch 48 complete
Batch 49 complete
Batch 50 complete
Batch 51 complete
Batch 52 complete
Batch 53 complete
Batch 54 complete
Batch 55 complete
Ba

[4,
 1,
 10,
 2,
 8,
 6,
 3,
 9,
 7,
 5,
 13,
 16,
 14,
 12,
 15,
 17,
 18,
 11,
 20,
 19,
 26,
 23,
 29,
 22,
 24,
 21,
 30,
 27,
 28,
 25,
 40,
 38,
 35,
 39,
 31,
 32,
 36,
 33,
 34,
 37,
 50,
 46,
 41,
 48,
 44,
 43,
 45,
 47,
 42,
 49,
 60,
 52,
 54,
 55,
 58,
 59,
 56,
 57,
 51,
 53,
 67,
 64,
 65,
 61,
 69,
 70,
 66,
 68,
 62,
 63,
 73,
 80,
 74,
 71,
 72,
 78,
 77,
 79,
 75,
 76,
 87,
 81,
 84,
 89,
 83,
 82,
 88,
 90,
 86,
 85,
 96,
 99,
 94,
 100,
 93,
 95,
 97,
 98,
 91,
 92,
 107,
 106,
 110,
 103,
 109,
 108,
 105,
 101,
 104,
 102,
 116,
 120,
 112,
 111,
 119,
 115,
 114,
 117,
 118,
 113,
 126,
 124,
 123,
 127,
 122,
 129,
 128,
 130,
 121,
 125,
 139,
 132,
 137,
 133,
 131,
 134,
 135,
 140,
 136,
 138,
 143,
 148,
 149,
 145,
 150,
 142,
 146,
 141,
 144,
 147,
 153,
 159,
 155,
 151,
 154,
 157,
 158,
 160,
 152,
 156,
 168,
 162,
 166,
 164,
 170,
 161,
 167,
 165,
 163,
 169,
 178,
 172,
 171,
 176,
 179,
 173,
 177,
 180,
 174,
 175,
 185,
 190,
 188,
 189,
 183

In [7]:
ansdesc = []
ansconc = []
for i in range(7370):
    x = str(i + 1)
    while len(x) < 5:
        x = '0' + x
    y = str(anses[i])
    while len(y) < 5:
        y = '0' + y
    ansdesc.append(x)
    ansconc.append('conc_' + y)
sub = pd.read_csv('/kaggle/input/problem-3-republic-olymp-tst-homework/sample_submission.csv')
sub['conclusion_id'] = ansconc
sub.to_csv('submission.csv', index=False)