# Deep Past Challenge - Akkadian to English Translation Baseline

TF-IDF nearest-neighbor approach: for each test text, find the most similar Akkadian text in the training data (cosine similarity over character n-gram TF-IDF vectors) and reuse its English translation.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Location of the competition files inside the Kaggle input mount.
DATA_DIR = Path('/kaggle/input/competitions/deep-past-initiative-machine-translation')

# Load the three competition tables in one pass.
train, test, sample_sub = (
    pd.read_csv(DATA_DIR / _csv_name)
    for _csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report the column names and shape of every table.
for _label, _frame in (
    ('train columns:', train),
    ('test columns:', test),
    ('submission columns:', sample_sub),
):
    print(_label, _frame.columns.tolist(), _frame.shape)

# Preview the first two rows of the train and test tables.
for _heading, _frame in (('train head:', train), ('test head:', test)):
    print()
    print(_heading)
    print(_frame.head(2).to_string())

In [None]:
# Detect columns
# Submission: id, translation
# Test: id + source text column(s)
# Train: source text column(s) + translation (test-only columns may be absent)

id_col = 'id'
target_col = 'translation'  # from sample_submission

# Source candidates: every test column that is neither the id nor the target.
candidate_cols = [c for c in test.columns if c not in (id_col, target_col)]

# Only columns that also exist in train are usable: the retrieval index is
# built from train, and selecting a test-only column from train would raise
# a KeyError.
test_text_cols = [c for c in candidate_cols if c in train.columns]
skipped_cols = [c for c in candidate_cols if c not in train.columns]
print(f'Test text columns (source): {test_text_cols}')
if skipped_cols:
    print(f'Ignoring test-only columns missing from train: {skipped_cols}')

if not test_text_cols:
    raise ValueError(
        'No source column is shared by train and test: '
        f'test has {candidate_cols}, train has {train.columns.tolist()}'
    )

# Values are cast to str after filling NaNs: ' '.join() and the TF-IDF
# vectorizer both require strings, and a shared column may be numeric.
if len(test_text_cols) == 1:
    source_col = test_text_cols[0]
    train['_source'] = train[source_col].fillna('').astype(str)
    test['_source'] = test[source_col].fillna('').astype(str)
else:
    # Several shared source columns: concatenate them row by row with spaces.
    train['_source'] = (
        train[test_text_cols].fillna('').astype(str).apply(' '.join, axis=1)
    )
    test['_source'] = (
        test[test_text_cols].fillna('').astype(str).apply(' '.join, axis=1)
    )

# Missing translations become empty strings so later slicing never sees NaN.
train[target_col] = train[target_col].fillna('')

print(f'Source sample: {train["_source"].iloc[0][:100]}')
print(f'Target sample: {train[target_col].iloc[0][:100]}')

In [None]:
# Character n-gram TF-IDF over the source text. 'char_wb' builds the n-grams
# from characters inside word boundaries only.
_tfidf_params = {
    'analyzer': 'char_wb',
    'ngram_range': (2, 5),
    'max_features': 50000,
    'sublinear_tf': True,
}
vectorizer = TfidfVectorizer(**_tfidf_params)

# Fit the vocabulary on train only, then project test into the same space.
train_tfidf = vectorizer.fit_transform(train['_source'])
test_tfidf = vectorizer.transform(test['_source'])

for _split_name, _matrix in (('Train', train_tfidf), ('Test', test_tfidf)):
    print(f'{_split_name} TF-IDF: {_matrix.shape}')

In [None]:
# Nearest neighbor prediction: every test row receives the translation of the
# train row whose TF-IDF vector has the highest cosine similarity to it.
BATCH_SIZE = 100
predictions = []

# Extract the targets once as a positional array so each batch can be looked
# up with a single fancy-index instead of one .iloc call per test row.
train_targets = train[target_col].to_numpy()

# Work in batches so the dense similarity matrix stays at
# (BATCH_SIZE x n_train) rather than (n_test x n_train).
for start in range(0, test_tfidf.shape[0], BATCH_SIZE):
    batch = test_tfidf[start:start + BATCH_SIZE]
    sims = cosine_similarity(batch, train_tfidf)
    # NOTE: argmax returns the first index on ties, so a test row with zero
    # similarity to every train row falls back to train row 0.
    best_idx = sims.argmax(axis=1)
    predictions.extend(train_targets[best_idx].tolist())

print(f'Predictions: {len(predictions)}')
# Guard the preview: an empty test set yields no predictions to show.
if predictions:
    print(f'Sample: {predictions[0][:100]}')

In [None]:
# Create submission: one row per test id with its predicted translation.
submission = pd.DataFrame({
    id_col: test[id_col],
    target_col: predictions,
})

# Replace missing AND blank predictions with a placeholder. Missing train
# targets were already turned into '' upstream, so a plain fillna() would
# never trigger here; an empty string would be written as an empty CSV field,
# which reads back as NaN.
translations = submission[target_col].fillna('').astype(str)
submission[target_col] = translations.where(
    translations.str.strip() != '', 'unknown'
)

print(submission.head())
submission.to_csv('/kaggle/working/submission.csv', index=False)
print(f'Saved submission.csv ({submission.shape})')