# Deep Past Challenge - Akkadian to English Translation Baseline

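TF-IDF nearest-neighbor approach: for each test text, find the most similar Akkadian text in the training data (by cosine similarity of character n-gram TF-IDF vectors) and use its English translation as the prediction.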

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Debug: list what is mounted under /kaggle/input (one level of nesting).
# Entries are not guaranteed to be directories, so only descend into those
# that are; calling iterdir() on a plain file raises NotADirectoryError.
print('Contents of /kaggle/input:')
for item in sorted(Path('/kaggle/input').iterdir()):
    if not item.is_dir():
        print(f'  {item.name} ({item.stat().st_size:,} bytes)')
        continue
    print(f'  {item.name}/')
    for sub in sorted(item.iterdir()):
        print(f'    {sub.name} ({sub.stat().st_size:,} bytes)')

DATA_DIR = Path('/kaggle/input/deep-past-initiative-machine-translation')

# Load the three competition files from DATA_DIR.
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
sample_sub = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# Print shape, column names and the first rows of each frame so the schema
# can be checked by eye before the column-detection step.
print('\n=== train ===')
print(train.shape)
print(train.columns.tolist())
print(train.head(3))
print()
print('=== test ===')
print(test.shape)
print(test.columns.tolist())
print(test.head(3))
print()
print('=== sample_submission ===')
print(sample_sub.shape)
print(sample_sub.columns.tolist())
print(sample_sub.head(3))

In [None]:
# Schema inspection: print the dtypes of the train and test frames, then the
# submission column names, so the ID / source-text / translation columns can
# be identified before they are detected programmatically.
for heading, frame in (('train dtypes:', train), ('test dtypes:', test)):
    print(heading)
    print(frame.dtypes)
    print()
print('submission columns:', sample_sub.columns.tolist())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Detect the ID, source (Akkadian) and target (English) columns from the
# loaded frames instead of hard-coding their names.
#
# Produces: id_col, target_col (from the sample submission),
#           source_col (shared by train and test), eng_col (in train).
# Raises ValueError when no plausible source or translation column exists.
sub_cols = sample_sub.columns.tolist()
id_col = sub_cols[0]  # First column is typically ID
target_col = sub_cols[1]  # Second column is what we need to predict

train_cols = train.columns.tolist()
test_cols = test.columns.tolist()

print(f'ID column: {id_col}')
print(f'Target column (submission): {target_col}')
print(f'Train columns: {train_cols}')
print(f'Test columns: {test_cols}')

# The source text column should be present in both train and test
common_text_cols = [c for c in train_cols if c in test_cols and c != id_col]
print(f'Common text columns: {common_text_cols}')

if not common_text_cols:
    raise ValueError(
        'No column (other than the ID) is shared by train and test; '
        f'train={train_cols}, test={test_cols}'
    )

# Source (Akkadian) column: prefer a shared column that is not numeric, since
# shared numeric columns (e.g. offsets or counters) cannot be the text to
# vectorize. Fall back to the first shared column if every one is numeric.
text_like_cols = [
    c for c in common_text_cols
    if not pd.api.types.is_numeric_dtype(train[c])
]
source_col = (text_like_cols or common_text_cols)[0]

# English column in train. If train already has a column named like the
# submission target, that is the translation. Only otherwise guess from the
# train-only columns, skipping the ID column so that a train-side identifier
# is not mistaken for the translation.
if target_col in train_cols:
    eng_col = target_col
else:
    eng_candidates = [
        c for c in train_cols if c not in test_cols and c != id_col
    ]
    if not eng_candidates:
        raise ValueError(
            f'Could not find a translation column in train: {train_cols}'
        )
    eng_col = eng_candidates[0]

print(f'Source (Akkadian) column: {source_col}')
print(f'English column in train: {eng_col}')

In [None]:
# Replace missing values with empty strings in every text column used below
# (source and English in train, source in test).
for frame, columns in ((train, (source_col, eng_col)), (test, (source_col,))):
    for column in columns:
        frame[column] = frame[column].fillna('')

# Character n-gram TF-IDF over the Akkadian text (character n-grams work well
# for non-English scripts). The vocabulary is fitted on train only and then
# reused to transform test, so both matrices share the same feature space.
tfidf_params = {
    'analyzer': 'char_wb',
    'ngram_range': (2, 5),
    'max_features': 50000,
    'sublinear_tf': True,
}
vectorizer = TfidfVectorizer(**tfidf_params)

train_tfidf = vectorizer.fit_transform(train[source_col])
test_tfidf = vectorizer.transform(test[source_col])

print(f'Train TF-IDF shape: {train_tfidf.shape}')
print(f'Test TF-IDF shape: {test_tfidf.shape}')

In [None]:
# Nearest-neighbor prediction: for each test row, copy the English text of
# the training row whose TF-IDF vector has the highest cosine similarity.
# Test rows are processed in batches so only a (BATCH_SIZE x n_train)
# similarity matrix is held in memory at a time.
BATCH_SIZE = 50
predictions = []

# Pull the translations out once; rows are then looked up by position, which
# matches the row order of train_tfidf.
train_translations = train[eng_col].to_numpy()

for i in range(0, test_tfidf.shape[0], BATCH_SIZE):
    batch = test_tfidf[i:i + BATCH_SIZE]
    sims = cosine_similarity(batch, train_tfidf)
    # NOTE: argmax returns the first maximum, so a test row whose similarity
    # to every training row is 0 gets the first training row's translation.
    best_indices = sims.argmax(axis=1)
    predictions.extend(train_translations[best_indices].tolist())

    if i % 200 == 0:
        print(f'Processed {i + len(best_indices)}/{test_tfidf.shape[0]}')

print(f'Total predictions: {len(predictions)}')

In [None]:
# Build the submission frame: one prediction per test row, with the column
# names taken from the sample submission.
submission = pd.DataFrame({
    id_col: test[id_col],
    target_col: predictions
})

# Ensure every row carries a non-empty prediction. Missing translations in
# train were replaced with '' earlier, so a plain fillna() would not catch
# them; an empty string is written to CSV as an empty field, which is read
# back as NaN by default. Treat blank / whitespace-only text like NaN.
submission[target_col] = submission[target_col].fillna('unknown')
blank_mask = submission[target_col].astype(str).str.strip() == ''
submission.loc[blank_mask, target_col] = 'unknown'

print(submission.head(10))
print(f'Shape: {submission.shape}')
print(f'Null count: {submission[target_col].isnull().sum()}')

submission.to_csv('/kaggle/working/submission.csv', index=False)
print('Saved to /kaggle/working/submission.csv')