In [41]:
# Download requirements for this notebook 
!pip install --upgrade pip
!pip install -q ipywidgets
!pip install "transformers<4.44" "sentence-transformers>=2.2.2"

[0m^C
Traceback (most recent call last):
  File "/usr/local/bin/pip", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/site-packages/pip/_internal/cli/main.py", line 11, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.10/site-packages/pip/_internal/cli/autocompletion.py", line 12, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.10/site-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.10/site-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spinners import open_spinner
  File "/usr/local/lib/python3.10/site-packages/pip/_internal/cli/spinners.py", line 22, in <module>
    from pip._internal.utils.logging import get_console, get_indentation
  File "/usr/local/lib/python3.10/site-package

In [43]:
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 2) Install deps (Kaggle usually has these, but safe to ensure)
!pip -q install sentence-transformers huggingface_hub

import shutil, json, glob, torch
from huggingface_hub import snapshot_download
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# 3) Start fresh: remove any broken local copy
local_dir = "./all-MiniLM-L6-v2"
shutil.rmtree(local_dir, ignore_errors=True)

# 4) Download the FULL repo to a portable folder (no symlinks!)
snap_path = snapshot_download(
    repo_id="sentence-transformers/all-MiniLM-L6-v2",
    local_dir=local_dir,
    local_dir_use_symlinks=False,   # <-- real files, not symlinks
    revision="main",                 # pin a branch/commit if you like
)

print("Snapshot downloaded to:", snap_path)


[0mUsing device: cpu


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
Fetching 30 files: 100%|██████████| 30/30 [00:02<00:00, 10.07it/s]


Snapshot downloaded to: /kaggle/working/all-MiniLM-L6-v2


In [53]:
# --- 1) Pick the device the model is moved to: TPU when torch_xla is installed,
#        otherwise the same cuda/cpu choice used elsewhere in this notebook.
import torch

try:
    import torch_xla.core.xla_model as xm
except ImportError:
    # No torch_xla in this kernel (CPU/GPU session): do not crash, fall back.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("torch_xla not available, using device:", device)
else:
    device = xm.xla_device()  # e.g. xla:1 on v3-8
    print("TPU device:", device)

# Name read by the Basement class when it moves the model.
TARGET_DEVICE = device

TPU device: xla:0


In [54]:
import numpy as np
import pandas as pd   # used by the Basement type annotations; previously only imported in a later cell
import torch          # used by DEVICE below; previously relied on an earlier cell's import
from tqdm import tqdm, trange
from sentence_transformers import SentenceTransformer

# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

MODEL_NAME = "all-MiniLM-L6-v2"
# Number of equal-length text chunks each document is split into before embedding.
K = 5

# Identifier columns (per the original note, all of them are dropped from the model input).
# NOTE(review): "url" is lower-case here but ALL_FIELDS lists 'URL' — confirm which
# spelling the dataset actually uses.
ID_LABELS = ["dataset_id", "article_id", "id", "DOI", "url"]

# Data columns included in the input batch.
# Dropped: issued, embedding, author, text.
TRAIN_LABELS = [
    'title', 'segments', 'extension', 'abstract', 'publisher',
    'copyright', 'issued_year', 'all_authors', 'categories',
]

ALL_FIELDS = [
    'article_id','text','extension','source','dataset_id','dataset_id_cited','type',
    'id','categories','abstract','DOI','publisher','title','URL','copyright',
    'issued_year','all_authors','y'
]

# Device the model is loaded on; TARGET_DEVICE (defined in the device cell) is
# where it is moved afterwards.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Local folder produced by snapshot_download in the download cell.
LOCAL_MODEL_PATH = snap_path

class Basement(SentenceTransformer):
    """Frozen sentence-embedding model used as a nearest-group matcher.

    On construction the reference frame is pre-processed and embedded once and
    stored in ``self.x``. ``predict`` embeds a single record the same way and
    returns the (article_id, dataset_id, source) group of ``self.x`` whose stored
    embeddings are, on average, most similar to it.

    Relies on the notebook-level names ``LOCAL_MODEL_PATH``, ``DEVICE``,
    ``TARGET_DEVICE``, ``K`` and ``TRAIN_LABELS``.
    """

    def __init__(self, initial_data: pd.DataFrame):
        """Load the local model, embed ``initial_data`` and freeze the weights.

        Args:
            initial_data: reference records; must contain a ``text`` column and,
                for ``predict``, the ``article_id``/``dataset_id``/``source`` columns.
        """
        # `device` has to be passed by keyword: the second positional parameter
        # of SentenceTransformer.__init__ is `modules`, not the device.
        super().__init__(LOCAL_MODEL_PATH, device=DEVICE)

        # Embed the reference data while the model sits on DEVICE (same order as
        # before: pre-processing first, device move afterwards).
        self.x = self.preprocess_data(initial_data)

        # `_target_device` is an attribute, not a method; moving the module is
        # done with `.to(...)`.
        self.to(TARGET_DEVICE)

        self.eval()
        print("Model target device:", self.device)
        # freeze all parameters
        for p in self.parameters():
            p.requires_grad = False

    def _meta_conditioner(self, row):
        """Build the embedding matrix for one record.

        Args:
            row: mapping-like record (dict or pandas Series) exposing ``.get``.

        Returns:
            2-D float array: the embedding of the metadata prompt stacked on top
            of the record's ``segments`` embedding(s). When ``segments`` is not a
            numpy array, only the metadata embedding (one row) is returned.
        """
        # The prompt text (including its indentation) is model input; keep it
        # byte-for-byte stable so stored and query embeddings stay comparable.
        combined_prompt = f"""
        Title: {row.get('title', '')}
        Abstract: {row.get('abstract', '')}
        Author: {row.get('all_authors', '')}
        Extension File Type: {row.get('extension', '')}
        Publisher: {row.get('publisher', '')}
        Category: {row.get('categories', '')}
        Issued Year: {row.get('issued_year', '')}
        Copyright: {row.get('copyright', '')}
        """
        meta_embedding = self.encode([combined_prompt], convert_to_numpy=True, output_value="sentence_embedding") # or token_embeddings
        text_embedding = row.get('segments', None)

        if isinstance(text_embedding, np.ndarray):
            if text_embedding.ndim == 1:
                text_embedding = np.expand_dims(text_embedding, axis=0)
            return np.vstack((meta_embedding, text_embedding))

        # Segments missing: fall back to the metadata embedding alone. The
        # previous np.empty(...) returned uninitialised memory, which made the
        # similarity score of such rows arbitrary.
        return np.asarray(meta_embedding, dtype=np.float32).reshape(1, -1)

    def _segment_text(self, s: str, k: int = K) -> np.ndarray:
        """Split ``s`` into ``k`` near-equal character chunks and embed each.

        Args:
            s: document text.
            k: number of chunks.

        Returns:
            float32 array with ``k`` rows; all zeros when ``s`` is not a
            non-empty string.
        """
        if not isinstance(s, str) or not s:
            return np.zeros((k, self.get_sentence_embedding_dimension()), dtype=np.float32)

        n = len(s)
        # k + 1 evenly spaced cut points over [0, n]; chunks may be empty when n < k.
        idx = np.linspace(0, n, k + 1, dtype=int)
        chunks = [s[idx[i]:idx[i+1]] for i in range(k)]
        emb = self.encode(chunks, convert_to_numpy=True, show_progress_bar=False)
        emb = np.asarray(emb, dtype=np.float32)
        return emb.reshape(k, -1)

    def preprocess_data(
        self,
        data: pd.DataFrame,
        k: int = K,
        train_labels: list[str] | None = None
    ) -> pd.DataFrame:
        """Turn raw records into rows carrying an ``inputs`` embedding matrix.

        - Adds 'segments' ((k, d) embeddings) from 'text'
        - Explodes: all_authors, segments, categories
        - Drops duplicates by TRAIN_LABELS (excluding 'segments')
        - Fills: issued_year (mode), copyright / categories ("Unknown")
        - Adds 'inputs' via ``_meta_conditioner``

        Args:
            data: raw records; must contain a ``text`` column. Not modified.
            k: number of text chunks per record.
            train_labels: columns used for de-duplication; defaults to TRAIN_LABELS.

        Returns:
            A new DataFrame with the added ``segments`` and ``inputs`` columns.
        """
        train_labels = train_labels or TRAIN_LABELS

        df = data.copy()

        # 1) create 'segments'
        df["segments"] = df["text"].apply(lambda x: self._segment_text(x, k))

        # 2) explode multi-valued columns
        for col in ("all_authors", "segments", "categories"):
            if col in df.columns:
                df = df.explode(col, ignore_index=True)

        # 3) drop duplicated rows (excluding 'segments')
        # NOTE(review): rows produced by exploding 'segments' differ only in
        # 'segments', so this keeps just the first segment of each record —
        # confirm that this is intended.
        subset_cols = [c for c in train_labels if c in df.columns and c != "segments"]
        if subset_cols:
            df = df.drop_duplicates(subset=subset_cols, keep="first")

        # 4) fills / cleaning
        if "issued_year" in df.columns:
            mode_series = df["issued_year"].dropna().mode()
            if not mode_series.empty:
                df["issued_year"] = df["issued_year"].fillna(mode_series.iloc[0])

        if "copyright" in df.columns:
            # Blank strings AND missing values both become "Unknown" (missing
            # values were previously left as NA, unlike 'categories').
            df["copyright"] = (
                df["copyright"]
                .astype("string")
                .str.strip()
                .replace({"": "Unknown"})
                .fillna("Unknown")
            )

        if "categories" in df.columns:
            df["categories"] = df["categories"].astype("string").fillna("Unknown")

        df["inputs"] = df.apply(self._meta_conditioner, axis=1)

        return df

    def predict(self, inputs: dict):
        """Return the best-matching reference group for one record.

        Args:
            inputs: record with at least a ``text`` entry plus any of the
                metadata keys read by ``_meta_conditioner``. Not modified.

        Returns:
            dict with ``article_id``, ``dataset_id``, ``type`` and ``score`` for
            the group with the highest mean similarity, or ``None`` when the
            record has no text or there are no reference groups.
        """
        text = inputs.get('text', None)
        if not text:
            return None

        # Work on a copy so the caller's dict is not polluted with 'segments'.
        query = dict(inputs)
        query['segments'] = self._segment_text(text)
        input_emb = np.atleast_2d(self._meta_conditioner(query))

        print(f"test input emb shape: {input_emb.shape}")

        max_pred = None
        for grp, group_rows in self.x.groupby(['article_id', 'dataset_id', 'source']):
            # (N, D) matrix of every stored embedding row of this group.
            train_sample = np.vstack(group_rows['inputs'].values)

            # All-pairs similarity (N x M). `similarity_pairwise` is row-by-row
            # and only works when both inputs have the same number of rows,
            # which is not the case here.
            avg = self.similarity(train_sample, input_emb).mean().item()

            if max_pred is None or avg > max_pred['score']:
                max_pred = {
                    'article_id': grp[0],
                    'dataset_id': grp[1],
                    # NOTE(review): 'type' is filled from the 'source' column —
                    # confirm this is the intended label column.
                    'type': grp[2],
                    'score': avg,
                }

        return max_pred


In [45]:
# Read the prepared train / test splits (parquet files from the Kaggle input dataset).

import pandas as pd

train_data, test_data = map(
    pd.read_parquet,
    (
        '/kaggle/input/make-data-count-data-preparation/train_dataset.parquet',
        '/kaggle/input/make-data-count-data-preparation/test_dataset.parquet',
    ),
)

In [None]:
# Build the matcher: loads the local model and embeds the training frame.
base = Basement(initial_data=train_data)



In [None]:
# Mapping type strings → ints
pred_map_idx = {"Primary": 0, "Secondary": 1, "Missing": 2, "Unknown": 3}
# Keys of the dict returned by Basement.predict.
pred_columns = ['article_id', 'dataset_id', 'type', 'score']

# Collect one best-match prediction per test record.
prediction = []
for _, row in test_data.iterrows():
    pred = base.predict(row.to_dict())   # convert Series → dict
    if pred:  # skip empty predictions
        prediction.append(pred)

# Explicit columns keep the frame well-formed even when no prediction was
# produced (otherwise the dropna below raises KeyError on a column-less frame).
submission = pd.DataFrame(prediction, columns=pred_columns)
submission = submission.dropna(subset=['article_id', 'dataset_id', 'type'])

# Nullable Int64 keeps the codes integral in the CSV; a label missing from
# pred_map_idx becomes <NA> instead of turning the whole column into floats.
submission['type'] = submission['type'].map(pred_map_idx).astype('Int64')

# Deduplicate and sort
submission = submission.drop_duplicates(subset=["article_id", "dataset_id", "type"])
submission = submission.sort_values(["article_id", "dataset_id", "type"]).reset_index(drop=True)

# Add row_index (row_id) column
submission['row_index'] = submission.index

# Save in required order
submission[['row_index', 'article_id', 'dataset_id', 'type']].to_csv('submission.csv', index=False)