In [1]:
# ==============================================================================
# 1. INSTALL AND IMPORT PACKAGES
# ==============================================================================
# Install the required spacy model
!python -m spacy download en_core_web_sm

# Import necessary libraries
import pandas as pd
import re
from collections import Counter, defaultdict
import spacy
from spacy.tokens import Doc
from itertools import product
import time
import tracemalloc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ==============================================================================
# 2. DEFINE DATA NORMALIZATION CLASS
# ==============================================================================
class NormalizedInput:
    """
    Reads and processes the input CSV to extract and count words from a text column.
    """
    def __init__(self, csv_path, text_column):
        self.csv_path = csv_path
        self.text_column = text_column
        self.df = None
        self.processed_df = None

    def read_csv(self):
        print("[NormalizedInput] Reading CSV file:", self.csv_path)
        self.df = pd.read_csv(self.csv_path)
        print("[NormalizedInput] CSV loaded. Shape:", self.df.shape)

    def normalize_text(self, text):
        # Finds all alphabetic words in a lowercased string
        words = re.findall(r'\b[a-zA-Z]+\b', str(text).lower())
        return words

    def count_words(self, words):
        return dict(Counter(words))

    def process(self):
        tracemalloc.start()
        start_time = time.time()
        print("[NormalizedInput] Starting processing of DataFrame.")
        if self.df is None:
            self.read_csv()
        processed_dict = {}
        for idx, row in self.df.iterrows():
            words = self.normalize_text(row[self.text_column])
            word_count = self.count_words(words)
            row_id = row['id'] if 'id' in row else idx
            processed_dict[row_id] = word_count
        self.processed_df = processed_dict
        end_time = time.time()
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        print(f"Time taken: {end_time - start_time:.4f} seconds")
        print(f"Current memory usage: {current / 10**6:.4f} MB; Peak: {peak / 10**6:.4f} MB")

    def get_processed_dictionary(self):
        if self.processed_df is None:
            self.process()
        return self.processed_df

# ==============================================================================
# 3. DEFINE TEMPLATE-BASED SENTENCE GENERATION CLASS
# ==============================================================================
# Define Part-of-Speech (POS) templates from longest to shortest
TEMPLATES = [
    ['determiner', 'adjective', 'noun', 'verb', 'preposition', 'determiner', 'noun'],
    ['determiner', 'adjective', 'noun'],
    ['determiner', 'noun', 'verb', 'noun'],
    ['noun', 'verb', 'noun'],
    ['noun', 'verb', 'preposition', 'noun'],
    ['noun', 'verb', 'determiner', 'noun'],
    ['noun', 'verb', 'adjective', 'noun'],
    ['noun', 'verb', 'conjunction', 'noun'],
    ['noun', 'verb'],
    ['noun', 'noun'],
    ['determiner', 'noun'],
]

class TemplatesGeneration:
    """
    Generates sentences by filling POS templates with words from the input text.
    """
    def __init__(self, normalized_input):
        self.normalized_input = normalized_input
        self.processed_dict = self.normalized_input.get_processed_dictionary()
        self.all_sentences = []
        self.nlp = spacy.load("en_core_web_sm")

    def classify_words_in_row(self, row_id, words):
        doc = Doc(self.nlp.vocab, words=words)
        for name, proc in self.nlp.pipeline:
            doc = proc(doc)
        results = []
        for token in doc:
            pos = token.pos_.lower()
            results.append({'id': row_id, 'word': token.text, 'pos': pos})
        return results

    def group_words_by_pos_with_duplicates(self, classified_words):
        grouped = defaultdict(list)
        word_pos_count = defaultdict(int)
        for item in classified_words:
            pos = item['pos']
            grouped[pos].append((item['word'], word_pos_count[(item['word'], pos)]))
            word_pos_count[(item['word'], pos)] += 1
        return grouped

    def generate_sentences_for_template_with_duplicates(self, word_pos_dict, template):
        if not all(pos in word_pos_dict and word_pos_dict[pos] for pos in template):
            return []
        slot_word_occurrences = [word_pos_dict[pos] for pos in template]
        all_assignments = product(*slot_word_occurrences)
        sentences = set()
        for assignment in all_assignments:
            if len(set(assignment)) == len(assignment):
                sentence = ' '.join(word for word, idx in assignment)
                sentences.add(sentence)
        return list(sentences)

    def get_largest_template(self, word_pos_dict):
        for template in TEMPLATES:
            if all(pos in word_pos_dict and word_pos_dict[pos] for pos in template):
                return template
        return None

    def get_unused_words(self, all_words, used_words):
        return set(all_words) - set(used_words)

    def generate(self, output_path):
        print("[TemplatesGeneration] Starting sentence generation.")
        for row_id, word_counts in self.processed_dict.items():
            words = []
            for word, count in word_counts.items():
                words.extend([word] * count)
            classified = self.classify_words_in_row(row_id, words)
            word_pos_dict = self.group_words_by_pos_with_duplicates(classified)
            all_words = [item['word'] for item in classified]
            used_words = set()
            generated_sentences = []

            largest_template = self.get_largest_template({k: [w for w, i in v] for k, v in word_pos_dict.items()})
            if largest_template:
                generated_sentences = self.generate_sentences_for_template_with_duplicates(word_pos_dict, largest_template)
                for sentence in generated_sentences:
                    used_words.update(sentence.split())

            unused_words = self.get_unused_words(all_words, used_words)
            if unused_words:
                unused_word_pos_dict = {cat: [wi for wi in word_pos_dict[cat] if wi[0] in unused_words] for cat in word_pos_dict}
                for template in TEMPLATES:
                    fallback_sentences = self.generate_sentences_for_template_with_duplicates(unused_word_pos_dict, template)
                    for sentence in fallback_sentences:
                        words_in_sentence = set(sentence.split())
                        if words_in_sentence & unused_words:
                            generated_sentences.append(sentence)
                            used_words.update(words_in_sentence)
                    unused_words = self.get_unused_words(all_words, used_words)
                    if not unused_words:
                        break

            for sentence in generated_sentences:
                self.all_sentences.append({'id': row_id, 'sentence': sentence})

        pd.DataFrame(self.all_sentences).to_csv(output_path, index=False)
        print(f"[TemplatesGeneration] All generated sentences saved to {output_path}")

# ==============================================================================
# 4. DEFINE FINAL PASSAGE GENERATION CLASS
# ==============================================================================
class SentencesGeneration:
    """
    Combines generated sentences and appends missing words based on perplexity
    using a causal language model (Gemma 2).
    """
    def __init__(self, sentence_csv_path, processed_dict, model, tokenizer):
        self.sentence_csv_path = sentence_csv_path
        self.processed_dict = processed_dict
        self.sent_df = pd.read_csv(sentence_csv_path)
        self.final_sentences = []
        self.model = model
        self.tokenizer = tokenizer

    def get_perplexity_with_gemma2(self, text):
        """
        Computes perplexity for a single text string using the provided model.
        """
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
        perplexity = torch.exp(loss).item()
        return perplexity

    def combine_sentences_per_row(self):
        print("[SentencesGeneration] Starting final passage combination.")
        for row_id, word_count_dict in self.processed_dict.items():
            original_bag = []
            for word, count in word_count_dict.items():
                original_bag.extend([word] * count)
            original_counter = Counter(original_bag)
            n_words = len(original_bag)

            row_sentences = self.sent_df[self.sent_df['id'] == row_id]['sentence'].tolist()
            sentence_word_lists = [s.split() for s in row_sentences]

            used_counter = Counter()
            selected_sentences = []
            # Greedily select longest sentences first that don't overuse words
            for words, sent in sorted(zip(sentence_word_lists, row_sentences), key=lambda x: -len(x[0])):
                temp_counter = Counter(words)
                if all(used_counter[w] + temp_counter[w] <= original_counter[w] for w in temp_counter):
                    selected_sentences.append(sent)
                    used_counter += temp_counter
                if sum(used_counter.values()) == n_words:
                    break

            final_counter = Counter()
            for sent in selected_sentences:
                final_counter += Counter(sent.split())

            missing_words = list((original_counter - final_counter).elements())
            current_combined_sentence = " ".join(selected_sentences)

            # Append missing words one by one, choosing the one that minimizes perplexity
            while missing_words:
                candidates = []
                for word in set(missing_words):
                    trial_sentence = current_combined_sentence + " " + word
                    perplexity = self.get_perplexity_with_gemma2(trial_sentence)
                    
                    # <<< PRINT STATEMENT ADDED HERE >>>
                    print(f"  > Evaluating word '{word}'. Perplexity: {perplexity:.4f}")
                    
                    candidates.append((word, perplexity))
                
                if not candidates: break # Break if no candidates found

                best_word, best_perplexity = min(candidates, key=lambda x: x[1])
                print(f"  >> Selected best word '{best_word}' with perplexity {best_perplexity:.4f}\n")
                current_combined_sentence += " " + best_word
                missing_words.remove(best_word)

            self.final_sentences.append({
                "id": row_id,
                "text": current_combined_sentence.strip() # Match submission format
            })

    def save(self, output_path):
        submission_df = pd.DataFrame(self.final_sentences)
        submission_df.to_csv(output_path, index=False)
        print("\nFinal Submission DataFrame:")
        print(submission_df)
        print(f"\n[SentencesGeneration] Final submission saved to {output_path}")

# ==============================================================================
# 5. MAIN EXECUTION SCRIPT
# ==============================================================================
if __name__ == '__main__':
    # --- Configuration ---
    INPUT_CSV_PATH = '/kaggle/input/santa-2024/sample_submission.csv'
    TEXT_COLUMN = 'text'
    MODEL_PATH = "/kaggle/input/gemma-2/transformers/gemma-2-2b/2/"
    TEMP_SENTENCES_PATH = '/kaggle/working/templates_generation.csv'
    FINAL_OUTPUT_PATH = '/kaggle/working/submission.csv'

    # --- Step 1: Load and process the initial data ---
    normalizer = NormalizedInput(csv_path=INPUT_CSV_PATH, text_column=TEXT_COLUMN)
    processed_word_dict = normalizer.get_processed_dictionary()

    # --- Step 2: Generate sentences from templates ---
    template_generator = TemplatesGeneration(normalized_input=normalizer)
    template_generator.generate(output_path=TEMP_SENTENCES_PATH)
    
    # --- Step 3: Load the Causal Language Model and Tokenizer ---
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Main] Using device: {device}")

    # Load tokenizer and model from the specified path
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH).to(device)
    print("[Main] Model and tokenizer loaded successfully.")

    # --- Step 4: Combine sentences into final passages ---
    sentence_combiner = SentencesGeneration(
        sentence_csv_path=TEMP_SENTENCES_PATH,
        processed_dict=processed_word_dict,
        model=model,
        tokenizer=tokenizer
    )
    sentence_combiner.combine_sentences_per_row()
    
    # --- Step 5: Save the final submission file ---
    sentence_combiner.save(output_path=FINAL_OUTPUT_PATH)

    print("\n[Main] End-to-end process completed.")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[NormalizedInput] Starting processing of DataFrame.
[NormalizedInput] Reading CSV file: /kaggle/input/santa-2024/sample_submission.csv
[NormalizedInput] CSV loaded. Shape: (6, 2)
Time taken: 0.0294 seconds
Current memory usage: 0.0876 MB; Peak: 0.8493 MB
[TemplatesGeneration] Starting sentence generation.
[Templates

2025-07-12 04:12:07.433386: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752293527.681581      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752293527.748223      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[Main] Model and tokenizer loaded successfully.
[SentencesGeneration] Starting final passage combination.
  > Evaluating word 'advent'. Perplexity: 58087.5430
  > Evaluating word 'chimney'. Perplexity: 68447.8984
  > Evaluating word 'gingerbread'. Perplexity: 49989.4727
  > Evaluating word 'elf'. Perplexity: 50517.7031
  > Evaluating word 'ornament'. Perplexity: 34703.8203
  > Evaluating word 'mistletoe'. Perplexity: 22191.8340
  >> Selected best word 'mistletoe' with perplexity 22191.8340

  > Evaluating word 'advent'. Perplexity: 15899.4199
  > Evaluating word 'chimney'. Perplexity: 15930.8535
  > Evaluating word 'gingerbread'. Perplexity: 13274.8760
  > Evaluating word 'elf'. Perplexity: 13280.6250
  > Evaluating word 'ornament'. Perplexity: 12444.9180
  >> Selected best word 'ornament' with perplexity 12444.9180

  > Evaluating word 'elf'. Perplexity: 12189.0205
  > Evaluating word 'advent'. Perplexity: 10238.0137
  > Evaluating word 'gingerbread'. Perplexity: 10746.6318
  > Evalua