In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_path = "/kaggle/input/train-data/train.csv"
test_path = "/kaggle/input/test-data/test.csv"

# Read the training set and preview its layout.
train_df = pd.read_csv(train_path)
print("训练集前5行：")
print(train_df.head())

# Whitespace-delimited word count of each description.
# Missing descriptions are blanked first so they count as 0 words; calling
# str() on NaN would produce the literal "nan" and report 1 word instead.
train_df['desc_length'] = (
    train_df['description']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)

# Histogram of the description lengths across the training set.
fig, ax = plt.subplots()
ax.hist(train_df['desc_length'], bins=40, color='skyblue', edgecolor='black')
ax.set_xlabel("Number of descriptors")
ax.set_ylabel("frequency")
ax.set_title("Distribution of the number of descriptors in the training set")
plt.show()

In [None]:
# Step 2: 加载大语言模型（LLM）并生成SVG代码
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# 1) Local Kaggle dataset path of the checkpoint to load
#    (DeepHermes-3 Llama-3 8B preview, judging by the directory name).
model_name = "/kaggle/input/nousresearchdeephermes-3-llama-3-8b-preview/transformers/default/1/NousResearchDeepHermes-3-Llama-3-8B-Preview"

# 2) Load the tokenizer stored alongside the checkpoint.
# NOTE(review): truncation is normally chosen when the tokenizer is *called*,
# not when it is loaded — confirm that passing it here has any effect.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    truncation=False
)

# 3) Load the model in half precision with automatic device placement to
#    lower the memory footprint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,     # FP16 weights
    low_cpu_mem_usage=True,        # reduce CPU memory use while loading
    device_map="auto"              # let the library place layers on GPU/CPU
)

# 4) Build the text-generation pipeline. No explicit device is passed because
#    the model was already placed via device_map="auto".
llm_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# 5) Sample prompt that explicitly asks for a complete SVG document.
sample_prompt = (
    "Generate a valid and complete SVG code for a red apple on a white background.\n"
    "Requirements:\n"
    "1. Must have <svg> ... </svg> tags.\n"
    "2. Must define at least one <ellipse> or <circle> as the apple body.\n"
    "3. Must define a <path> or <rect> as the leaf.\n"
    "No extra explanations or markdown. Output only the SVG code.\n"
    "SVG Code:"
)

# 6) Run the pipeline. max_new_tokens budgets only the generated tokens;
#    max_length would also count the prompt's tokens, so the room left for the
#    SVG would shrink as the prompt grows.
sample_output = llm_generator(
    sample_prompt,
    max_new_tokens=512,
    num_return_sequences=1,
    do_sample=True,           # sample instead of greedy decoding
    top_k=50,                 # sampling knobs; tune as needed
    top_p=0.9
)

# The pipeline's default output contains the prompt followed by the completion.
print("示例输出：")
print(sample_output[0]['generated_text'])

In [None]:
# Step 3: 定义 Model 类和 predict() 函数
import re

class Model:
    """Turn a text description into SVG markup using the notebook's LLM pipeline.

    The class relies on the module-level ``llm_generator`` text-generation
    pipeline being defined before ``predict`` is called.
    """

    # Valid blank-canvas SVG returned whenever the model output is unusable,
    # so predict() never hands back non-SVG text or half-cut markup.
    DEFAULT_SVG = (
        '<svg xmlns="http://www.w3.org/2000/svg" width="256" height="256" '
        'viewBox="0 0 256 256"><rect width="256" height="256" fill="white"/></svg>'
    )

    def __init__(self):
        # Upper bound, in bytes, on the UTF-8 encoded SVG that predict() returns.
        self.max_svg_bytes = 10000

    def predict(self, prompt: str) -> str:
        """Generate SVG code for a text description.

        :param prompt: text description of the image to draw
        :return: the first complete ``<svg>...</svg>`` element found in the
            model's completion, or ``DEFAULT_SVG`` when no complete element is
            found or the element exceeds ``max_svg_bytes`` when UTF-8 encoded
        """
        # Instruction prompt (kept in Chinese, as the model input is runtime text).
        # The byte limit is interpolated so the prompt and the check below agree.
        full_prompt = (
            "请生成一段有效的 SVG 图像代码，代码必须满足以下要求：\n"
            f"1. SVG代码必须小于{self.max_svg_bytes}字节。\n"
            "2. 仅使用允许的 SVG 元素和属性，不包含CSS样式和外部数据。\n"
            "3. 生成的代码能够正确渲染出对应的图像。\n"
            "请根据以下描述生成SVG代码：\n"
            f"{prompt}\n"
            "SVG Code:"
        )

        generation_output = llm_generator(
            full_prompt,
            # Budget only the generated tokens; max_length would also count the
            # prompt, so long descriptions would leave less room for the SVG.
            max_new_tokens=1024,
            num_return_sequences=1,
            # temperature only influences decoding when sampling is enabled.
            do_sample=True,
            temperature=0.7,
            # Return just the completion so the prompt text can never end up
            # in the extracted or fallback result.
            return_full_text=False,
        )
        generated_text = generation_output[0]['generated_text']

        # Keep only the first complete <svg>...</svg> element.
        svg_match = re.search(r'<svg\b[^>]*>.*?</svg>', generated_text, re.DOTALL)
        if svg_match is None:
            # Free text or an unterminated <svg> is not renderable; use the fallback.
            print("警告：未能从模型输出中提取到完整的 SVG，已使用默认 SVG。")
            return self.DEFAULT_SVG
        svg_code = svg_match.group(0)

        # The limit is in bytes, so measure the encoded form. Cutting the string
        # would drop the closing tag (and slicing counts characters, not bytes),
        # so an oversized result is replaced rather than truncated.
        if len(svg_code.encode('utf-8')) > self.max_svg_bytes:
            print("警告：生成的 SVG 代码超出大小限制，已使用默认 SVG。")
            return self.DEFAULT_SVG

        return svg_code

# Smoke test: push a single description through the Model class and show the result.
my_model = Model()
test_prompt = "一个蓝色的天空中漂浮着一朵白云"
generated_svg = my_model.predict(test_prompt)
print("生成的 SVG 代码示例：", generated_svg, sep="\n")

In [None]:
# Step 4: 对测试数据进行推理并生成提交文件
import tqdm  # 如果未安装，请使用 !pip install tqdm

# Read the test set from test_path and preview it.
test_df = pd.read_csv(test_path)
print("测试集数据示例：")
print(test_df.head())

# Generate one SVG per test description.
# Assumes the test set has `id` and `description` columns — rename here if the
# dataset uses different column names.
# The column is iterated directly (no per-row Series as with iterrows()), and
# the tqdm wrapper reports progress because every prediction runs the LLM.
predictions = [
    my_model.predict(description)
    for description in tqdm.tqdm(
        test_df['description'],
        total=len(test_df),
        desc="Generating SVG",
    )
]

# Assemble the submission frame with the columns: id, prediction.
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'prediction': predictions
})

# Write submission.csv to the current working directory.
submission_df.to_csv("submission.csv", index=False)
print("提交文件生成成功！")