In [1]:
import pandas as pd
import os

In [2]:

def create_sample_dataset(output_dir='data'):
    """Build a tiny Java-to-Python translation dataset and write it to CSV.

    Ten hard-coded (source_code, target_code) pairs are loaded into a
    DataFrame and split by position (no shuffling): the first 60% of rows
    become the training set, the next 20% the validation set, and whatever
    remains the test set. Each split is written to ``train.csv``,
    ``val.csv`` and ``test.csv`` inside ``output_dir``. A size summary and
    the first two training samples are printed.

    Args:
        output_dir: Directory that receives the three CSV files. It is
            created (including parents) when missing. Defaults to ``'data'``,
            relative to the current working directory.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of DataFrames. The splits are
        positional slices of the full frame, so they keep its row labels
        (the index is not reset).
    """

    sample_data = [
        # Simple Java-to-Python translation examples
        {
            'source_code': 'public class Hello { public static void main(String[] args) { System.out.println("Hello"); } }',
            'target_code': 'print("Hello")'
        },
        {
            'source_code': 'for (int i = 0; i < 10; i++) { System.out.println(i); }',
            'target_code': 'for i in range(10):\n    print(i)'
        },
        {
            'source_code': 'if (x > 5) { return true; } else { return false; }',
            'target_code': 'return x > 5'
        },
        {
            'source_code': 'int sum = a + b;',
            'target_code': 'sum = a + b'
        },
        {
            'source_code': 'List<String> names = new ArrayList<>();',
            'target_code': 'names = []'
        },
        {
            'source_code': 'public int add(int a, int b) { return a + b; }',
            'target_code': 'def add(a, b):\n    return a + b'
        },
        {
            'source_code': 'try { riskyOperation(); } catch (Exception e) { e.printStackTrace(); }',
            'target_code': 'try:\n    risky_operation()\nexcept Exception as e:\n    print(e)'
        },
        {
            'source_code': 'boolean isEmpty = list.isEmpty();',
            'target_code': 'is_empty = len(list) == 0'
        },
        {
            'source_code': 'while (i < 10) { i++; }',
            'target_code': 'while i < 10:\n    i += 1'
        },
        {
            'source_code': 'public boolean isEven(int n) { return n % 2 == 0; }',
            'target_code': 'def is_even(n):\n    return n % 2 == 0'
        }
    ]

    # Make sure the destination directory exists before any file is written.
    os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(sample_data)

    # Positional split: 60% train, 20% validation, remainder test.
    total_samples = len(df)
    train_size = int(0.6 * total_samples)
    val_size = int(0.2 * total_samples)

    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:train_size + val_size]
    test_df = df.iloc[train_size + val_size:]

    # Persist each split next to the others; the index is omitted so the
    # files contain only the two code columns.
    for file_name, split_df in (
        ('train.csv', train_df),
        ('val.csv', val_df),
        ('test.csv', test_df),
    ):
        split_df.to_csv(os.path.join(output_dir, file_name), index=False)

    print("数据集创建完成:")
    print(f"- 训练集: {len(train_df)} 个样本")
    print(f"- 验证集: {len(val_df)} 个样本")
    print(f"- 测试集: {len(test_df)} 个样本")

    # Preview at most two training samples.
    print("\n数据样本示例:")
    for i in range(min(2, len(train_df))):
        print(f"\n样本 {i+1}:")
        print(f"源代码: {train_df.iloc[i]['source_code']}")
        print(f"目标代码: {train_df.iloc[i]['target_code']}")

    return train_df, val_df, test_df

# Build the sample dataset: the call writes the train/val/test CSV files to disk
# and returns the three splits, which are bound here as notebook-level variables.
train_df, val_df, test_df = create_sample_dataset()

In [3]:
# The builder defined earlier in this notebook is `create_sample_dataset`;
# `create_sample_data` is not defined in any preceding cell, so calling it
# raises NameError after a kernel restart.
create_sample_dataset()

创建了示例数据集:
- 训练集: 6 个样本
- 验证集: 2 个样本


(                                         source_code  \
 0  public class Hello { public static void main(S...   
 1  for (int i = 0; i < 10; i++) { System.out.prin...   
 2  if (x > 5) { return true; } else { return fals...   
 3                                   int sum = a + b;   
 4            List<String> names = new ArrayList<>();   
 5     public int add(int a, int b) { return a + b; }   
 
                          target_code  
 0                     print("Hello")  
 1  for i in range(10):\n    print(i)  
 2                       return x > 5  
 3                        sum = a + b  
 4                         names = []  
 5   def add(a, b):\n    return a + b  ,
                                          source_code  \
 6  try { riskyOperation(); } catch (Exception e) ...   
 7                  boolean isEmpty = list.isEmpty();   
 
                                          target_code  
 6  try:\n    risky_operation()\nexcept Exception ...  
 7                          is_em