In [1]:
# 这里使用codebert实现了java到python的代码翻译功能使用的是预训练codebert模型
#模型是出自于微软的microsoft/codebert-base

In [2]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
from tqdm import tqdm
import numpy as np

In [3]:
# 任务3：数据集类定义
# 运行这个单元格定义数据加载器

class CodeTranslationDataset(Dataset):
    def __init__(self, source_codes, target_codes, tokenizer, max_length=64):
        self.source_codes = source_codes
        self.target_codes = target_codes
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.source_codes)
    
    def __getitem__(self, idx):
        source_code = str(self.source_codes[idx])
        target_code = str(self.target_codes[idx])
        
        # Tokenize source code
        source_encoding = self.tokenizer(
            source_code,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Tokenize target code
        target_encoding = self.tokenizer(
            target_code,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': source_encoding['input_ids'].flatten(),
            'attention_mask': source_encoding['attention_mask'].flatten(),
            'labels': target_encoding['input_ids'].flatten()
        }

def load_data(file_path):
    
    df = pd.read_csv(file_path)
    return df['source_code'].tolist(), df['target_code'].tolist()

In [4]:
# 构建CodeBert的模型

class ImprovedCodeBERTTranslator(nn.Module):
    def __init__(self, model_name='microsoft/codebert-base', vocab_size=50265, max_length=64):
        super(ImprovedCodeBERTTranslator, self).__init__()
        
        # 加载预训练的CodeBERT模型
        self.codebert = RobertaModel.from_pretrained(model_name)
        self.hidden_size = self.codebert.config.hidden_size
        
        # 解码器 
        self.decoder = nn.Sequential(
            nn.Linear(self.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, vocab_size)
        )
        
        print(f"模型初始化完成: hidden_size={self.hidden_size}, vocab_size={vocab_size}")
        
    def forward(self, input_ids, attention_mask, labels=None):
        # 编码器前向传播
        encoder_outputs = self.codebert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        
        # 使用[CLS] token的输出作为序列表示
        cls_output = encoder_outputs.last_hidden_state[:, 0, :]
        
        # 解码
        logits = self.decoder(cls_output)
        
        return logits

In [5]:
def improved_train_model(model, train_loader, val_loader, epochs=10, learning_rate=5e-5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=1)  # 忽略padding token
    
    train_losses = []
    val_losses = []
    
    print(f"训练样本数: {len(train_loader.dataset)}")
    print(f"验证样本数: {len(val_loader.dataset)}")
    
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        train_correct = 0
        train_total = 0
        
        # 训练阶段
        train_pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train]')
        for batch_idx, batch in enumerate(train_pbar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(input_ids, attention_mask, labels)
            
            # 计算准确率 - 预测目标序列的第一个token
            preds = torch.argmax(outputs, dim=-1)
            targets = labels[:, 0]  # 目标序列的第一个token
            
            correct = (preds == targets).sum().item()
            total = targets.size(0)
            train_correct += correct
            train_total += total
            
            loss = criterion(outputs, targets)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            total_train_loss += loss.item()
            train_pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{correct/total:.4f}' if total > 0 else '0.0000'
            })
        
        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = train_correct / train_total if train_total > 0 else 0
        train_losses.append(avg_train_loss)
        
        # 验证阶段
        model.eval()
        total_val_loss = 0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} [Val]')
            for batch in val_pbar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids, attention_mask, labels)
                preds = torch.argmax(outputs, dim=-1)
                targets = labels[:, 0]
                
                correct = (preds == targets).sum().item()
                total = targets.size(0)
                val_correct += correct
                val_total += total
                
                loss = criterion(outputs, targets)
                total_val_loss += loss.item()
                
                val_pbar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'acc': f'{correct/total:.4f}' if total > 0 else '0.0000'
                })
        
        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = val_correct / val_total if val_total > 0 else 0
        val_losses.append(avg_val_loss)
        
        print(f'Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}')
    
    return train_losses, val_losses

In [6]:
def translate_code(model, tokenizer, source_code, device):
        model.eval()
        
        # 编码源代码
        encoding = tokenizer(
            source_code,
            truncation=True,
            padding='max_length',
            max_length=64,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            
            # 获取预测结果
            predicted_token_id = torch.argmax(outputs, dim=-1).item()
            
            # 解码预测的token
            predicted_token = tokenizer.decode([predicted_token_id], skip_special_tokens=True)
            
            # 获取top-k预测结果
            top_k = 5
            probs = torch.softmax(outputs, dim=-1)
            top_probs, top_indices = torch.topk(probs, top_k)
            
            print(f"Top-{top_k} 预测:")
            for i in range(top_k):
                token_id = top_indices[0, i].item()
                token = tokenizer.decode([token_id], skip_special_tokens=True)
                prob = top_probs[0, i].item()
                print(f"  {i+1}. '{token}' (ID: {token_id}, 概率: {prob:.4f})")
        
        print(f"最终翻译结果: '{predicted_token}'")
        return predicted_token

In [7]:
def evaluate_model(model, tokenizer, test_samples, device):
    results = []
    
    
    for i, (source, target) in enumerate(test_samples):
        print(f"\n--- 测试样本 {i+1}/{len(test_samples)} ---")
        translated = improved_translate_code(model, tokenizer, source, device)
        
        # 检查翻译结果
        exact_match = translated.strip() == target.strip()
        
        results.append({
            'source': source,
            'target': target,
            'translated': translated,
            'exact_match': exact_match
        })
        
        print(f"期望结果: '{target}'")
        print(f"是否匹配: {exact_match}")
    
    exact_match_accuracy = sum([r['exact_match'] for r in results]) / len(results)
    print(f"\n精确匹配准确率: {exact_match_accuracy:.4f} ({sum([r['exact_match'] for r in results])}/{len(results)})")
    
    return results, exact_match_accuracy

In [8]:
def main():
    # 配置参数
    config = {
        'batch_size': 4,
        'max_length': 64,  # 序列长度
        'learning_rate': 3e-5,  # 学习率
        'epochs': 15,      # 训练轮数
        'model_name': 'microsoft/codebert-base'
    }
    
    print(f"配置参数: {config}")
    
    # 加载tokenizer
    tokenizer = RobertaTokenizer.from_pretrained(config['model_name'])
    print(f"Tokenizer词汇表大小: {tokenizer.vocab_size}")
    
    # 加载数据
    train_source, train_target = load_data('data/train.csv')
    val_source, val_target = load_data('data/val.csv')
    test_source, test_target = load_data('data/test.csv')
    
    print(f"数据集大小:")
    print(f"- 训练集: {len(train_source)} 个样本")
    print(f"- 验证集: {len(val_source)} 个样本")
    print(f"- 测试集: {len(test_source)} 个样本")
    
    # 显示一些训练样本
    print("\n训练样本示例:")
    for i in range(min(3, len(train_source))):
        print(f"{i+1}. 源: {train_source[i]} -> 目标: {train_target[i]}")
    
    # 创建数据集
    train_dataset = CodeTranslationDataset(train_source, train_target, tokenizer, config['max_length'])
    val_dataset = CodeTranslationDataset(val_source, val_target, tokenizer, config['max_length'])
    
    # 创建数据加载器
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])
    
    # 初始化模型
    model = ImprovedCodeBERTTranslator(
        model_name=config['model_name'],
        vocab_size=tokenizer.vocab_size,
        max_length=config['max_length']
    )
    
    model_path = 'improved_codebert_translator.pth'
    
    # 训练模型
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")
    model = model.to(device)
    
    print("\n训练模型...")
    train_losses, val_losses = train_model(
        model, train_loader, val_loader,
        epochs=config['epochs'],
        learning_rate=config['learning_rate']
    )
    
    # 保存模型
    torch.save(model.state_dict(), model_path)
    print(f"模型已保存到 '{model_path}'")
    
    # 测试模型
    print("\n测试模型...")
    test_samples = list(zip(test_source, test_target))
    print(f"测试样本数量: {len(test_samples)}")
    
    results, accuracy = evaluate_model(model, tokenizer, test_samples, device)
    
    # 演示翻译功能
    print("\n" + "="*60)
    print("="*60)
    
    demo_samples = [
        'System.out.println("hello world");',
        'int x = 5;',
        'if (x > 0) { return true; }'
    ]
    
    for i, source_code in enumerate(demo_samples):
        print(f"\n演示 {i+1}:")
        translated = translate_code(model, tokenizer, source_code, device)
        print(f"输入: {source_code}")
        print(f"输出: {translated}")

In [9]:
if __name__ == "__main__":
    main()

配置参数: {'batch_size': 4, 'max_length': 64, 'learning_rate': 3e-05, 'epochs': 15, 'model_name': 'microsoft/codebert-base'}


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/codebert-base/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001C982DB14C0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 849c42be-e2f2-4401-97d1-f883402d2d82)')' thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/codebert-base/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001C98476CE90>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 19c94bec-814b-4150-bbde-74010afd2a3c)')' thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokeni

Tokenizer词汇表大小: 50265
数据集大小:
- 训练集: 7 个样本
- 验证集: 2 个样本
- 测试集: 1 个样本

训练样本示例:
1. 源: System.out.println("hello"); -> 目标: print("hello")
2. 源: int x = 10; -> 目标: x = 10
3. 源: if (x > 5) { return true; } -> 目标: if x > 5: return True
模型初始化完成: hidden_size=768, vocab_size=50265
使用设备: cuda

训练模型...
训练样本数: 7
验证样本数: 2


Epoch 1/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:01<00:00,  1.85it/s, loss=10.8950, acc=0.0000]
Epoch 1/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 45.24it/s, loss=10.8274, acc=0.0000]


Epoch 1: Train Loss: 10.9249, Train Acc: 0.0000, Val Loss: 10.8274, Val Acc: 0.0000


Epoch 2/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.40it/s, loss=10.8548, acc=0.0000]
Epoch 2/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 62.37it/s, loss=10.7458, acc=0.0000]


Epoch 2: Train Loss: 10.8309, Train Acc: 0.0000, Val Loss: 10.7458, Val Acc: 0.0000


Epoch 3/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.22it/s, loss=10.6796, acc=0.0000]
Epoch 3/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 67.58it/s, loss=10.6510, acc=0.0000]


Epoch 3: Train Loss: 10.7143, Train Acc: 0.0000, Val Loss: 10.6510, Val Acc: 0.0000


Epoch 4/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.61it/s, loss=10.6226, acc=0.0000]
Epoch 4/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 47.31it/s, loss=10.5454, acc=0.0000]


Epoch 4: Train Loss: 10.6677, Train Acc: 0.0000, Val Loss: 10.5454, Val Acc: 0.0000


Epoch 5/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.27it/s, loss=10.5363, acc=0.0000]
Epoch 5/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 66.44it/s, loss=10.4293, acc=0.0000]


Epoch 5: Train Loss: 10.6019, Train Acc: 0.0000, Val Loss: 10.4293, Val Acc: 0.0000


Epoch 6/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.52it/s, loss=10.3971, acc=0.0000]
Epoch 6/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 72.57it/s, loss=10.3031, acc=1.0000]


Epoch 6: Train Loss: 10.4226, Train Acc: 0.0000, Val Loss: 10.3031, Val Acc: 1.0000


Epoch 7/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.08it/s, loss=10.2346, acc=0.3333]
Epoch 7/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 70.91it/s, loss=10.1831, acc=1.0000]


Epoch 7: Train Loss: 10.2679, Train Acc: 0.4286, Val Loss: 10.1831, Val Acc: 1.0000


Epoch 8/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.31it/s, loss=10.3554, acc=0.0000]
Epoch 8/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 53.44it/s, loss=10.0742, acc=1.0000]


Epoch 8: Train Loss: 10.2749, Train Acc: 0.2857, Val Loss: 10.0742, Val Acc: 1.0000


Epoch 9/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.10it/s, loss=10.1604, acc=0.3333]
Epoch 9/15 [Val]: 100%|█████████████████████████████████████████| 1/1 [00:00<00:00, 71.84it/s, loss=9.9795, acc=1.0000]


Epoch 9: Train Loss: 10.1371, Train Acc: 0.5714, Val Loss: 9.9795, Val Acc: 1.0000


Epoch 10/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.29it/s, loss=9.9988, acc=1.0000]
Epoch 10/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 72.28it/s, loss=9.8755, acc=1.0000]


Epoch 10: Train Loss: 10.0467, Train Acc: 1.0000, Val Loss: 9.8755, Val Acc: 1.0000


Epoch 11/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.23it/s, loss=9.8719, acc=1.0000]
Epoch 11/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 53.68it/s, loss=9.7682, acc=1.0000]


Epoch 11: Train Loss: 9.9612, Train Acc: 0.8571, Val Loss: 9.7682, Val Acc: 1.0000


Epoch 12/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.02it/s, loss=9.8727, acc=1.0000]
Epoch 12/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 46.32it/s, loss=9.6608, acc=1.0000]


Epoch 12: Train Loss: 9.8630, Train Acc: 1.0000, Val Loss: 9.6608, Val Acc: 1.0000


Epoch 13/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.46it/s, loss=9.6927, acc=1.0000]
Epoch 13/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 45.64it/s, loss=9.5167, acc=1.0000]


Epoch 13: Train Loss: 9.7430, Train Acc: 1.0000, Val Loss: 9.5167, Val Acc: 1.0000


Epoch 14/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.32it/s, loss=9.5922, acc=1.0000]
Epoch 14/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 64.58it/s, loss=9.3605, acc=1.0000]


Epoch 14: Train Loss: 9.6096, Train Acc: 1.0000, Val Loss: 9.3605, Val Acc: 1.0000


Epoch 15/15 [Train]: 100%|██████████████████████████████████████| 2/2 [00:00<00:00,  8.35it/s, loss=9.3771, acc=1.0000]
Epoch 15/15 [Val]: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 65.38it/s, loss=9.2278, acc=1.0000]


Epoch 15: Train Loss: 9.4621, Train Acc: 1.0000, Val Loss: 9.2278, Val Acc: 1.0000
模型已保存到 'improved_codebert_translator.pth'

测试模型...
测试样本数量: 1


NameError: name 'improved_evaluate_model' is not defined