In [None]:
!pip install transformers datasets evaluate accelerate -q

In [None]:
# ==============================================
# 🚀 MEMORY OPTIMIZED - WON'T CRASH!
# ==============================================

!pip install transformers datasets evaluate accelerate -q

import pandas as pd
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import torch
from google.colab import drive
import gc

# Free any memory left over from a previous run of this cell before loading
# new data and models.
gc.collect()
torch.cuda.empty_cache()

drive.mount('/content/drive')

# Load the tab-separated training file and keep only rows that have both a
# pseudocode line ('text') and its C++ line ('code').
df = pd.read_csv("/content/drive/MyDrive/spoc-train.tsv", sep='\t')
df = df.dropna(subset=['text', 'code'])
df['text'] = df['text'].astype(str).str.strip()
df['code'] = df['code'].astype(str).str.strip()

# ✅ Start with smaller dataset
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population (and replace=False).
sample_size = min(3000, len(df))
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
print(f"✅ Loaded {len(df)} examples")

# Initialize tokenizer
# Start from the stock GPT-2 tokenizer and register the markers used by
# format_example: dedicated pad/bos/eos tokens plus two extra special tokens
# ('<SEP>', '<CODE>') that separate the prompt from the target code.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
special_tokens = {
    'pad_token': '<PAD>',
    'bos_token': '<BOS>',
    'eos_token': '<EOS>',
    'additional_special_tokens': ['<SEP>', '<CODE>']
}
tokenizer.add_special_tokens(special_tokens)
# The vocabulary grows by the tokens added above; the model's embedding matrix
# is resized to this length after the model is loaded.
print(f"✅ Vocabulary size: {len(tokenizer)}")

# Format data
def format_example(text, code):
    """Return the single training string for one pseudocode/code pair.

    The layout is ``<BOS>Pseudocode: {text}<SEP>C++ Code:<CODE>{code}<EOS>``;
    inference prompts must reproduce everything up to and including
    ``<CODE>`` for the model to continue with the code.
    """
    template = "<BOS>Pseudocode: {}<SEP>C++ Code:<CODE>{}<EOS>"
    return template.format(text, code)

# Build the full training string for every row.
df['formatted'] = df.apply(lambda x: format_example(x['text'], x['code']), axis=1)

# Split
# 95% / 5% positional split. The rows were already shuffled by the seeded
# df.sample(...) call above, so slicing by position gives a random split.
train_size = int(0.95 * len(df))
train_df = df[:train_size].reset_index(drop=True)
test_df = df[train_size:].reset_index(drop=True)

# ✅ CRITICAL: Shorter max_length to save memory!
def tokenize_function(examples):
    """Tokenize a batch of formatted examples and attach causal-LM labels.

    Every sequence is truncated/padded to 256 tokens. The labels are a copy of
    the input ids in which padding positions are replaced by -100 so they are
    ignored by the loss.

    Args:
        examples: Batch mapping with a 'formatted' entry (list of strings).

    Returns:
        The tokenizer output with an added 'labels' entry.
    """
    encoded = tokenizer(
        examples['formatted'],
        truncation=True,
        max_length=256,  # ✅ REDUCED FROM 512!
        padding='max_length',
        return_tensors=None,
    )

    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for sequence in encoded['input_ids']:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in sequence]
        )
    encoded['labels'] = masked_labels
    return encoded

print("\n🔄 Tokenizing...")
# Only the 'formatted' column is carried into the datasets; it is removed
# again by map() so the tokenized datasets hold just the tokenizer output
# plus 'labels'.
train_dataset = Dataset.from_pandas(train_df[['formatted']])
test_dataset = Dataset.from_pandas(test_df[['formatted']])

tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=['formatted'])
tokenized_test = test_dataset.map(tokenize_function, batched=True, remove_columns=['formatted'])
print("✅ Tokenization done!")

# Load model
# The embedding matrix must be resized because special tokens were added to
# the tokenizer after the pretrained vocabulary was fixed.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
print(f"✅ Model loaded")

# ✅ MEMORY-SAFE TRAINING ARGUMENTS
# Small per-device batches combined with gradient accumulation keep peak
# memory low while still giving an effective batch of 2 * 8 = 16 per device.
# Checkpoints go to Google Drive; evaluation runs every 200 steps, checkpoints
# are written every 400 steps, and the checkpoint with the lowest eval_loss is
# reloaded when training ends.
# NOTE(review): fp16=True presumably requires a CUDA device — confirm before
# running this cell on a CPU-only runtime.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/gpt2_cpp_safe",
    eval_strategy="steps",
    eval_steps=200,  # More frequent
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # ✅ REDUCED FROM 4!
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,  # ✅ Keep only 1 checkpoint to save disk
    save_steps=400,
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True,
    warmup_steps=100,
    gradient_accumulation_steps=8,  # ✅ INCREASED! Effective batch = 16
    lr_scheduler_type="cosine",
    save_strategy="steps",
    logging_first_step=True,  # ✅ Log first step
    dataloader_num_workers=0,  # ✅ Avoid multiprocessing issues
)

# Create trainer with error handling
# The whole build/train/save sequence sits in one try block so that an
# out-of-memory failure prints remediation hints instead of only a traceback.
try:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
    )

    print("\n🚀 Starting training...")
    print("="*60)
    print("💡 Memory optimized settings:")
    print("   - Batch size: 2 (reduced)")
    print("   - Max length: 256 (reduced)")
    print("   - Gradient accumulation: 8")
    print("   - Effective batch: 16")
    print("="*60)

    # Check GPU memory before training
    if torch.cuda.is_available():
        print(f"\n📊 GPU Memory before training:")
        print(f"   Allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
        print(f"   Reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

    # Train with try-catch
    trainer.train()

    print("\n✅ Training completed successfully!")

    # Save model
    # Model and tokenizer are saved side by side so the added special tokens
    # travel with the weights.
    final_model_path = "/content/drive/MyDrive/gpt2_cpp_safe_final"
    model.save_pretrained(final_model_path)
    tokenizer.save_pretrained(final_model_path)
    print(f"✅ Model saved to {final_model_path}")

except RuntimeError as e:
    # NOTE: the out-of-memory branch does NOT re-raise, so the rest of the
    # cell (the generation smoke test below) still runs, with whatever state
    # the model was left in and without a saved final model.
    if "out of memory" in str(e):
        print("\n💥 CUDA OUT OF MEMORY ERROR!")
        print("="*60)
        print("🔧 SOLUTIONS:")
        print("1. Reduce batch_size to 1")
        print("2. Reduce max_length to 128")
        print("3. Reduce dataset to 1000 samples")
        print("4. Restart Colab runtime and try again")
        print("="*60)

        # Clear memory
        torch.cuda.empty_cache()
        gc.collect()
    else:
        # Any other RuntimeError is reported and propagated.
        print(f"\n💥 TRAINING ERROR: {e}")
        raise e

except KeyboardInterrupt:
    # A manual stop is swallowed as well; nothing is saved in this branch.
    print("\n⏹️  Training manually stopped!")
    print("💡 You can resume with: trainer.train(resume_from_checkpoint=True)")

except Exception as e:
    # Non-RuntimeError failures are reported and propagated.
    print(f"\n💥 UNEXPECTED ERROR: {e}")
    print("Check the error message above for details")
    raise e

# Test generation
def generate_cpp_code(pseudocode, max_new_tokens=150):
    """Generate C++ code for one pseudocode line with the notebook's model.

    The prompt reproduces the training layout up to and including ``<CODE>``
    so the model continues with the code and stops at ``<EOS>``.

    Args:
        pseudocode: Pseudocode text to translate.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text between the first ``<CODE>`` marker and the following
        ``<EOS>`` (stripped), or the full decoded text if no ``<CODE>`` marker
        is present.
    """
    prompt = f"<BOS>Pseudocode: {pseudocode}<SEP>C++ Code:<CODE>"

    # Place the model first, then send the inputs to wherever the model lives
    # so the two can never end up on different devices.
    if torch.cuda.is_available():
        model.cuda()
    # Training leaves dropout enabled; generation must run in eval mode.
    model.eval()
    device = next(model.parameters()).device

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.3,
        )

    # Special tokens are kept on purpose: they delimit the code section.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    if '<CODE>' not in generated_text:
        return generated_text

    # Split only on the first marker so a stray '<CODE>' inside the generated
    # continuation does not discard the text that follows it.
    code = generated_text.split('<CODE>', 1)[1]
    code = code.split('<EOS>', 1)[0]
    return code.strip()

# Quick qualitative check: generate code for a few hand-written prompts.
print("\n" + "="*60)
print("🧪 TESTING")
print("="*60)

test_cases = [
    "declare integer n",
    "read two integers and print sum",
    "for loop from 1 to n",
]

for i, pseudo in enumerate(test_cases, 1):
    print(f"\n{i}. {pseudo}")
    # A failure on one prompt is reported inline so the remaining prompts
    # still run.
    try:
        code = generate_cpp_code(pseudo)
        print(f"   {code}")
    except Exception as e:
        print(f"   Error: {e}")

print("\n✅ All done!")

Mounted at /content/drive
✅ Loaded 3000 examples


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

✅ Vocabulary size: 50262

🔄 Tokenizing...


Map:   0%|          | 0/2850 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

✅ Tokenization done!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


✅ Model loaded


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 50259, 'bos_token_id': 50258, 'pad_token_id': 50257}.



🚀 Starting training...
💡 Memory optimized settings:
   - Batch size: 2 (reduced)
   - Max length: 256 (reduced)
   - Gradient accumulation: 8
   - Effective batch: 16

📊 GPU Memory before training:
   Allocated: 0.48 GB
   Reserved: 0.53 GB


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,1.1691,0.963872
400,0.8827,0.791081


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].



✅ Training completed successfully!
✅ Model saved to /content/drive/MyDrive/gpt2_cpp_safe_final

🧪 TESTING

1. declare integer n
   int N;

2. read two integers and print sum
   cin >> x;

3. for loop from 1 to n
   for (int i = 0;i < N.size(); ++mi) {

✅ All done!


In [None]:
!pip install evaluate sacrebleu tree_sitter -q



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m635.4/635.4 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting git+https://github.com/microsoft/CodeXGLUE.git@main#subdirectory=CodeBLEU
  Cloning https://github.com/microsoft/CodeXGLUE.git (to revision main) to /tmp/pip-req-build-vh5if8n0
  Running command git clone --filter=blob:none --quiet https://github.com/microsoft/CodeXGLUE.git /tmp/pip-req-build-vh5if8n0
  Resolved https://github.com/microsoft/CodeXGLUE.git to commit ac74a62802a0dd159b3258c78a2df8ad36cdf2b9
[31mERROR: git+https://github.com/microsoft/CodeXGLUE.git@main#subdirectory=CodeBLEU does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [None]:
# Step 16: Evaluate Model (BLEU)
from evaluate import load
bleu = load("bleu")
references, predictions = [], []

def evaluate_bleu(dataset, num_samples=50):
    """Compute BLEU for generated code on the first rows of ``dataset``.

    The module-level ``references`` and ``predictions`` lists are emptied and
    refilled in place, so later cells that read them see exactly this run's
    outputs and repeated calls do not accumulate stale entries.

    Args:
        dataset: DataFrame with 'text' (pseudocode) and 'code' (reference)
            columns.
        num_samples: Maximum number of leading rows to evaluate.

    Returns:
        The metric dictionary returned by ``bleu.compute``, or ``None`` when
        there was nothing to evaluate.
    """
    references.clear()
    predictions.clear()

    # Read from the dataset that was passed in; reading the global `df` here
    # would score training rows regardless of the argument.
    limit = max(0, min(num_samples, len(dataset)))
    subset = dataset.iloc[:limit]
    for pseudo, true_code in zip(subset['text'], subset['code']):
        predictions.append(generate_cpp_code(pseudo))
        references.append([true_code])

    if not predictions:
        print("\n📊 BLEU Score: n/a (no samples to evaluate)")
        return None

    results = bleu.compute(predictions=predictions, references=references)
    print(f"\n📊 BLEU Score: {results['bleu']:.4f}")
    return results

evaluate_bleu(test_df)



📊 BLEU Score: 0.1416


In [None]:
# ===========================================================
# 📘 Standalone CodeBLEU Implementation (for code evaluation)
# ===========================================================
import math
from collections import Counter
from nltk.util import ngrams

def ngram_precision(pred_tokens, ref_tokens, n):
    """Return the clipped n-gram precision of a prediction against a reference.

    Each predicted n-gram is credited at most as many times as it occurs in
    the reference. The denominator is the number of predicted n-grams, floored
    at 1 so a prediction shorter than ``n`` yields 0.0 instead of an error.
    """
    candidate_ngrams = list(ngrams(pred_tokens, n))
    # Counter intersection keeps the minimum count of every shared n-gram.
    clipped = Counter(candidate_ngrams) & Counter(ngrams(ref_tokens, n))
    matched = sum(clipped.values())
    denominator = max(1, len(candidate_ngrams))
    return matched / denominator

def corpus_bleu(preds, refs, max_n=4):
    """Return a simplified BLEU-style score for whitespace-tokenized strings.

    For each n-gram order from 1 to ``max_n`` the per-pair clipped precision
    is averaged over all prediction/reference pairs; the result is the
    geometric mean of those averages (each offset by 1e-9 so a zero precision
    does not break the logarithm). No brevity penalty is applied, so the value
    is not directly comparable with standard corpus BLEU.

    Args:
        preds: Predicted strings.
        refs: Reference strings, aligned with ``preds``.
        max_n: Highest n-gram order to include.

    Returns:
        The score as a float; 0.0 when there are no pairs or ``max_n < 1``.
    """
    pairs = list(zip(preds, refs))
    # Nothing to score: avoid dividing by a zero pair count / zero max_n.
    if not pairs or max_n < 1:
        return 0.0

    precision_sums = [0.0] * max_n
    for pred, ref in pairs:
        pred_toks = pred.strip().split()
        ref_toks = ref.strip().split()
        for order in range(1, max_n + 1):
            precision_sums[order - 1] += ngram_precision(pred_toks, ref_toks, order)

    mean_precisions = [total / len(pairs) for total in precision_sums]
    log_mean = sum(math.log(p + 1e-9) for p in mean_precisions) / max_n
    return math.exp(log_mean)

def calc_codebleu(preds, refs, alpha=0.25, beta=0.25, gamma=0.25, theta=0.25):
    """Return a rough, CodeBLEU-inspired score for C++ predictions.

    This is NOT the reference CodeBLEU: there is no AST or data-flow analysis.
    The four weighted components are
      * the simplified n-gram score from ``corpus_bleu`` (weight ``alpha``),
      * the fraction of the reference's distinct C++ keywords that also occur
        in the prediction, averaged over pairs (weight ``beta``),
      * a "syntax" term that simply reuses the keyword score (weight
        ``gamma``),
      * a "dataflow" term computed as 0.9 * bleu + 0.1 * keyword score
        (weight ``theta``).

    Args:
        preds: Predicted code strings.
        refs: Reference code strings, aligned with ``preds``.
        alpha, beta, gamma, theta: Component weights.

    Returns:
        ``{"CodeBLEU": score}``; the score is 0.0 when ``preds`` is empty.
    """
    # Nothing to score: avoid dividing by len(preds) == 0 below.
    if len(preds) == 0:
        return {"CodeBLEU": 0.0}

    keywords = {
        'if','else','while','for','return','break','continue','switch','case','int',
        'float','double','char','bool','class','struct','public','private','protected',
        'include','void','using','namespace','std','cin','cout'
    }

    def keyword_match(pred, ref):
        # Recall of the reference's distinct keywords; whitespace tokens only,
        # so a keyword glued to punctuation (e.g. "int;") is not counted.
        pred_k = {w for w in pred.split() if w in keywords}
        ref_k = {w for w in ref.split() if w in keywords}
        if not ref_k:
            return 1.0
        return len(pred_k & ref_k) / len(ref_k)

    bleu = corpus_bleu(preds, refs)
    keyword_score = sum(keyword_match(p, r) for p, r in zip(preds, refs)) / len(preds)
    syntax_score = keyword_score  # proxy for syntax
    dataflow_score = bleu * 0.9 + keyword_score * 0.1  # proxy for data dependency

    codebleu = alpha*bleu + beta*keyword_score + gamma*syntax_score + theta*dataflow_score
    return {
        "CodeBLEU": codebleu,
    }


In [None]:
# Use the standalone version
# `references` holds one-element lists (the shape the BLEU metric expects), so
# each entry is unwrapped to a plain string for calc_codebleu.
results = calc_codebleu(predictions, [r[0] for r in references])

print("\n🔹 CodeBLEU Results:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")


🔹 CodeBLEU Results:
CodeBLEU: 0.4836


In [None]:
import os

import pandas as pd

# Two-phase workflow:
#   1st run  - no rated sheet exists yet: write the sheet for manual rating.
#   later    - the sheet already has the rating columns: compute the averages.
# The sheet is not rewritten once it has been rated, otherwise every re-run
# would wipe the manually entered scores. Delete the file to regenerate it.
path = "/content/drive/MyDrive/human_eval_results.csv"
rating_columns = ["Correctness", "Readability"]

rated_sheet_exists = False
if os.path.exists(path):
    existing_sheet = pd.read_csv(path)
    rated_sheet_exists = all(col in existing_sheet.columns for col in rating_columns)

if rated_sheet_exists:
    human_eval = existing_sheet
    avg_correctness = human_eval["Correctness"].mean()
    avg_readability = human_eval["Readability"].mean()
    print(f"\n⭐ Human Evaluation Averages:")
    print(f"   Correctness: {avg_correctness:.2f}")
    print(f"   Readability: {avg_readability:.2f}")
else:
    # NOTE(review): `eval_samples` is not defined in the visible notebook. It
    # must be the frame whose rows (in order) produced `predictions`.
    human_eval = pd.DataFrame({
        "Pseudocode": eval_samples["text"],
        "Generated_Code": predictions,
        "Reference_Code": [r[0] for r in references]
    })
    human_eval.to_csv(path, index=False)
    print(f"\n✅ Saved to {path}")
    print("💡 Add 2 columns manually:")
    print("   - Correctness (1–5)")
    print("   - Readability (1–5)")
    print("   Then re-run this cell to compute the averages.")
    # No ratings yet: keep the names defined so the report cell still runs.
    avg_correctness = avg_readability = float("nan")


NameError: name 'human_eval' is not defined

In [None]:
# Compute both automatic scores here from the notebook-level `predictions` /
# `references` lists, because no earlier cell stores them under these names.
# The `bleu` metric reports its score under the 'bleu' key on a 0-1 scale.
bleu_score = bleu.compute(predictions=predictions, references=references)
codebleu_score_dict = calc_codebleu(predictions, [r[0] for r in references])

print("\n📘 FINAL EVALUATION REPORT")
print("="*40)
print(f"BLEU Score:        {bleu_score['bleu']:.4f}")
print(f"CodeBLEU Score:    {codebleu_score_dict['CodeBLEU']:.4f}")
print(f"Human Correctness: {avg_correctness:.2f}/5")
print(f"Human Readability: {avg_readability:.2f}/5")
print("="*40)


In [None]:
# ============================================================
# 🚀 STREAMLIT APP FOR GPT-2 CODE GENERATION WITH NGROK
# ============================================================
from google.colab import drive
drive.mount('/content/drive')
# Step 1: Install required packages
!pip install streamlit pyngrok transformers torch -q

# Step 2: Create Streamlit app file
app_code = '''import streamlit as st
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import os

# Page config
st.set_page_config(
    page_title="Pseudocode to Code Generator",
    page_icon="🤖",
    layout="wide"
)

# Custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .subtitle {
        font-size: 1.2rem;
        color: #666;
        text-align: center;
        margin-bottom: 3rem;
    }
    .code-box {
        background-color: #f0f2f6;
        padding: 1.5rem;
        border-radius: 0.5rem;
        border-left: 4px solid #1f77b4;
        font-family: 'Courier New', monospace;
    }
</style>
""", unsafe_allow_html=True)

# Cache model loading
@st.cache_resource
def load_model():
    """Load the fine-tuned GPT-2 model"""
    model_path = "/content/drive/MyDrive/gpt2_cpp_safe_final"

    with st.spinner("🔄 Loading model... This may take a moment..."):
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        model = GPT2LMHeadModel.from_pretrained(model_path)

        if torch.cuda.is_available():
            model = model.cuda()

    return tokenizer, model

# Code generation function
def generate_code(pseudocode, tokenizer, model, max_tokens=150, temperature=0.8, top_p=0.95):
    """Generate C++ code from pseudocode with the fine-tuned model.

    The prompt must match the layout used during fine-tuning
    (<BOS>Pseudocode: ...<SEP>C++ Code:<CODE>...<EOS>); a differently worded
    prompt gives the model a context it never saw in training.

    Args:
        pseudocode: Pseudocode text entered by the user.
        tokenizer: Tokenizer saved next to the fine-tuned model.
        model: The fine-tuned language model.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The text between the first <CODE> marker and the following <EOS>
        (stripped), or the full decoded text if no <CODE> marker is present.
    """
    prompt = f"<BOS>Pseudocode: {pseudocode.strip()}<SEP>C++ Code:<CODE>"
    inputs = tokenizer(prompt, return_tensors="pt")

    # Send the inputs to whichever device the loaded model actually lives on.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Pad with the dedicated pad token when the tokenizer has one, so padding
    # is never confused with end-of-sequence.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
        )

    # Keep special tokens in the decoded text: they delimit the code section.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract code part
    if "<CODE>" not in generated_text:
        return generated_text
    code = generated_text.split("<CODE>", 1)[1]
    return code.split("<EOS>", 1)[0].strip()

# Main app
def main():
    # Header
    st.markdown('<h1 class="main-header">🤖 Pseudocode to Code Generator</h1>', unsafe_allow_html=True)
    st.markdown('<p class="subtitle">Powered by Fine-tuned GPT-2 on SPOC Dataset</p>', unsafe_allow_html=True)

    # Load model
    try:
        tokenizer, model = load_model()
        st.success("✅ Model loaded successfully!")
    except Exception as e:
        st.error(f"❌ Error loading model: {e}")
        st.info("💡 Make sure the model path is correct: `/content/drive/MyDrive/gpt2_cpp_safe_final`")
        return

    # Sidebar for settings
    with st.sidebar:
        st.header("⚙️ Generation Settings")

        max_tokens = st.slider(
            "Max Tokens",
            min_value=50,
            max_value=300,
            value=150,
            step=10,
            help="Maximum number of tokens to generate"
        )

        temperature = st.slider(
            "Temperature",
            min_value=0.1,
            max_value=2.0,
            value=0.8,
            step=0.1,
            help="Higher = more creative, Lower = more focused"
        )

        top_p = st.slider(
            "Top P (Nucleus Sampling)",
            min_value=0.1,
            max_value=1.0,
            value=0.95,
            step=0.05,
            help="Probability threshold for nucleus sampling"
        )

        st.markdown("---")
        st.markdown("### 📊 Model Info")
        st.info("""
        - **Base Model**: GPT-2
        - **Training Data**: SPOC Dataset
        - **Task**: Pseudocode → Code
        """)

    # Main content area
    col1, col2 = st.columns(2)

    with col1:
        st.header("📝 Input Pseudocode")

        # Example buttons
        st.markdown("**Quick Examples:**")
        example_col1, example_col2 = st.columns(2)

        with example_col1:
            if st.button("🔢 Even Numbers"):
                st.session_state.pseudocode = "declare integer variable n and print all even numbers from 1 to n"
            if st.button("🔄 Factorial"):
                st.session_state.pseudocode = "function to calculate factorial of a number"

        with example_col2:
            if st.button("📊 Squares List"):
                st.session_state.pseudocode = "create a list of squares of numbers from 1 to 10"
            if st.button("🔤 Count Vowels"):
                st.session_state.pseudocode = "read a string and count vowels"

        # Text input
        pseudocode = st.text_area(
            "Enter your pseudocode:",
            value=st.session_state.get('pseudocode', ''),
            height=200,
            placeholder="Example: declare integer variable n and print all even numbers from 1 to n"
        )

        # Generate button
        generate_btn = st.button("🚀 Generate Code", type="primary", use_container_width=True)

    with col2:
        st.header("💻 Generated Code")

        if generate_btn:
            if not pseudocode.strip():
                st.warning("⚠️ Please enter some pseudocode first!")
            else:
                with st.spinner("🔄 Generating code..."):
                    try:
                        generated_code = generate_code(
                            pseudocode,
                            tokenizer,
                            model,
                            max_tokens=max_tokens,
                            temperature=temperature,
                            top_p=top_p
                        )

                        # Display generated code
                        st.markdown('<div class="code-box">', unsafe_allow_html=True)
                        st.code(generated_code, language="cpp")
                        st.markdown('</div>', unsafe_allow_html=True)

                        # Copy button
                        st.download_button(
                            label="📋 Download Code",
                            data=generated_code,
                            file_name="generated_code.cpp",
                            mime="text/plain"
                        )

                    except Exception as e:
                        st.error(f"❌ Error generating code: {e}")
        else:
            st.info("👈 Enter pseudocode and click 'Generate Code' to see results")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666;">
        <p>Built with ❤️ using Streamlit | Fine-tuned GPT-2 Model</p>
    </div>
    """, unsafe_allow_html=True)

if __name__ == "__main__":
    main()
'''

# Write the app to file
# The source contains emoji, so the encoding is given explicitly instead of
# relying on the platform default.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("✅ app.py created successfully!")

# Step 3: Setup ngrok authentication
from pyngrok import ngrok, conf
import getpass

print("\n" + "="*60)
print("🔐 NGROK AUTHENTICATION SETUP")
print("="*60)
print("\n1. Go to: https://dashboard.ngrok.com/get-started/your-authtoken")
print("2. Copy your authtoken")
print("3. Paste it below\n")

# getpass keeps the token out of the cell output and the saved notebook.
ngrok_token = getpass.getpass("Enter your ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

# Step 4: Start Streamlit with ngrok
print("\n" + "="*60)
print("🚀 STARTING STREAMLIT APP")
print("="*60)

# Kill any existing Streamlit processes
!pkill -9 streamlit

# Start streamlit in background
import subprocess
import time

# Start Streamlit
# Output goes to a log file rather than to pipes: nothing in this cell reads
# the pipes, and an unread pipe can fill up and block the child process.
streamlit_log = open("streamlit.log", "w", encoding="utf-8")
streamlit_process = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", "8501"],
    stdout=streamlit_log,
    stderr=subprocess.STDOUT
)

# Wait for Streamlit to start
time.sleep(5)

# Fail early if the server already exited, instead of tunnelling to nothing.
if streamlit_process.poll() is not None:
    streamlit_log.close()
    raise RuntimeError(
        f"Streamlit exited during startup (code {streamlit_process.returncode}); "
        "see streamlit.log"
    )

# Start ngrok tunnel
# ngrok.connect returns a tunnel object; its public_url attribute is the
# plain URL string that should be shown and later passed to ngrok.disconnect.
tunnel = ngrok.connect(8501)
public_url = tunnel.public_url

print("\n✅ Streamlit app is running!")
print(f"🌐 Public URL: {public_url}")
print(f"📱 Share this URL with anyone to access your app!")
print("\n⚠️ Note: Keep this cell running. Stopping it will close the app.")
print("="*60)

# Keep the cell running
try:
    streamlit_process.wait()
except KeyboardInterrupt:
    print("\n\n🛑 Shutting down...")
finally:
    # Clean up whether the cell was interrupted or Streamlit exited by itself.
    streamlit_process.terminate()
    ngrok.disconnect(public_url)
    streamlit_log.close()
    print("✅ App stopped successfully!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ app.py created successfully!

🔐 NGROK AUTHENTICATION SETUP

1. Go to: https://dashboard.ngrok.com/get-started/your-authtoken
2. Copy your authtoken
3. Paste it below

Enter your ngrok authtoken: ··········

🚀 STARTING STREAMLIT APP

✅ Streamlit app is running!
🌐 Public URL: NgrokTunnel: "https://mustachioed-superpiously-joycelyn.ngrok-free.dev" -> "http://localhost:8501"
📱 Share this URL with anyone to access your app!

⚠️ Note: Keep this cell running. Stopping it will close the app.






🛑 Shutting down...
✅ App stopped successfully!
