In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
import os
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU Model: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU Model: Tesla P100-PCIE-16GB


In [3]:
os.environ["WANDB_DISABLED"] = "true"
print(os.path.exists("./final_model"))
model_name = "gpt2"
print(f"Loading {model_name} model and tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
model = model.to(device)

True
Loading gpt2 model and tokenizer...




In [4]:
def load_and_tokenize_datasets(debug_mode=True):

    print("Loading CNN/DailyMail dataset...")
    summarization = load_dataset("giuliadc/cnndm-filtered")
    
    print("Loading SQuAD dataset...")
    qa = load_dataset("squad")
    
    if debug_mode:
        print("Debug mode: Using small subset of data")
        summarization = {
            'train': summarization['train'].select(range(100)),
            'validation': summarization['validation'].select(range(20))
        }
        qa = {
            'train': qa['train'].select(range(100)),
            'validation': qa['validation'].select(range(20))
        }
    
  
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    

    def prepare_summarization(examples):
        texts = [f"Article: {article}\nSummary: {summary}" 
                for article, summary in zip(examples['article'], examples['highlights'])]
        return tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    
   
    def prepare_qa(examples):
        texts = [f"Question: {question}\nContext: {context}\nAnswer: {answer['text'][0]}"
                for question, context, answer in zip(examples['question'], examples['context'], examples['answers'])]
        return tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    
    print("Processing datasets...")
    tokenized_summ = summarization['train'].map(
        prepare_summarization,
        remove_columns=summarization['train'].column_names,
        batched=True
    )
    
    tokenized_qa = qa['train'].map(
        prepare_qa,
        remove_columns=qa['train'].column_names,
        batched=True
    )
    
   
    combined_dataset = concatenate_datasets([tokenized_summ, tokenized_qa])
    
    print(f"Total examples: {len(combined_dataset)}")
    return combined_dataset, tokenizer

In [5]:
combined_dataset, tokenizer = load_and_tokenize_datasets(debug_mode=True)  
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.config.pad_token_id = model.config.eos_token_id
model = model.to(device)

Loading CNN/DailyMail dataset...
Loading SQuAD dataset...
Debug mode: Using small subset of data
Processing datasets...
Total examples: 200


In [6]:
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none"  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
)

print("\nStarting training...")
trainer.train()

print("\nSaving model...")
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
print("\nTraining completed successfully!")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training...


Step,Training Loss



Saving model...

Training completed successfully!


In [7]:
import pandas as pd
import numpy as np

def evaluate_model(model, eval_dataset, tokenizer):
    eval_args = TrainingArguments(
        output_dir="./eval_results",
        do_train=False,
        do_eval=True,
        per_device_eval_batch_size=8,
        report_to="none"
    )
    
    evaluator = Trainer(
        model=model,
        args=eval_args,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )
    )
    
    print("Running evaluation...")
    metrics = evaluator.evaluate()
    
    perplexity = np.exp(metrics['eval_loss'])
    cross_entropy = metrics['eval_loss']
    
    print("\nEvaluation Results:")
    print(f"Perplexity: {perplexity:.2f}")
    print(f"Cross Entropy Loss: {cross_entropy:.4f}")
    
    return perplexity, cross_entropy

print("Evaluating fine-tuned model...")
perplexity, cross_entropy = evaluate_model(model, combined_dataset, tokenizer)

results = {
    'Model': ['GPT (Fine-tuned)'],
    'Perplexity': [perplexity],
    'Cross Entropy Loss': [cross_entropy]
}

comparison_df = pd.DataFrame(results)
print("\nModel Comparison Results:")
print(comparison_df)

comparison_df.to_csv('model_comparison_results.csv', index=False)

Evaluating fine-tuned model...
Running evaluation...


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Evaluation Results:
Perplexity: 18.48
Cross Entropy Loss: 2.9166

Model Comparison Results:
              Model  Perplexity  Cross Entropy Loss
0  GPT (Fine-tuned)   18.477742            2.916567


In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def load_fine_tuned_model():
    model_path = "./final_model"
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
    
    model = model.to(device)
    return model, tokenizer

# Test Articles for Summarization
test_articles = [
    # Technology
    {
        "title": "AI Breakthrough",
        "text": """OpenAI researchers have announced a major breakthrough in artificial intelligence. 
        The new model demonstrates unprecedented capabilities in understanding and generating 
        human language. In extensive testing, it showed remarkable ability to write code, 
        solve complex mathematical problems, and engage in nuanced dialogue. However, 
        researchers emphasize the importance of responsible AI development and have 
        implemented robust safety measures. The technology could revolutionize fields 
        from education to scientific research."""
    },
    
    # Science
    {
        "title": "Climate Research",
        "text": """A groundbreaking study published in Nature reveals alarming rates of Arctic ice 
        melting. Scientists have found that the rate of melting has doubled in the past 
        decade, far exceeding previous predictions. The research, conducted over five 
        years, combined satellite data with on-site measurements. If current trends 
        continue, sea levels could rise by up to two meters by 2100, threatening 
        coastal cities worldwide. The study calls for immediate action to reduce 
        greenhouse gas emissions."""
    },
    
    # Healthcare
    {
        "title": "Medical Discovery",
        "text": """Researchers at Stanford Medical Center have developed a new cancer treatment 
        that shows promising results. The therapy combines traditional immunotherapy 
        with targeted drug delivery, effectively reducing tumor size in 85% of trial 
        participants. Side effects were minimal compared to conventional treatments. 
        The breakthrough could particularly benefit patients with aggressive forms 
        of breast and lung cancer. Clinical trials are expected to expand to more 
        hospitals next year."""
    }
]

# Test QA Pairs
test_qa_sets = [
    {
        "context": """The Internet was developed in the 1960s by the United States Department of 
        Defense through its ARPANET project. Initially designed as a military 
        communication network that could survive a nuclear attack, it evolved into 
        the modern Internet by the 1990s. Tim Berners-Lee later invented the World 
        Wide Web in 1989 while working at CERN, making the Internet more accessible 
        to the general public.""",
        "questions": [
            "When was the Internet developed?",
            "What was the original purpose of ARPANET?",
            "Who invented the World Wide Web?",
            "Where was the World Wide Web invented?"
        ]
    },
    {
        "context": """Quantum computing leverages quantum mechanical phenomena like superposition 
        and entanglement to perform computations. Unlike classical computers that 
        use bits (0 or 1), quantum computers use quantum bits or qubits that can 
        exist in multiple states simultaneously. This property could potentially 
        solve certain problems exponentially faster than classical computers, 
        particularly in areas like cryptography and molecular simulation.""",
        "questions": [
            "What is quantum computing based on?",
            "How are quantum computers different from classical computers?",
            "What are the potential advantages of quantum computers?",
            "What type of problems could quantum computers solve better?"
        ]
    },
    {
        "context": """Electric vehicles (EVs) have seen rapid advancement in recent years. Modern 
        EVs can travel over 300 miles on a single charge, with some models reaching 
        400+ miles. Charging technology has also improved, with fast-charging stations 
        capable of providing 200 miles of range in just 15 minutes. The cost of EV 
        batteries has dropped by 90% since 2010, making electric vehicles increasingly 
        affordable for average consumers.""",
        "questions": [
            "What is the typical range of modern EVs?",
            "How long does fast-charging take?",
            "How much have EV battery costs changed?",
            "What improvements have been made in EV technology?"
        ]
    }
]

def generate_text(model, tokenizer, prompt, max_new_tokens=100):
    try:
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
            add_special_tokens=True
        ).to(model.device)
        
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            no_repeat_ngram_size=3,
            do_sample=True,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )
        
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error generating text: {str(e)}")
        return ""

def test_model():
    try:
        print("Loading fine-tuned model...")
        model, tokenizer = load_fine_tuned_model()
        
        print("\n=== Testing Summarization Capabilities ===")
        for article in test_articles:
            print(f"\nArticle Topic: {article['title']}")
            print("Original Text:")
            print(article['text'].strip())
            
            prompt = f"Summarize this article:\n{article['text']}\nSummary:"
            generated_text = generate_text(model, tokenizer, prompt, max_new_tokens=150)
            
            print("\nGenerated Summary:")
            summary = generated_text.split("Summary:")[-1].strip() if "Summary:" in generated_text else generated_text
            print(summary)
            print("\n" + "="*80)
        
        print("\n=== Testing Question Answering Capabilities ===")
        for i, qa_set in enumerate(test_qa_sets, 1):
            print(f"\nTest Set {i}:")
            print("Context:", qa_set['context'].strip())
            
            for question in qa_set['questions']:
                prompt = f"Based on the context, answer this question:\nContext: {qa_set['context']}\nQuestion: {question}\nAnswer:"
                generated_text = generate_text(model, tokenizer, prompt, max_new_tokens=100)
                
                print(f"\nQuestion: {question}")
                answer = generated_text.split("Answer:")[-1].strip() if "Answer:" in generated_text else generated_text
                print("Generated Answer:", answer)
            print("\n" + "="*80)
            
    except Exception as e:
        print(f"Error during testing: {str(e)}")

if __name__ == "__main__":
    test_model()

Loading fine-tuned model...

=== Testing Summarization Capabilities ===

Article Topic: AI Breakthrough
Original Text:
OpenAI researchers have announced a major breakthrough in artificial intelligence. 
        The new model demonstrates unprecedented capabilities in understanding and generating 
        human language. In extensive testing, it showed remarkable ability to write code, 
        solve complex mathematical problems, and engage in nuanced dialogue. However, 
        researchers emphasize the importance of responsible AI development and have 
        implemented robust safety measures. The technology could revolutionize fields 
        from education to scientific research.





Generated Summary:
Open AI researchers have revealed a breakthrough in Artificial Intelligence, making it possible to build intelligent machines that can learn from experience and learn from their mistakes.     
The new model shows unprecedented capabilities and has an enormous impact on fields from education and medicine to health and the environment.  The model shows remarkable ability 
to write code , solve complex problems, engage in complex dialogue , and engage with nuanced dialogue .  It can create complex problems , learn from the experience of its environment , and build complex models that can quickly identify, diagnose, and deal with problems in the environment 
It can generate complex models of complex problems and generate complex solutions to complex problems . 
Its predictive modeling and predictive analytics capabilities are


Article Topic: Climate Research
Original Text:
A groundbreaking study published in Nature reveals alarming rates of Arctic ice 
        melting.