In [86]:
# The goal of the project is to experiment with different Token Selection Strategies with GPT-2 and use a carbon footprint tracking model t 
# to check how much emissoins we generated 
# https://huggingface.co/distilbert/distilgpt2?text=Once+upon+a+time,

# importing models
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from codecarbon import track_emissions


tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

In [90]:
# encoding function using GPT2 tokenizer 
def encode_text_as_pt_tensor(text):
    pt_tensors = tokenizer.encode(text, return_tensors="pt")
    return pt_tensors

print(encode_text_as_pt_tensor("This is test encoding!"))

tensor([[ 1212,   318,  1332, 21004,     0]])


In [10]:
from transformers import set_seed

set_seed(42)

In [64]:
#Ngram penalty token selection 
def generate_with_ngram_penalty(prompt, n_gram_penalty, num_beams=6):
    tokens = encode_text_as_pt_tensor(prompt)
    output = model.generate(tokens, num_beams=num_beams, no_repeat_ngram_size=n_gram_penalty, pad_token_id=tokenizer.eos_token_id)
    completion = decode_tokens(output[0])
    print(completion)
    return completion


prompt = "Languages are "
generate_with_ngram_penalty(prompt, 2)
generate_with_ngram_penalty(prompt, 3)
generate_with_ngram_penalty(prompt, 4)

Languages are vernacular, but they are also spoken in English, French, German, and Spanish.


Languages are vernacular.


The following is a list of the most common languages in the world.
Languages are vernacular.



This article is about the language. For other uses, see Language


'Languages are vernacular.\n\n\n\nThis article is about the language. For other uses, see Language'

In [72]:
# beam search token selection 
def generate_with_beam_search(prompt,num_beams):
    tokens = encode_text_as_pt_tensor(prompt)
    output = model.generate(tokens, num_beams=num_beams, pad_token_id=tokenizer.eos_token_id)
    completion = decode_tokens(output[0])
    print(completion)
    return completion


generate_with_beam_search(prompt, 2)
generate_with_beam_search(prompt, 5)
generate_with_beam_search(prompt, 15)

Languages are vernacular, but they are not.












Languages are 中文 中文 中文 中文 中文 
Languages are vernacular




















'Languages are vernacular\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

In [92]:
# sampling token selection and tracking emissions 

@track_emissions
def generate_with_sampling(prompt, temperature, top_k, n_gram_penalty=2):
    tokens = encode_text_as_pt_tensor(prompt)
    output = model.generate(tokens, no_repeat_ngram_size=n_gram_penalty, pad_token_id=tokenizer.eos_token_id, do_sample=True, temperature=temperature, top_k=top_k)
    completion = decode_tokens(output[0])
    print(f"Temperature: {temperature}\nTop K: {top_k}\n {completion}")
    return completion

generate_with_sampling(prompt, 0.6, 50)
generate_with_sampling(prompt, 0.8, 30)
generate_with_sampling(prompt, 0.9, 20)

[codecarbon INFO @ 14:56:59] [setup] RAM Tracking...
[codecarbon INFO @ 14:56:59] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 14:57:01] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i5-1235U
[codecarbon INFO @ 14:57:01] [setup] GPU Tracking...
[codecarbon INFO @ 14:57:01] No GPU found.
[codecarbon INFO @ 14:57:01] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 14:57:01] >>> Tracker's metadata:
[codecarbon INFO @ 14:57:01]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 14:57:01]   Python version: 3.12.7
[codecarbon INFO @ 14:57:01]   CodeCarbon version: 3.0.1
[codecarbon INFO @ 14:57:01]   Available RAM : 7.828 GB
[codecarbon INFO @ 14:57:01]   CPU count: 12 thread(s) in 12 physical CPU(

Temperature: 0.6
Top K: 50
 Languages are vernacular/etc. for English, French, Italian, German, Russian, Korean, and Mandarin


[codecarbon INFO @ 14:57:06] Done!

[codecarbon INFO @ 14:57:06] [setup] RAM Tracking...
[codecarbon INFO @ 14:57:06] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 14:57:08] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i5-1235U
[codecarbon INFO @ 14:57:08] [setup] GPU Tracking...
[codecarbon INFO @ 14:57:08] No GPU found.
[codecarbon INFO @ 14:57:08] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 14:57:08] >>> Tracker's metadata:
[codecarbon INFO @ 14:57:08]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 14:57:08]   Python version: 3.12.7
[codecarbon INFO @ 14:57:08]   CodeCarbon version: 3.0.1
[codecarbon INFO @ 14:57:08]   Available RAM : 7.828 GB
[codecarbon INFO @ 14:57:08]   CPU cou

Temperature: 0.8
Top K: 30
 Languages are ㅌ【㈄, and in these cases, there are a couple differences


 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 14:57:15] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i5-1235U
[codecarbon INFO @ 14:57:15] [setup] GPU Tracking...
[codecarbon INFO @ 14:57:15] No GPU found.
[codecarbon INFO @ 14:57:15] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 14:57:15] >>> Tracker's metadata:
[codecarbon INFO @ 14:57:15]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 14:57:15]   Python version: 3.12.7
[codecarbon INFO @ 14:57:15]   CodeCarbon version: 3.0.1
[codecarbon INFO @ 14:57:15]   Available RAM : 7.828 GB
[codecarbon INFO @ 14:57:15]   CPU count: 12 thread(s) in 12 physical CPU(s)
[codecarbon INFO @ 14:57:15]   CPU model: 12th Gen Intel(R) Core(TM) i5-1235U
[codecarbon INFO @ 14:57:

Temperature: 0.9
Top K: 20
 Languages are _________________

Asking for the best version of C++ is easy. In the past several


'Languages are _________________\n\nAsking for the best version of C++ is easy. In the past several'

In [94]:
import pandas as pd

emissions = pd.read_csv('emissions.csv')
print(emissions.head())

             timestamp project_name                                run_id  \
0  2025-05-16T14:48:44   codecarbon  c00151b2-5d6e-4acf-bd52-ff1aa10ec06d   
1  2025-05-16T14:56:30   codecarbon  227b5311-baa2-4623-b627-2a02fa1487fb   
2  2025-05-16T14:57:06   codecarbon  cbbe75d3-e92e-47ac-9240-fe29241dbd78   
3  2025-05-16T14:57:13   codecarbon  2dea5647-8e97-4a5d-8f61-bf5329267b84   
4  2025-05-16T14:57:19   codecarbon  19507dcf-2310-4151-8bc5-e5cea4afa989   

                          experiment_id  duration     emissions  \
0  5b0fa12a-3dd7-45bb-9766-cc326314d9f1  2.305397  2.221475e-05   
1  5b0fa12a-3dd7-45bb-9766-cc326314d9f1  0.051177  3.369301e-07   
2  5b0fa12a-3dd7-45bb-9766-cc326314d9f1  1.993381  1.914827e-05   
3  5b0fa12a-3dd7-45bb-9766-cc326314d9f1  1.141350  1.093518e-05   
4  5b0fa12a-3dd7-45bb-9766-cc326314d9f1  0.776717  7.454615e-06   

   emissions_rate  cpu_power  gpu_power  ram_power  ...  cpu_count  \
0        0.000010       42.5        0.0       10.0  ...         