In [1]:
import transformers
import torch 
import gc 
import json
from datasets import Dataset 
from tokens import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
gc.collect()
torch.cuda.empty_cache()

In [3]:
# Load the model and tokenizer with the access token
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    padding_side="left"
)

# Create the pipeline with the specified model and tokenizer
pipeline = transformers.pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer
)

2024-11-25 16:53:45.186362: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-25 16:53:45.202205: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-25 16:53:45.221329: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-25 16:53:45.227186: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-25 16:53:45.241166: I tensorflow/core/platform/cpu_feature_guar

In [4]:
# Repeater
def repeater(input, batch_size = 1):
    
    terminators = [
        pipeline.tokenizer.eos_token_id,
    ]

    # Generate text
    outputs = pipeline(
        input,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.5,
        top_p=0.9,
        batch_size=batch_size,
        max_new_tokens=2024
    )
    return outputs


In [5]:
# Load Articles 
with open('articles.json', 'r') as file:
    articles = json.load(file)

print(f'{len(articles[len(articles) - 1])} Topics of articles are available.')

5 Topics of articles are available.


In [6]:
def format_chat_template(document):
    instruction = """Being provided a news article:\n
                     1. Summarize the key points of the article.\n
                     2. Include secondary details from the article.\n
                     3. Preserve the author's tone and point of view. """
    
    row_json = [{"role": "user", "content": f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{document}\n\n### Response:\n"}]
    
    return tokenizer.apply_chat_template(row_json, tokenize=False)

In [6]:
def format_chat_template(document):
    instruction = """Being provided a news article, summarize the key points of the article, including secondary details 
                     and preserving the author's tone and point of view in the topic. """
    
    row_json = [{"role": "user", "content": f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{document}\n\n### Response:\n"}]
    
    return tokenizer.apply_chat_template(row_json, tokenize=False)

In [7]:
# Summarized articles 
sum_articles = []

# String where response starts
start_response = "### Response:assistant"

# Loop through every topic 
for topic in articles: 
    sum_topic = []

    # data for batching 
    data = []

    for content in topic: 
        # Prompting the content 
        input = format_chat_template(content)
        data.append(input)

    # Applying batching for each topic of articles 
    output = repeater(data,batch_size=len(data))

    for content in output:
        # Extracting response 
        response_id = content["summary_text"].find(start_response)

        # Saving articles 
        sum_topic.append(content["summary_text"][response_id + len(start_response):])
        
    # Saving the summary topic of articles
    sum_articles.append(sum_topic)

In [46]:
# Summarized articles 
sum_articles = []

# String where response starts
start_response = "### Response:assistant"

# Loop through every topic 
for topic in articles: 
    sum_topic = []
    for content in topic: 
        # Prompting the content 
        input = format_chat_template(content)
        output = repeater(input)
        print(output)
        # Extracting response 
        response_id = output[0]["summary_text"].find(start_response)

        # Saving articles 
        sum_topic.append(output[0]["summary_text"][response_id + len(start_response):])
    
    # Saving the summary topic of articles
    sum_articles.append(sum_topic)

[{'summary_text': "system\n\nCutting Knowledge Date: December 2023\nToday Date: 25 Nov 2024\n\nuser\n\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nBeing provided a news article:\n\n                     1. Summarize the key points of the article.\n\n                     2. Include secondary details from the article.\n\n                     3. Preserve the author's tone and point of view. \n\n### Input:\n{'content': 'Cathie Wood says Elon Musk will succeed in his audit of the federal government because he has ‘more proprietary data’ than anyone\\nARK Invest CEO Cathie Wood backs Tesla CEO Elon Musk in his new role spearheading an audit committee for the federal government.\\n\\n“I think he’s going to bring a lot of efficiency to government,” Wood told CNBC Friday morning.\\n\\nEarlier this week, President-elect Donald Trump appointed Musk to head what he called the Department of Government Efficiency (DOGE

In [8]:
# Write summarized articles 
with open("output2.json", "w") as file: 
  json.dump(sum_articles, file, indent=1)

In [None]:
del outputs
del model