In [1]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

In [2]:
import pandas as pd
import numpy as np
import random


# Seed both the standard-library and NumPy random generators with the same
# fixed value so any random draws made later in the notebook are repeatable.
random.seed(93)
np.random.seed(93)


# Load the 2023 topics parquet file from GCS and preview the first rows.
topic_df = pd.read_parquet("gs://scraped-news-article-data-null/2023-topics.parquet")
topic_df.head()

Unnamed: 0,source,id,category,title,published,body,summary,summary_type,topic,probability
0,reuters,29667,Europe,European shares rise on upbeat cues from Powel...,2023-02-08T17:11:00,Feb 8 (Reuters) - European shares rose on Wedn...,* \n* STOXX 600 pulls back from 9-mth high to ...,BULLETS,-1,0.0
1,reuters,79619,China,China Evergrande debt restructuring incentive ...,2023-04-27T06:05:00,"HONG KONG, April 27 (Reuters) - Embattled prop...",,,310,0.281727
2,reuters,48010,European Markets,European stocks rally as banking worries fade,2023-03-30T16:27:00,March 30 (Reuters) - European stocks rose to n...,* \n* H&M posts surprise profit in Dec-Feb\n* ...,BULLETS,77,1.0
3,reuters,67859,Aerospace & Defense,China gears up to compete with SpaceX's Starli...,2023-03-02T10:29:00,"BEIJING, March 2 (Reuters) - China's military-...",,,-1,0.0
4,reuters,109054,World,Russia's war on Ukraine latest: Moscow denies ...,2023-05-12T05:13:00,May 11 (Reuters) - Russia's defence ministry o...,,,719,1.0


In [3]:
from datetime import datetime


# Default instruction header placed in front of the title list by select_titles.
BASE_PROMPT = """A list of news article titles with the published time is given below. Using only the provided information, summarize the theme of the titles such that it will be easy to answer investing related questions from the summary. Be specific about the dates and entities involved and try to not omit important details. Do not use vague terms such as "past few month" or "various companies".\n\n"""


def select_titles(topic_df, topic, limiter, base_prompt=BASE_PROMPT):
    """Build a prompt listing as many titles of one topic as the limiter allows.

    Rows of ``topic_df`` whose ``topic`` column equals ``topic`` are ordered by
    ``probability`` (highest first) and appended to ``base_prompt`` one by one
    as ``"Published at: MM/DD/YYYY Title: <title>\\n"``.

    Args:
        topic_df: DataFrame with ``topic``, ``title``, ``published`` and
            ``probability`` columns. ``published`` may hold ISO-8601 strings
            or datetime-like values.
        topic: Topic value to select.
        limiter: Callable taking a prompt string and returning True while the
            prompt is still within budget.
        base_prompt: Text the title rows are appended to.

    Returns:
        The longest prompt (``base_prompt`` plus leading rows) accepted by
        ``limiter``. If not even the first row fits, ``base_prompt`` is
        returned unchanged.

    Raises:
        KeyError: If no row of ``topic_df`` belongs to ``topic``.
    """
    topic_segment = topic_df.loc[
        topic_df.topic == topic, ["title", "published", "probability"]
    ].sort_values(by="probability", ascending=False)
    if topic_segment.empty:
        raise KeyError("Invalid Topic: " + str(topic))

    prompt = base_prompt
    row_format = "Published at: %s Title: %s\n"
    for published, title in zip(topic_segment["published"], topic_segment["title"]):
        # Accept values that are already datetimes as well as ISO strings.
        if isinstance(published, datetime):
            timestamp = published
        else:
            timestamp = datetime.fromisoformat(published)
        candidate = prompt + row_format % (timestamp.strftime("%m/%d/%Y"), title)
        # Check the limit on the prompt *including* the new row; testing only
        # the previous prompt would let the result overshoot by one row.
        if not limiter(candidate):
            break
        prompt = candidate
    return prompt


def create_koala_topic_summarizer(prompt=BASE_PROMPT, temperature=0.7, max_new_tokens=512, max_prompt_tokens=1536):
    """Create a topic summarizer backed by the locally stored Koala (Llama) model.

    The tokenizer and model are loaded once from ``/home/jupyter/koala_transformer``
    and captured by the returned closure.

    Args:
        prompt: Instruction text placed before the list of titles.
        temperature: Sampling temperature passed to ``model.generate``.
        max_new_tokens: Maximum number of tokens to generate.
        max_prompt_tokens: Token budget for the complete, Koala-formatted prompt.

    Returns:
        ``summarize_topics(topic_df, topic_number)``, which returns the
        generated text for that topic as a string.
    """
    cache_dir = "/home/jupyter/models"
    # NOTE(review): device_map / max_input_size are passed to the tokenizer as in
    # the original code; whether the tokenizer uses them is not visible here.
    tokenizer = LlamaTokenizer.from_pretrained("/home/jupyter/koala_transformer", device_map="auto", cache_dir=cache_dir,  max_input_size=2048)
    model = LlamaForCausalLM.from_pretrained("/home/jupyter/koala_transformer", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto", cache_dir=cache_dir)

    def generate_koala_prompt(prompt):
        """Wrap ``prompt`` in the conversation template used for this model."""
        system = """BEGINNING OF CONVERSATION: """
        template = system + "USER: %s GPT:"
        return template % prompt

    def generate_from_tokens(tokens):
        """Sample a completion and return only the newly generated text."""
        outputs = model.generate(**tokens,
                                 do_sample=True,
                                 top_p=1.0,
                                 num_beams=1,
                                 top_k=50,
                                 temperature=temperature,
                                 max_new_tokens=max_new_tokens)
        # The output sequence starts with the prompt tokens; skip them.
        prompt_length = tokens["input_ids"].shape[1]
        return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    def token_count_limiter(final_prompt):
        """Return True while the Koala-formatted prompt fits the token budget."""
        final_prompt = generate_koala_prompt(final_prompt)
        tokens = tokenizer(final_prompt, return_tensors="pt")
        token_count = tokens["input_ids"].shape[1]
        return token_count <= max_prompt_tokens

    def summarize_topics(topic_df, topic_number):
        """Build the prompt for ``topic_number`` and generate its summary."""
        final_prompt = select_titles(topic_df, topic_number, token_count_limiter, base_prompt=prompt)
        final_prompt = generate_koala_prompt(final_prompt)
        # Follow the model's own device instead of assuming "cuda": the model
        # is loaded with device_map="auto", so its placement is not fixed.
        tokens = tokenizer(final_prompt, return_tensors="pt").to(model.device)
        return generate_from_tokens(tokens)

    return summarize_topics


In [4]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel
import re


# Instruction header for the Vertex AI (PaLM 2) summarizer. The model is asked
# to wrap its answer in <SUMMARY>…</SUMMARY> tags, which is what the regex in
# create_palm2_summarizer extracts.
API_PROMPT = (
    "A list of news article titles with the published time is given below. "
    "Using only the provided information, summarize the theme of the titles such that it will be easy to answer investing related questions from the summary. "
    "Be specific about the dates and entities involved. "
    "Be concise in writing the summary, but try not to omit important details. "
    'Do not use vague terms such as "past few months", "various companies", or "the disease". Use the actual names of the entities such as companies, products, etc if possible. '
    'Finally, explain how the summary can be used to answer investing related questions. '
    'If a clear theme relevant to investing is not present, write "NO THEME" as the summary, and explain the reasons. '
    'The format of the output should be: <SUMMARY>the summary</SUMMARY><EXPLAINATION>the explanation</EXPLAINATION>\n\n'
)

def create_palm2_summarizer(prompt=API_PROMPT, temperature=0.7, max_new_tokens=1024, max_prompt_tokens=2048):
    """Create a topic summarizer backed by a Vertex AI text generation model.

    Args:
        prompt: Instruction text placed before the list of titles.
        temperature: Sampling temperature passed to the model.
        max_new_tokens: Maximum number of output tokens requested.
        max_prompt_tokens: Initial token budget for the prompt. It is counted
            with the local Llama tokenizer, which is presumably only an
            approximation of the remote model's tokenization.

    Returns:
        ``summarize_topics(topic_df, topic_number)``, which returns the text
        found between ``<SUMMARY>`` and ``</SUMMARY>`` in the model response.
    """
    cache_dir = "/home/jupyter/models"
    tokenizer = LlamaTokenizer.from_pretrained("/home/jupyter/koala_transformer", device_map="auto", cache_dir=cache_dir,  max_input_size=6656)
    project_id = "msca310019-capstone-f945"
    model_name = "text-bison@001"
    location = "us-central1"
    # DOTALL lets the summary span several lines; without it a response with a
    # line break inside the tags would never match.
    matcher = re.compile(r"\<SUMMARY\>(?P<summary>.+)\<\/SUMMARY\>", re.DOTALL)

    vertexai.init(project=project_id, location=location)
    model = TextGenerationModel.from_pretrained(model_name)

    def predict_large_language_model_sample(
        temperature: float,
        max_decode_steps: int,
        top_p: float,
        top_k: int,
        content: str,
        ):
        """Predict using a Large Language Model."""
        response = model.predict(
            content,
            temperature=temperature,
            max_output_tokens=max_decode_steps,
            top_k=top_k,
            top_p=top_p)
        return response

    def summarize_topics(topic_df, topic_number):
        """Summarize one topic, retrying with a shorter prompt on malformed output.

        Raises:
            AssertionError: If no response contains the ``<SUMMARY>`` tags even
                after the prompt has been reduced to the bare instructions.
        """
        current_max_token = max_prompt_tokens

        def token_count_limiter(candidate):
            # Reads current_max_token at call time, so each retry sees the
            # reduced budget without redefining the closure.
            tokens = tokenizer(candidate, return_tensors="pt")
            token_count = tokens["input_ids"].shape[1]
            return token_count <= current_max_token

        result = ""
        while current_max_token > 0:
            final_prompt = select_titles(topic_df, topic_number, token_count_limiter, base_prompt=prompt)
            result = predict_large_language_model_sample(temperature=temperature, max_decode_steps=max_new_tokens, top_p=0.8, top_k=40, content=final_prompt)
            result = str(result)
            match = matcher.search(result)
            if match:
                return match.group("summary")
            if final_prompt == prompt:
                # No title fits any more; shrinking the budget further cannot
                # change the request, so stop instead of looping forever.
                break
            current_max_token = current_max_token - 16
        raise AssertionError("Did not generate correct response:\n" + result)

    return summarize_topics

In [5]:
import openai
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff
import tiktoken


# Read the OpenAI API key from a local file (surrounding whitespace removed)
# so the secret itself never appears in the notebook.
with open("/home/jupyter/apikey", "r") as api_fp:
    api_key = api_fp.read().strip()

    
# Instruction header for the OpenAI summarizer. The requested
# "SUMMARY: ... EXPLANATION: ..." layout is what the regex in
# create_openai_summarizer relies on.
OPENAI_PROMPT = (
    "A list of news article titles with the published time is given below. "
    "Using only the provided information, summarize the theme of the titles such that it will be easy to answer investing related questions from the summary. "
    "Be specific about the dates and entities involved. "
    "Be concise in writing the summary, but try not to omit important details. "
    'Do not use vague terms such as "past few months", "various companies", or "the disease". Use the actual names if possible. '
    'Finally, explain how the summary can be used to answer investing related questions. '
    'If a clear theme relevant to investing is not present, maintain the specified format, use "SUMMARY: NO THEME" as the summary, and explain the reasons. '
    'The format of the output should be:\nSUMMARY:\nthe summary\nEXPLANATION:\nthe explanation\n\n'
)
    

def create_openai_summarizer(api_key, prompt=OPENAI_PROMPT, temperature=0.7, max_new_tokens=1024, max_prompt_tokens=3000):
    """Create a topic summarizer backed by the OpenAI chat completion API.

    Args:
        api_key: OpenAI API key; assigned to ``openai.api_key``.
        prompt: Instruction text placed before the list of titles.
        temperature: Sampling temperature sent with each request.
        max_new_tokens: ``max_tokens`` value sent with each request.
        max_prompt_tokens: Token budget for the prompt, counted with tiktoken.

    Returns:
        ``summarize_topic(topic_df, topic_number)``, which returns the stripped
        text found between ``SUMMARY:`` and ``EXPLANATION`` in the reply and
        raises ``AssertionError`` when the reply does not have that shape.
    """
    openai.api_key = api_key
    engine = "gpt-3.5-turbo"
    encoding = tiktoken.encoding_for_model(engine)
    matcher = re.compile(r"SUMMARY:(?P<summary>(.|\n)+)EXPLANATION")

    # Retried with random exponential waits (1-60 s), at most six attempts.
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def completion_with_backoff(**kwargs):
        return openai.ChatCompletion.create(**kwargs)

    def token_count_limiter(prompt):
        """Return True while ``prompt`` encodes to at most ``max_prompt_tokens``."""
        return len(encoding.encode(prompt)) <= max_prompt_tokens

    def summarize_topic(topic_df, topic_number):
        """Request a summary for one topic and extract it from the reply."""
        request_text = select_titles(topic_df, topic_number, token_count_limiter, base_prompt=prompt)
        response = completion_with_backoff(
            messages=[{'role': 'user', 'content': request_text}],
            model=engine,
            temperature=temperature,
            max_tokens=max_new_tokens,
        )
        reply = response['choices'][0]['message']['content']
        found = matcher.search(reply)
        if found is None:
            raise AssertionError("Did not generate correct response:\n" + reply)
        return found.group("summary").strip()

    return summarize_topic
    

In [6]:
def create_topic_summarizer(kind="vertex-ai", **kwargs):
    """Create a topic summarizer of the requested kind.

    Args:
        kind: ``"koala"``, ``"vertex-ai"`` or ``"openai"``.
        **kwargs: Extra keyword arguments forwarded to the selected factory
            (for example ``max_new_tokens``). ``temperature`` defaults to
            0.01 for ``"koala"`` and 0 otherwise, and can be overridden here.

    Returns:
        The summarizer callable produced by the selected factory.

    Raises:
        ValueError: If ``kind`` is not one of the supported values.
    """
    if kind == "koala":
        return create_koala_topic_summarizer(**{"temperature": 0.01, **kwargs})
    if kind == "vertex-ai":
        return create_palm2_summarizer(**{"temperature": 0, **kwargs})
    if kind == "openai":
        return create_openai_summarizer(api_key, **{"temperature": 0, **kwargs})
    # str() keeps the intended ValueError even when kind is not a string.
    raise ValueError("Invalid kind: " + str(kind))

In [7]:
# Smoke test: build the default ("vertex-ai") summarizer and print its
# summary for topic 0.
api_summarizer = create_topic_summarizer()
test = api_summarizer(topic_df, 0)
print(test)

Microsoft and Google are investing heavily in A.I., and there are many potential applications for A.I. in the workplace. Investors should keep an eye on these developments as they could have a significant impact on the future of work.


In [8]:
# Smoke test: summarize the same topic (0) with the OpenAI-backed summarizer
# so the two outputs can be compared.
openai_summarizer = create_topic_summarizer(kind="openai")
test = openai_summarizer(topic_df, 0)
print(test)

The articles discuss various developments and applications of artificial intelligence (AI) technology, particularly in the form of generative language models like ChatGPT and Bard. Companies like Google, Microsoft, and OpenAI are investing heavily in AI and developing new products and features that incorporate the technology. AI is being used in a variety of industries, including finance, real estate, and healthcare, and is seen as a potential tool for improving productivity and employee engagement. However, there are also concerns about the risks and ethical implications of AI, and regulators are beginning to investigate the technology.


In [9]:
# All distinct topic values except -1, which is excluded from summarization.
# Sorted so the list (and the row order of tables built from it) does not
# depend on set iteration order.
topics = sorted(set(topic_df.topic.unique()) - {-1})
print(len(topics))

1564


In [10]:
from tqdm import tqdm


# Enable tqdm's pandas integration.
# NOTE(review): no visible cell uses a progress_* pandas method — confirm this
# call is still needed.
tqdm.pandas()

In [11]:
# Which summarizer backend to use; also selects the per-backend results file.
summarizer_type = "openai"


# Resume from previously saved summaries if they exist, otherwise start with an
# empty summary for every topic. Only a missing file triggers the fallback:
# any other failure (credentials, network, interrupt) must surface, because a
# silently empty table would later be written over the saved progress.
try:
    topic_sum = pd.read_parquet("gs://scraped-news-article-data-null/2023-topics-%s.parquet" % summarizer_type)
except FileNotFoundError:
    topic_sum = pd.DataFrame({
        "topics": topics,
        "summary": ["" for _ in topics]
    })

topic_sum.head()

Unnamed: 0,topics,summary
0,0,The news articles revolve around the advanceme...
1,1,The news articles cover the fluctuations in oi...
2,2,The news articles revolve around the security ...
3,3,The Federal Reserve's interest rate decisions ...
4,4,The news articles cover various aspects of the...


In [12]:
# Topics that still have an empty summary are the remaining work.
works = set(topic_sum.loc[topic_sum.summary.str.len() == 0]["topics"].to_list())
summarizer = create_topic_summarizer(summarizer_type)
output_path = "gs://scraped-news-article-data-null/2023-topics-%s.parquet" % summarizer_type
try:
    with tqdm(total = len(works)) as progress:
        for i, w in enumerate(works):
            summary = summarizer(topic_df, w)
            topic_sum.loc[topic_sum.topics == w, "summary"] = summary
            progress.update(1)
            # Checkpoint every 10 topics so a long run can be resumed.
            if i % 10 == 0:
                topic_sum.to_parquet(output_path, index=False)
finally:
    # Always persist what has been produced, also when the run is interrupted
    # or a summarizer call fails, so finished summaries are not lost.
    topic_sum.to_parquet(output_path, index=False)

 17%|█▋        | 251/1498 [1:22:20<6:49:05, 19.68s/it]

KeyboardInterrupt

