In [1]:
import pandas as pd
import numpy as np
import random
from transformers import LlamaForCausalLM, LlamaTokenizer


# Seed Python's and NumPy's global random number generators.
random.seed(93)
np.random.seed(93)


# Name of the summarizer variant; it is substituted into the parquet file name below.
summarizer_type = "vertex-ai"


# Read the topic table from the GCS bucket and preview its first rows.
topic_df = pd.read_parquet("gs://scraped-news-article-data-null/2023-topics-%s.parquet" % summarizer_type)
topic_df.head()

Unnamed: 0,topics,summary
0,0,"Google, Microsoft, and OpenAI are all investin..."
1,1,"Oil prices rise on China demand hopes, OPEC+ o..."
2,2,TikTok is facing increasing scrutiny from gove...
3,3,The Federal Reserve is expected to raise inter...
4,4,"The US labor market is still strong, with job ..."


In [2]:
# Keep only the topics that carry a usable summary: the text must be non-empty
# and must not be the "no theme" placeholder (compared lower-cased and stripped).
topic_existing_sum = topic_df.loc[
    topic_df["summary"].str.len().gt(0)
    & topic_df["summary"].str.lower().str.strip().ne("no theme")
]
topic_existing_sum["summary"].head()

0    Google, Microsoft, and OpenAI are all investin...
1    Oil prices rise on China demand hopes, OPEC+ o...
2    TikTok is facing increasing scrutiny from gove...
3    The Federal Reserve is expected to raise inter...
4    The US labor market is still strong, with job ...
Name: summary, dtype: object

In [3]:
# Instruction header shared by every prompt. The single %s placeholder receives
# the inquiry; the numbered topic rows are appended after "List of Topics:".
# NOTE(review): "is give below" looks like a typo for "is given below". The text
# is left untouched so prompts stay identical to the ones behind recorded outputs.
BASE_PROMPT = "A numbered list of summaries of topics from news articles is give below. " +\
"Using only information in the summaries, and without speculating, identify the topics from the list that can be further investigated to answer the inquiry below. " +\
"In addition, explain why the topics identified are selected. " + \
'The answer should be formatted as:\nRELEVANT TOPICS: 1(2/10), 5(6/10), 10(1/10)\nREASON: reason for selecting topics 1, 5, 10\nwhere 1, 5, and 10 are the selected topics from the list. (2/10) is the probability that topic 1 is relevant to the inquiry.\n\nInquiry: %s\nList of Topics:\n'


def generate_topic_prompts(summary_df, inquiry, limiter, base_prompt=BASE_PROMPT):
    """Yield prompts that together cover every row of ``summary_df`` exactly once.

    Each prompt is the header (``base_prompt`` with ``inquiry`` substituted)
    followed by consecutive topic rows, one "<topics>. <summary>" line per row.

    A row is only appended if the prompt *including that row* still satisfies
    ``limiter``. The earlier check-then-append logic could overshoot the budget
    by one row. Every prompt carries at least one row, so the generator always
    advances: a row that does not fit even on its own (or a header that already
    fails ``limiter``) is emitted alone instead of looping forever.

    Args:
        summary_df: DataFrame with a ``topics`` column (formatted with ``%d``)
            and a ``summary`` column. Rows are consumed in positional order.
        inquiry: Text substituted into the ``%s`` placeholder of ``base_prompt``.
        limiter: Callable taking a prompt string and returning a truthy value
            while the prompt is within the size budget.
        base_prompt: Header template containing a single ``%s`` placeholder.

    Yields:
        str: One prompt per batch of rows. Nothing is yielded for an empty frame.
    """
    row_format = "%d. %s\n"
    # Tuple form keeps the substitution correct even if `inquiry` is itself a tuple.
    header = base_prompt % (inquiry,)

    prompt = header
    rows_in_prompt = 0
    for topic_id, topic_summary in zip(summary_df["topics"], summary_df["summary"]):
        candidate = prompt + row_format % (topic_id, topic_summary)
        # Only close the current prompt if it already holds a row; otherwise an
        # oversized row/header would never be consumed.
        if rows_in_prompt and not limiter(candidate):
            yield prompt
            candidate = header + row_format % (topic_id, topic_summary)
            rows_in_prompt = 0
        prompt = candidate
        rows_in_prompt += 1

    if rows_in_prompt:
        yield prompt

In [4]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel
import re
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff
import tiktoken



def create_palm2_filter(prompt=BASE_PROMPT, temperature=0.7, max_new_tokens=1024, max_prompt_tokens=1024,
                        project_id="msca310019-capstone-f945", location="us-central1",
                        model_name="text-bison@001"):
    """Build a topic filter backed by a Vertex AI text generation model.

    Args:
        prompt: Header template (with one ``%s`` for the inquiry) handed to
            ``generate_topic_prompts``.
        temperature: Sampling temperature forwarded to ``model.predict``.
        max_new_tokens: Forwarded to ``model.predict`` as ``max_output_tokens``.
        max_prompt_tokens: Largest prompt size, in tokens counted by the local
            Llama tokenizer, accepted by the limiter.
        project_id: Google Cloud project passed to ``vertexai.init``.
        location: Region passed to ``vertexai.init``.
        model_name: Model identifier passed to
            ``TextGenerationModel.from_pretrained``.

    Returns:
        A function ``filter_topics(topic_df, inquiry)`` that prints the model's
        response for each prompt batch built from ``topic_df``.
    """
    cache_dir = "/home/jupyter/models"
    # NOTE(review): prompt size is measured with the Llama tokenizer, presumably
    # as an approximation of the Vertex model's own token count; confirm.
    tokenizer = LlamaTokenizer.from_pretrained("/home/jupyter/koala_transformer", device_map="auto", cache_dir=cache_dir,  max_input_size=6656)

    vertexai.init(project=project_id, location=location)
    model = TextGenerationModel.from_pretrained(model_name)

    # Retry with randomized exponential backoff (1-60s waits), giving up after 6 attempts.
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def predict_large_language_model_sample(
        temperature: float,
        max_decode_steps: int,
        top_p: float,
        top_k: int,
        content: str,
        ):
        """Predict using a Large Language Model."""
        return model.predict(
            content,
            temperature=temperature,
            max_output_tokens=max_decode_steps,
            top_k=top_k,
            top_p=top_p)

    def token_count_limiter(candidate):
        """Return True while ``candidate`` is within ``max_prompt_tokens`` tokens."""
        tokens = tokenizer(candidate, return_tensors="pt")
        token_count = tokens["input_ids"].shape[1]
        return token_count <= max_prompt_tokens

    def filter_topics(topic_df, inquiry):
        """Query the model once per prompt batch and print each response."""
        for p in generate_topic_prompts(topic_df, inquiry, token_count_limiter, base_prompt=prompt):
            result = predict_large_language_model_sample(temperature=temperature, max_decode_steps=max_new_tokens, top_p=0.8, top_k=40, content=p)
            print(str(result))

    return filter_topics

In [5]:
import openai
import tiktoken


# Read the OpenAI API key from a local file, stripping surrounding whitespace,
# so the key itself never appears in the notebook.
with open("/home/jupyter/apikey", "r") as api_fp:
    api_key = api_fp.read().strip()
    

def create_openai_summarizer(api_key, prompt=BASE_PROMPT, temperature=0.7, max_new_tokens=1024, max_prompt_tokens=3000):
    """Build a topic filter backed by the OpenAI chat completion API.

    Args:
        api_key: OpenAI API key; assigned to the module-level ``openai.api_key``.
        prompt: Header template (with one ``%s`` for the inquiry) handed to
            ``generate_topic_prompts``.
        temperature: Sampling temperature forwarded to the completion call.
        max_new_tokens: Forwarded to the completion call as ``max_tokens``.
        max_prompt_tokens: Largest prompt size, in tokens of the
            ``gpt-3.5-turbo`` tiktoken encoding, accepted by the limiter.

    Returns:
        A function ``filter_topics(topic_df, inquiry)`` that prints the model's
        answer for each prompt batch built from ``topic_df``.
    """
    openai.api_key = api_key
    engine = "gpt-3.5-turbo"
    encoding = tiktoken.encoding_for_model(engine)

    # Retry with randomized exponential backoff (1-60s waits), giving up after 6 attempts.
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def completion_with_backoff(**kwargs):
        """Call the chat completion endpoint, retrying on any exception."""
        return openai.ChatCompletion.create(**kwargs)

    def token_count_limiter(candidate):
        """Return True while ``candidate`` is within ``max_prompt_tokens`` tokens."""
        return len(encoding.encode(candidate)) <= max_prompt_tokens

    def filter_topics(topic_df, inquiry):
        """Send each prompt batch as a single user message and print the reply."""
        for p in generate_topic_prompts(topic_df, inquiry, token_count_limiter, base_prompt=prompt):
            messages = [
                {'role': 'user', 'content': p},
            ]
            result = completion_with_backoff(messages=messages, model=engine, temperature=temperature, max_tokens=max_new_tokens)
            result_text = result['choices'][0]['message']['content']
            print(result_text)

    return filter_topics

In [6]:
# Build the OpenAI-backed topic filter, overriding the default temperature with 0.
filter_llm = create_openai_summarizer(api_key, temperature=0)

In [7]:
# Run the OpenAI filter over the non-empty summaries; one answer is printed per prompt batch.
filter_llm(topic_existing_sum, "Which companies are making progress on AI?")

RELEVANT TOPICS: 0(8/10), 59(6/10)
REASON: Topic 0 discusses companies investing heavily in AI, which is directly related to the inquiry. Topic 59 discusses AI startup funding, which could provide information on which companies are making progress in AI.
RELEVANT TOPICS: 98(4/10), 109(2/10)
REASON: Intel's changes may indicate progress on AI, Novo Nordisk's obesity drug may involve AI technology


In [8]:
import torch


def create_koala_topic_summarizer(prompt=BASE_PROMPT, temperature=0.7, max_new_tokens=512, max_prompt_tokens=1536):
    """Build a topic filter backed by a locally loaded Llama-architecture model.

    The tokenizer and the float16 model are loaded from
    ``/home/jupyter/koala_transformer`` when this factory is called.

    Args:
        prompt: Header template (with one ``%s`` for the inquiry) handed to
            ``generate_topic_prompts``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        max_new_tokens: Maximum number of tokens generated per prompt.
        max_prompt_tokens: Largest prompt size, in tokens and including the
            conversation wrapper, accepted by the limiter.

    Returns:
        A function ``filter_topics(topic_df, inquiry)`` that prints the model's
        generated text for each prompt batch built from ``topic_df``.
    """
    cache_dir = "/home/jupyter/models"
    tokenizer = LlamaTokenizer.from_pretrained("/home/jupyter/koala_transformer", device_map="auto", cache_dir=cache_dir,  max_input_size=2048)
    model = LlamaForCausalLM.from_pretrained("/home/jupyter/koala_transformer", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto", cache_dir=cache_dir)

    def generate_koala_prompt(prompt):
        """Wrap ``prompt`` in the conversation template fed to the model."""
        system = """BEGINNING OF CONVERSATION: """
        template = system + "USER: %s GPT:"
        return template % prompt

    def generate_from_tokens(tokens):
        """Sample a continuation and decode only the newly generated tokens."""
        prompt_length = tokens["input_ids"].shape[1]
        outputs = model.generate(**tokens,
                                 do_sample=True,
                                 top_p=1.0,
                                 temperature=temperature,
                                 max_new_tokens=max_new_tokens)
        # Slice off the echoed prompt so only the model's answer is returned.
        return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    def token_count_limiter(final_prompt):
        """Return True while the wrapped prompt is within ``max_prompt_tokens`` tokens."""
        final_prompt = generate_koala_prompt(final_prompt)
        tokens = tokenizer(final_prompt, return_tensors="pt")
        token_count = tokens["input_ids"].shape[1]
        return token_count <= max_prompt_tokens

    def filter_topics(topic_df, inquiry):
        """Generate and print one answer per prompt batch."""
        for p in generate_topic_prompts(topic_df, inquiry, token_count_limiter, base_prompt=prompt):
            final_prompt = generate_koala_prompt(p)
            # The model is placed with device_map="auto"; send the inputs to the
            # device it reports instead of assuming a CUDA device exists.
            tokens = tokenizer(final_prompt, return_tensors="pt").to(model.device)
            print(generate_from_tokens(tokens))

    return filter_topics

In [9]:
# Build the local-model topic filter, overriding the default temperature with 0.01.
filter_koala = create_koala_topic_summarizer(temperature=0.01)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Run the local-model filter on the same inquiry; one answer is printed per prompt batch.
filter_koala(topic_existing_sum, "Which companies are making progress on AI?")

RELEVANT TOPICS: 1, 5, 10

REASON:

1.   The topic of the first article is the performance of the FTSE 100, which is a stock market index of the 100 largest companies listed on the London Stock Exchange. The index is a good indicator of the overall health of the UK economy, and the articles discuss the performance of the index and the companies listed on it.
2.   The topic of the second article is the performance of the FTSE 100, which is a stock market index of the 100 largest companies listed on the London Stock Exchange. The index is a good indicator of the overall health of the UK economy, and the articles discuss the performance of the index and the companies listed on it.
3.   The topic of the third article is the performance of the FTSE 100, which is a stock market index of the 100 largest companies listed on the London Stock Exchange. The index is a good indicator of the overall health of the UK economy, and the articles discuss the performance of the index and the companies li