In [1]:
import sys
import pandas as pd


sys.path.insert(0, '..')
summarizer_type = "openai"


topic_df = pd.read_parquet("gs://scraped-news-article-data-null/2023-topics-%s.parquet" % summarizer_type)
topic_df.head()

Unnamed: 0,topics,summary
0,0,The news articles revolve around the advanceme...
1,1,The news articles cover the fluctuations in oi...
2,2,The news articles revolve around the security ...
3,3,The Federal Reserve's interest rate decisions ...
4,4,The news articles cover various aspects of the...


In [2]:
topic_existing_sum = topic_df.loc[(topic_df.summary.str.len() > 0) & (topic_df.summary.str.lower().str.strip() != "no theme")]
topic_existing_sum = topic_existing_sum.loc[topic_existing_sum.topics <= 300]
topic_existing_sum.summary.head()

0    The news articles revolve around the advanceme...
1    The news articles cover the fluctuations in oi...
2    The news articles revolve around the security ...
3    The Federal Reserve's interest rate decisions ...
4    The news articles cover various aspects of the...
Name: summary, dtype: object

In [3]:
from summarizer.topic_sum import create_topic_filter

In [4]:
with open("/home/jupyter/apikey", "r") as api_fp:
    api_key = api_fp.read().strip()


filter_llm = create_topic_filter(api_key=api_key, temperature=0)

In [5]:
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper
from llama_index.storage.storage_context import StorageContext
from llama_index.llm_predictor import HuggingFaceLLMPredictor
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from IPython.display import Markdown, display
from llama_index import ServiceContext, LangchainEmbedding
import os
import pandas as pd
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from llama_index.prompts.prompts import SimpleInputPrompt
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any


system_prompt = """BEGINNING OF CONVERSATION: """ 
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("USER: {query_str} GPT:")
query_wrapper_prompt.format(query_str="test")
tokenizer = LlamaTokenizer.from_pretrained("/home/jupyter/koala_transformer", device_map="auto", max_input_size=2048)
model = LlamaForCausalLM.from_pretrained("/home/jupyter/koala_transformer", torch_dtype=torch.float16, device_map="auto", cache_dir="/home/jupyter/data/transformers")


class CustomLLM(LLM):

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        prompt = query_wrapper_prompt.format(query_str=prompt)
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=1024, 
                             do_sample=False, 
                             temperature=0)
        result = tokenizer.batch_decode(outputs, skip_special_tokens=True, spaces_between_special_tokens=False)[0]
        # only return newly generated tokens
        return result[len(prompt):]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": "Koala"}

    @property
    def _llm_type(self) -> str:
        return "llama"

    
# define our LLM
num_output = 512
max_chunk_overlap = 20
llm_predictor = LLMPredictor(llm=CustomLLM())


prompt_helper = PromptHelper(max_input_size=2048, num_output=num_output, max_chunk_overlap=max_chunk_overlap)
hf_predictor = LLMPredictor(llm=CustomLLM())
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))
service_context = ServiceContext.from_defaults(chunk_size_limit=512, llm_predictor=hf_predictor, embed_model=embed_model, prompt_helper=prompt_helper)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from llama_index import load_index_from_storage, load_indices_from_storage, load_graph_from_storage
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores import SimpleVectorStore
from llama_index.storage.index_store import SimpleIndexStore
from pathlib import Path


persist_dir_base = "/home/jupyter/topic_indices"


def create_llama_index_lazyloader(directory=persist_dir_base):
    index_dict = {}
    
    def get_llama_index(topic_number):
        if topic_number not in index_dict:
            topic_index_dir = Path(directory, str(topic_number))
            storage_context = StorageContext.from_defaults(
                docstore=SimpleDocumentStore.from_persist_dir(persist_dir=topic_index_dir),
                vector_store=SimpleVectorStore.from_persist_dir(persist_dir=topic_index_dir),
                index_store=SimpleIndexStore.from_persist_dir(persist_dir=topic_index_dir),
            )
            index = load_index_from_storage(storage_context=storage_context, service_context=service_context)
            index_dict[topic_number] = index
        return index_dict[topic_number]
            
    return get_llama_index
        
lazy_loader = create_llama_index_lazyloader()

In [10]:
from llama_index import QuestionAnswerPrompt


inquiry = "Which companies are making progress on artifical intelligence?"


PROMPT = "Several extracts from news articles are provided below. " + \
"Using only information from the news extracts, write a well written report with a headline that can answer the given inquiry. " + \
'If the extracts do not contain sufficient information to answer the inquiry, write "INSUFFICIENT INFORMATION".\n\n' + \
"Inquiry: {query_str}\n\nNews Article Extracts:\n{context_str}\n"
QA_PROMPT = QuestionAnswerPrompt(PROMPT)



for r in filter_llm(topic_existing_sum, "Which companies are making progress on artifical intelligence?"):
    if r.rating < 0.5:
        continue
    print("Identified relevant topic #: " + r.topic_number)
    index = lazy_loader(r.topic_number)
    query_engine = index.as_query_engine(text_qa_template=QA_PROMPT)
    print(query_engine.query(inquiry))

Identified relevant topic #: 0
 The recent launch of Google's Bard and Microsoft's Bing chat have brought more tech giants into the generative AI space, with half of the companies surveyed by ResumeBuilder using OpenAI's ChatGPT. The post-pandemic job market has also led to the adoption of AI products, with companies like OpenAI and Stability AI participating in a public evaluation of their AI systems. However, the United States regulators have fallen short of European governments in crafting strong rules on deepfakes and misinformation. The President Joe Biden administration has also released an AI Bill of Rights and a risk management plan for AI use.
Identified relevant topic #: 59
 The report on the progress of artificial intelligence companies is a mix of positive and negative news. On the positive side, several companies have raised significant funding and are making progress in developing enterprise applications. For example, Adept, a startup that focuses on training a neural net