In [1]:
from keybert import KeyBERT

# Load the essay text. The encoding is given explicitly so decoding does not
# depend on the platform's locale default.
with open('data/guidetoinvestors.txt', encoding='utf-8') as f:
    pg_work = f.read()

# Extract candidate keyphrases of one or two words, ignoring English stop words.
keybert_model = KeyBERT()
keyword_scores = keybert_model.extract_keywords(
    pg_work,
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
)

# Each extracted entry is indexable with the phrase at position 0; keep only
# the phrase so downstream cells receive a plain list of strings.
keywords = [entry[0] for entry in keyword_scores]
keywords

  from .autonotebook import tqdm as notebook_tqdm


['investors hackers',
 'investors startups',
 'startup investors',
 'hackers invest',
 'invest hackers']

In [None]:
import os

# Never commit a real key to the notebook: read it from the environment.
# The original placeholder string is kept as the fallback so the name is
# always defined (requests will fail to authenticate until a real key is set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "OPENAI-API-KEY"

In [3]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import SequentialChain
from langchain.memory import SimpleMemory

### SUMMARIZE DOCUMENT
# First stage: split the essay into chunks and summarise them with a
# map-reduce chain whose result is published under the "summary" key.
llm_summarizer = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)

# Only the first MAX_SUMMARY_CHUNKS chunks are summarised (presumably to bound
# API usage -- confirm). Set to None to summarise the whole document.
MAX_SUMMARY_CHUNKS = 3

texts = text_splitter.split_text(pg_work)
docs = [Document(page_content=chunk) for chunk in texts[:MAX_SUMMARY_CHUNKS]]

summarizer_chain = load_summarize_chain(
    llm_summarizer,
    chain_type="map_reduce",
    output_key="summary",
)

### REFORMAT KEYWORDS
# Second stage: an LLM filters and rewrites the extracted keywords, using the
# document summary as context (see the prompt text below).

# Candidate model names; index 0 ("gpt-4") is the one passed to the LLM below.
OPENAI_MODELS = ["gpt-4", "gpt-3.5-turbo"]
# NOTE(review): "gpt-4" is a chat model but is wrapped in the completion-style
# `OpenAI` class here -- confirm the installed LangChain version supports this.
llm_reformatter = OpenAI(temperature=0, model_name=OPENAI_MODELS[0], openai_api_key=OPENAI_API_KEY)

# The template is ONE string literal built with backslash line continuations,
# so the leading indentation of every continued line is part of the prompt
# text sent to the model. {keywords} and {summary} are the two placeholders
# declared in `input_variables`.
PROMPT = PromptTemplate(
    input_variables=["keywords", "summary"],
    template = "Given a list of keywords: {keywords}, \
        filter out the similar or overlapping keywords, \
        reformat the keywords to match the context of the text: {summary} \
        Feel free to add any additional keywords that you think are relevant. \
        Do note that the extracted keywords will be later used for key concept note and summary purposes. \
        Therefore, only include the necessary keywords. \
        Each keyword should be a SINGLE WORD or a VALID PHRASE. \
        DO NOT include keywords containing adjectives or any form of decorative words. \
        TAKE OUT adjectives. \
        Each keyword should be a valid word or a phrase, that fits context. \
        If the keyword contains more than a single word, say: \
        ''' \
        wordA wordB \
        ''' \
        other keywords sharing similar combinations, say: \
        ''' \
        wordB word A \
        ''' \
        should be filtered out. \
        Return the keywords in a list. \
        Example: \
        ''' \
        ['keyword1', 'keyword2', 'keyword3' ... and so on] \
        ''' \
        "
)

# Bind the prompt to the reformatting LLM as a single chain step.
reformatter_chain = LLMChain(llm=llm_reformatter, prompt=PROMPT)

### MAIN CHAIN
# Run the summarizer and then the reformatter in sequence. The keyword list is
# supplied through read-only chain memory, while the documents to summarise
# are the only explicit input.
keyword_memory = SimpleMemory(memories={"keywords": keywords})
chain_inputs = {"input_documents": docs}

main_chain = SequentialChain(
    chains=[summarizer_chain, reformatter_chain],
    input_variables=["input_documents"],
    memory=keyword_memory,
    verbose=True,
)

response = main_chain.run(chain_inputs)
print(response)





[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
['angel investors', 'startup hub', 'Silicon Valley', 'venture capitalists', 'Google', 'momentum investors', 'stock']


In [4]:
import ast
import re


def _parse_keyword_list(raw_text):
    """Extract and evaluate the list literal contained in an LLM reply.

    The model is asked to answer with a Python-style list, but it may wrap the
    list in extra prose or whitespace. The span from the first ``[`` to the
    last ``]`` is isolated before being handed to ``ast.literal_eval``, which
    only evaluates literals and never executes code.

    Args:
        raw_text: The raw string returned by the chain.

    Returns:
        The list encoded in ``raw_text``.

    Raises:
        ValueError: If no bracketed list is present, or the bracketed text is
            not a valid literal.
        SyntaxError: If the bracketed text is not valid Python syntax.
    """
    match = re.search(r"\[.*\]", raw_text, flags=re.DOTALL)
    if match is None:
        raise ValueError(f"No list literal found in model response: {raw_text!r}")
    return ast.literal_eval(match.group(0))


# NOTE: this rebinds `keywords` (originally the KeyBERT output) to the refined
# list, so re-running the chain cell afterwards feeds the refined keywords in.
keywords = _parse_keyword_list(response)
print(keywords)

['angel investors', 'startup hub', 'Silicon Valley', 'venture capitalists', 'Google', 'momentum investors', 'stock']
