In [5]:
import dotenv
import os
# Load environment variables via python-dotenv (presumably from a local .env
# file — confirm where the key is actually stored).
dotenv.load_dotenv()
# Read the OpenAI key from the process environment; os.getenv yields None when
# the variable is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")

In [17]:
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
import pprint
from langchain_text_splitters import RecursiveCharacterTextSplitter
import asyncio


# Load HTML
async def load_html(urls=("https://en.wikipedia.org/wiki/OpenAI",), tags_to_extract=("span",)):
    """Download web pages and keep only the text of the selected HTML tags.

    Args:
        urls: A single URL string or an iterable of URL strings to fetch.
            Defaults to the OpenAI Wikipedia article, so calling
            ``load_html()`` with no arguments behaves as before.
        tags_to_extract: Names of the HTML tags whose text is kept by
            ``BeautifulSoupTransformer``. Defaults to ``("span",)``.

    Returns:
        Whatever ``BeautifulSoupTransformer.transform_documents`` returns for
        the fetched pages (iterated elsewhere in this notebook as objects with
        a ``page_content`` attribute).
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(urls, str):
        urls = [urls]
    loader = AsyncHtmlLoader(list(urls))
    # NOTE(review): load() is called without await, so this coroutine does not
    # yield to the event loop while the pages are being fetched.
    html = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    return bs_transformer.transform_documents(html, tags_to_extract=list(tags_to_extract))
docs_transformed = await load_html()
doc_string = ""
for doc in docs_transformed:
    doc_string += doc.page_content


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.40it/s]


In [69]:
import spacy
def overlapping_chunking(doc_string, max_sentences, overlapping_sentences):
    """Split text into chunks of consecutive sentences that overlap each other.

    Sentences are detected with spaCy's ``en_core_web_sm`` pipeline. Every
    chunk joins up to ``max_sentences`` sentences with single spaces, and each
    chunk starts ``max_sentences - overlapping_sentences`` sentences after the
    previous one, so neighbouring chunks share ``overlapping_sentences``
    sentences.

    Args:
        doc_string: The text to segment.
        max_sentences: Maximum number of sentences per chunk; must be >= 1.
        overlapping_sentences: Number of sentences repeated at the start of
            the next chunk; must satisfy
            ``0 <= overlapping_sentences < max_sentences``.

    Returns:
        A list of chunk strings, in document order. Empty when no sentences
        are found.

    Raises:
        ValueError: If the arguments would make the window stand still or move
            backwards (previously an endless loop), or would skip sentences.
    """
    # Validate before loading the spaCy model so bad arguments fail fast.
    if max_sentences < 1:
        raise ValueError("max_sentences must be at least 1")
    if not 0 <= overlapping_sentences < max_sentences:
        raise ValueError(
            "overlapping_sentences must be >= 0 and smaller than max_sentences"
        )

    step = max_sentences - overlapping_sentences
    nlp = spacy.load("en_core_web_sm")
    sentences = [sent.text for sent in nlp(doc_string).sents]
    # Slicing clamps at the end of the list, so the final chunks may be shorter.
    return [
        ' '.join(sentences[start:start + max_sentences])
        for start in range(0, len(sentences), step)
    ]
# Chunk the page text into windows of up to 8 sentences, where each window
# starts 6 sentences after the previous one (2 sentences of overlap).
chunks = overlapping_chunking(doc_string, 8, 2)
# Display the number of chunks produced.
len(chunks)

211

In [70]:
# Character length of the longest chunk; falls back to -inf when `chunks` is
# empty, matching a running maximum seeded with -inf.
max_chunk_size = max((len(text) for text in chunks), default=float('-inf'))
max_chunk_size

2513

In [47]:
import os
# Set the PyTorch MPS-fallback flag for this process.
# NOTE(review): presumably this has to run before the cell that imports torch
# for the setting to be picked up — confirm.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# Echo the value back to confirm it was set.
print(os.environ["PYTORCH_ENABLE_MPS_FALLBACK"])

1


In [84]:
import torch
from transformers import pipeline
# Inference is pinned to the CPU. The earlier MPS auto-detection assignment was
# dead code (it was unconditionally overwritten on the next line) and has been
# removed so the cell states what actually runs.
device = "cpu"
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

# Number of tokens per slice fed to the summarizer.
# NOTE: this rebinds `max_chunk_size` (previously the longest sentence-chunk
# length in characters) to a token count.
max_chunk_size = 1000

# Tokenize the whole document once, without truncation, then cut the token ids
# into fixed-size slices.
inputs = summarizer.tokenizer(doc_string, return_tensors="pt", truncation=False)
tokens = inputs.input_ids[0]
# NOTE: this rebinds `chunks` from the sentence-based text chunks to slices of
# token ids.
chunks = [tokens[i:i + max_chunk_size] for i in range(0, len(tokens), max_chunk_size)]

# Summarize each slice independently: decode the ids back to text (dropping
# special tokens) and let the pipeline re-tokenize with truncation enabled.
summaries = []
for chunk in chunks:
    chunk_text = summarizer.tokenizer.decode(chunk, skip_special_tokens=True)
    summary = summarizer(chunk_text, truncation=True, max_length=150)[0]['summary_text']
    summaries.append(summary)

# Stitch the per-slice summaries into one string and display it.
final_summary = " ".join(summaries)
final_summary


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


1024
2310
1024
2090
1024
2732
1024
2610
1024
2577
1024
2742
1024
2756
1024
2804
1024
2678
1024
2658
1024
2740
1024
2642
1024
2540
1024
2486
1024
2695
1024
2649
1024
2614
1024
2781
1024
2715
1024
2846
1024
2599
1024
2621
1024
2488
1024
3186
1024
2656
1024
2808
1024
2705
1024
2406
1024
2606
1024
2727
1024
2642
1024
2789
1024
2595
1024
2495
1024
2724
1024
2583
1024
2660
1024
2675
1024
2625
1024
2677
1024
2790
1024
2606
982
2582


'OpenAI\'s original GPT model ("GPT-1") 5.3.2 GPT-2 53.3 G PT-3 5.4 Codex 5.5 GPT4 5.6 o1 5.7 Whisper 5.8 Music generation 5.9.1 MuseNet 5.1.2 Jukebox 5.2.3 ChatGPT 5.10 Stargate and other supercomputers. Firing of Altman 6.2 Content moderation contract with Sama 6.3 Lack of technological transparency 6.5 Copyright infringement in training data 6.6 GDPR compliance 6.7 Removal of military and warfare clause 6.8 Use in state-backed influence operations 6.9 Data This page includes the following links. Use this page to help people with reading comprehension and vocabulary. Use the weekly Newsquiz to test your knowledge of stories you saw on this page. The Daily Discussion offers the chance to share your feedback on articles. At the bottom of the page, please share your thoughts on the articles you saw. The term "pre-training" refers to general language training as distinct from fine-tuning for specific tasks. One petaflop/s-day is approximately equal to 10 20 neural net operations. OpenAI\

In [87]:
# Spot-check the middle of the combined summary: whitespace-separated words
# 800 through 899, re-joined with single spaces.
print(' '.join(final_summary.split()[800:900]))

hype" (https://www.theverge.com/2017/8/14/16141938/dota-2-openai-bots-elon-musk-artificial-intelligence) from the original on June 26, 2018. "#cite_ref-129) Savov, Vlad (August 14, 2017). "My favoritegame has been invasion by killerAI bots and Musk hype." (https:www. theverger.co.uk/news/articles/2018-06-25/musK-backed-bot-conquers-e-gamer-teams-in-ai-break "The International 2018: Results" (https://blog.openai.com/the-international-2018-results/). blog.openAI.com. July 18, 2018. Archived on February 13, 2019. (#cite_ref-137) Vincent, James (June 25, 2018). "AI bots trained for 180 years a day to beat humans at Dota 2" "Pro Gamers Fend off Elon Musk-Backed AI Bots—for Now" OpenAI's Dota 2 bot defeated 99.4% of players in public matches, according to Venture Beat. #cite_ref-145: Fangasadha, Edbert Felix, Soeroredjo, Steffi; Anderies; Gunawan, Alexander Agung Santoso (September 17, 2022).


In [53]:
# Inspect the first chunk.
# NOTE(review): `chunks` is assigned twice in this notebook — sentence-based
# text chunks, then token-id slices in the summarization cell — so what this
# shows depends on which of those cells ran last.
chunks[0]

'Main menu Main page Contents Current events Random article About Wikipedia Contact us Donate Help Learn to edit Community portal Recent changes Upload file     Search Appearance Create account Log in Personal tools Create account Log in learn more Contributions Talk 1 History Toggle History subsection 1.1 2015–2018: Non-profit beginnings 1.2 2019: Transition from non-profit 1.3 2020–2023: ChatGPT, DALL-E, partnership with Microsoft 1.4 2024–present: Public/non-profit efforts, Sora, partnership with Apple 2 Participants Toggle Participants subsection 2.1 Key employees 2.2 Board of directors of the OpenAI nonprofit 2.3 Principal individual investors [ 99 ] 2.4 Corporate investors 3 Motives 4 Strategy 5 Products and applications Toggle Products and applications subsection 5.1 Reinforcement learning 5.1.1 Gym 5.1.1.1 Gym Retro 5.1.2 RoboSumo 5.1.3 OpenAI Five 5.1.4 Dactyl 5.2 API 5.3 Text generation 5.3.1 OpenAI\'s original GPT model ("GPT-1") 5.3.2 GPT-2 5.3.3 GPT-3 5.3.4 Codex 5.3.5 GPT

In [105]:
import openai
# Build the user prompt: the combined summary is embedded inline, followed by
# the instruction to produce 10-15 question/answer pairs.
prompt = f"""
Based on the following content: {final_summary} generate 10-15 questions 
that will help readers understand the content better then provide
informative answers to these questions.
Provide these answers in the format Question: Answer
"""


# Send a single chat-completion request with a system role message plus the
# prompt above.
# NOTE(review): no API key is passed explicitly and `openai_api_key` is not
# used here; presumably the openai package reads OPENAI_API_KEY from the
# environment — confirm.
response = openai.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
      {"role": "system", "content": "You are a helpful assistant who generates FAQs from website content."},
      {"role": "user", "content": prompt},

  ]
)
# The reply text is available as response.choices[0].message.content.

In [106]:
import re
# Pull the generated text out of the first completion choice.
answer = response.choices[0].message.content
# Break the reply into lines, treating any run of consecutive newlines as a
# single separator.
newline_runs = re.compile(r"\n+")
faq_list = newline_runs.split(answer)
faq_list

['Question: What are some of the key models developed by OpenAI?',
 'Answer: OpenAI has developed models such as GPT-1, GPT-2, GPT-3, Codex, Whisper, Music generation models like MuseNet and Jukebox, as well as ChatGPT and Stargate supercomputers.',
 "Question: What is OpenAI's revenue situation and recent funding developments?",
 "Answer: OpenAI's losses doubled to around $540 million in the previous year, and the company has raised $124 million in funding so far. It is expected to reach $1 billion in revenue by 2024 and is in talks to secure an additional $100 million in new funding.",
 'Question: What are some notable collaborations and partnerships involving OpenAI?',
 'Answer: OpenAI has partnerships with Microsoft, including significant investments in the company. It has also announced partnerships with universities and has collaborated with Reddit, Vox, and The Atlantic. ',
 'Question: What are some controversies surrounding OpenAI?',
 'Answer: OpenAI has faced controversies suc

In [104]:
# Number of lines in the split reply. The reply is split on newlines only, so
# each line of the model output counts as its own entry.
len(faq_list)

28