In [3]:
fetchapi_url = "https://api.github.com/repos/{owner}/{repo}/contents/{path}".format(**{
  "owner": "mikepsinn",
  "repo": "FDAi",
  "path":""
})
fetchapi_url

'https://api.github.com/repos/mikepsinn/FDAi/contents/'

In [20]:
import os, requests, re

from dotenv import load_dotenv  
load_dotenv('../.env', override=True)
print(os.environ['OPENAI_API_KEY'][-4:])

pzUy


In [17]:
from fastapi import APIRouter, HTTPException
from llama_index.core import VectorStoreIndex, SummaryIndex

from llama_index.readers.github import GithubRepositoryReader
from llama_index.readers.web import SimpleWebPageReader

class web_agent:
  def __init__(self, url):
    self.initialized = False
    github_urls = self.fetch_github_repo_contents_urls(url)
    print(f"Extracted {len(github_urls)} in total from repo")
    github_urls = self.filter_urls(
      github_urls, startswith="README")
    print(f"Filtered to {len(github_urls)} urls")
    self.document_store = self._init_document_store(github_urls)
    self.query_engine = self._init_query_engine(self.document_store)
    
  def _init_document_store(self, urls:list):
    documents = SimpleWebPageReader(html_to_text=True).load_data(
      urls
    )
    return documents
  
  def _init_query_engine(self, document_store):
    index = SummaryIndex.from_documents(document_store)
    query_engine = index.as_query_engine(streaming=True)
    return query_engine
  
  def query(self, query):
    return self.query_engine.query(query)
  
  def filter_urls(
    self, urls, regex:str=None, limit:int=None):
    if regex: urls=[
      l for l in urls if l.lower.startswith(startswith)]
    if limit: urls=urls[:min(len(urls), limit)]
    return urls
    
  def fetch_github_repo_contents_urls(
    self, api_url,path=""):
    headers = {
        "Authorization": f"token {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json"
    }
    response = requests.get(
        api_url+path, headers=headers
    )
    urls = []
        # Check if the request was successful
    if response.status_code == 200:
        contents = response.json()
        # Ensure contents is a list before iterating
        if not isinstance(contents, list):
            return urls

        for n, content in enumerate(contents):
            # print(f"Filename:{content.get('name','')}")
            # If it's a directory, recurse into it
            if content.get('type') == 'dir':
                urls.extend(self.fetch_github_repo_contents_urls(api_url+path, content.get('path')))
            else:
                # Append the file URL to the list
                urls.append(content.get('html_url', 'No URL found'))
        print(f"Extracted: {n} urls here")
    # else:
    #     # Print out the error message from GitHub API
    #     print(f"Error fetching contents: {response.status_code} - {response.json().get('message')}")
    return urls

### Test

In [19]:
query_agent = web_agent(fetchapi_url)

Extracted: 1 urls here
Extracted: 0 urls here
Extracted: 3 urls here
Extracted: 1 urls here
Extracted: 2 urls here
Extracted: 3 urls here
Extracted: 0 urls here
Extracted: 8 urls here
Extracted: 20 urls here
Extracted: 10 urls here
Extracted: 0 urls here
Extracted: 7 urls here
Extracted: 1 urls here
Extracted: 45 urls here
Extracted 74 in total from repo
Filtered to 0 urls


In [None]:
query_agent.query("What is the FDAi repo about?")

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-obMKcYHtIlZvQNYmsdbMaMyO on tokens per min (TPM): Limit 60000, Used 59536, Requested 2822. Please try again in 2.358s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

## Experiment Approaches

In [None]:
# Require the following
print(type(query_agent.document_store))


<class 'list'>


In [None]:
query_agent.query_engine.query('What is FDAi project about?')

In [8]:
from langchain.chat_models import ChatOpenAI

from langchain.agents import Tool
from langchain.agents import initialize_agent
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

# Define tools

tools = [
    Tool(
       name = "LlamaIndex",
        func=lambda q: str(query_agent.query_engine.query(q)),
        description="context specific query engine",
        return_direct=True
    ),
]
#Initialize conversational memory
conversational_memory = ConversationBufferWindowMemory( memory_key='chat_history', k=5, return_messages=True )
# Initialize agent with conversational memory
agent_executor = initialize_agent(
    tools, llm=ChatOpenAI(
        temperature=0.7, model_name='gpt-4'), agent="conversational-react-description", memory=conversational_memory)

# if I use my index
prompt = input("type prompt")
# What is the FDAi GitHub repoisitory about?
if prompt == 'thanks':
    print(f' ---------- Agent: chat is closed -------')
else:
    response = agent_executor.run(input=prompt)
    print(f'me : {prompt}')
    print(f'Agent : {response}')

  warn_deprecated(


me : What is the FDAi GitHub repoisitory about?
Agent : The FDA, or Food and Drug Administration, does not have an officially recognized GitHub repository as of my training data up to September 2021. If you're referring to a specific repository by a different name or acronym, could you please provide more details? In general, GitHub repositories are used to host and share code for various projects, so an "FDAi" repository might potentially be related to projects or initiatives associated with the FDA. However, without more specific details, it's hard to provide an accurate description.
