# Index GitHub repos using Trufflepig

To standardize the input data for each repository, we generate a description with Claude 3 Haiku, giving it the repository's own description and its README (if one is available).

## 1. Import trufflepig and other dependencies

In [None]:
!pip install trufflepig-py
!pip install anthropic
!pip install tiktoken
!pip install langdetect

In [None]:
from trufflepig import Trufflepig
import requests
import base64
import time
import anthropic
from langdetect import detect
import tiktoken

## 2. Get API Keys for the necessary services

In [None]:
# Placeholder credentials: replace each value with your own key before running.
# Avoid committing real keys to version control.
TRUFFLEPIG_API_KEY = 'your-api-key'  # passed to Trufflepig(api_key=...)
GITHUB_TOKEN = 'your-api-key'  # sent as the Authorization token on GitHub API requests
ANTHROPIC_KEY = 'your-api-key'  # passed to anthropic.Anthropic(api_key=...)

## 3. Generate better descriptions for each repo
We also use Haiku to filter out repositories that are just collections of links, as well as deprecated projects, since both may make search results worse.

In [None]:
def fetch_readme(repo, headers):
    """Return the first English README found for a repository, truncated to 750 tokens.

    Tries a fixed list of common README file names against the repository's
    contents URL and returns the first one that is either explicitly an
    English variant (``*_en.*``, any letter case) or detected as English.

    Args:
        repo: Repository dict from the GitHub API; ``contents_url`` and
            ``name`` are read from it.
        headers: HTTP headers sent with every contents request.

    Returns:
        The README text cut to at most 750 ``cl100k_base`` tokens, or ``""``
        when no usable English README is found.
    """
    # Imported locally because the file-level import only brings in ``detect``.
    from langdetect.lang_detect_exception import LangDetectException

    readme_files = ["README.md", "readme.md", "README", "readme", "README.rst", "readme.rst", "README_en.md", "readme_en.md", "README_EN.md"]
    encoding = tiktoken.get_encoding("cl100k_base")

    for readme_file in readme_files:
        readme_url = repo['contents_url'].replace('{+path}', readme_file)
        readme_response = requests.get(readme_url, headers=headers)
        if readme_response.status_code != 200:
            continue

        response_json = readme_response.json()
        if not (isinstance(response_json, dict) and 'content' in response_json):
            print(f"Unexpected response format for {readme_file} in repository {repo['name']}: {response_json}")
            continue

        readme_data = response_json['content']
        if not readme_data:
            continue

        # Replace undecodable bytes instead of raising: one README that is not
        # valid UTF-8 should not abort the whole crawl.
        readme_text = base64.b64decode(readme_data).decode('utf-8', errors='replace')

        # An explicit English file name wins without running detection. The
        # comparison is case-insensitive so "README_EN.md" qualifies as well.
        is_english = '_en' in readme_file.lower()
        if not is_english:
            try:
                is_english = detect(readme_text) == 'en'
            except LangDetectException as e:
                # Raised when the text has nothing to detect a language from.
                print(f"Language detection failed for {readme_file} in repository {repo['name']}: {str(e)}")
                continue
        if not is_english:
            continue

        try:
            readme_tokens = encoding.encode(readme_text, disallowed_special=())
        except ValueError as e:
            # tiktoken reports disallowed special tokens as ValueError.
            print(f"Encoding error for {readme_file} in repository {repo['name']}: {str(e)}")
            continue
        return encoding.decode(readme_tokens[:750])
    return ""

def generate_description(repo, readme_text, client):
    """Ask Claude 3 Haiku for a short description of a repository.

    Args:
        repo: Repository dict; ``name`` is required, ``description`` may be
            missing or ``None``.
        readme_text: README text to include in the prompt.
        client: Object exposing ``messages.create(...)``, such as an
            ``anthropic.Anthropic`` client.

    Returns:
        The text of the first content item in the model response. The prompt
        asks the model to answer "Not Substantial" for repositories that are
        mainly link collections or lack a clear purpose.
    """
    # Repositories without a description carry None; send an empty string so
    # the literal text "None" does not end up in the prompt.
    repo_description = repo.get('description') or ''

    prompt = f'''
    Using the provided context, provide a concise description (no more than 3 sentences) of the repository. If the repository primarily consists of a collection of external resources and links or lacks a clear and original purpose, return "Not Substantial".

    Here are two examples:

    Example for a substantial repository:

    <example_input>
    Repository Name: DeepLearningModels
    Repository Description: A comprehensive collection of pre-trained models designed for natural language processing and computer vision.
    Readme: Includes installation instructions, usage examples, and links to research papers.
    </example_input>

    <example_output>
    The 'DeepLearningModels' repository contains wide array of advanced, ready-to-use AI models. It supports multiple languages and frameworks, and provides thorough documentation, making it accessible for both beginners and experts.
    </example_output>

    Example for a non-substantial repository:

    <example_input>
    Repository Name: UsefulLinks
    Repository Description: A curated list of links to tutorials, datasets, and tools for machine learning.
    Readme: Mainly contains hyperlinks to external websites.
    </example_input>

    <example_output>
    Not Substantial
    </example_output>

    <repository_name>{repo['name']}</repository_name>
    <repository_description>{repo_description}</repository_description>
    <readme>{readme_text}</readme>
    '''

    response = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    generated_text = response.content[0].text
    print(generated_text)
    return generated_text

def fetch_repositories(min_stars, client, page):
    """Fetch one page of GitHub search results and describe each repository.

    Searches for repositories with more than ``min_stars`` stars (sorted by
    stars, ascending, 100 per page), fetches each repository's README, and
    asks the model for a description. Repositories without a usable README,
    or whose description contains "Not Substantial", are skipped.

    Args:
        min_stars: Exclusive lower bound on the star count.
        client: Client passed through to ``generate_description``.
        page: Page number of the search results to fetch.

    Returns:
        A list of ``(html_url, generated_description, stargazers_count,
        language, original_description)`` tuples. Returns ``[]`` if the
        search request itself fails.
    """
    url = 'https://api.github.com/search/repositories'
    headers = {'Authorization': f'token {GITHUB_TOKEN}'}
    params = {
        'q': f'stars:>{min_stars}',
        'sort': 'stars',
        'order': 'asc',
        'per_page': 100,
        'page': page
    }

    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        repositories = response.json()['items']
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return []

    results = []
    for repo in repositories:
        # Handle README failures per repository so one bad request does not
        # throw away the results already collected for this page.
        try:
            readme_text = fetch_readme(repo, headers)
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            continue
        if not readme_text:
            continue

        description = generate_description(repo, readme_text, client)
        if "Not Substantial" not in description:  # Skip adding if marked not substantial
            results.append((repo['html_url'], description, repo['stargazers_count'], repo['language'], repo['description']))
    return results

Each query to GitHub's search API only provides 10 pages of results, so we had to run the next step several times (the cell below shows one run, with a threshold of 4000 stars). In total, we generated descriptions for over 8000 repositories with a minimum of 1000 stars.

In [None]:
# Build the Anthropic client once and reuse it for every page.
anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_KEY)

# Walk pages 1 through 10 of the search results, accumulating the kept repositories.
results = []
for page_number in range(1, 11):
    results += fetch_repositories(4000, anthropic_client, page_number)

## 4. Index repo descriptions as documents in Trufflepig

In [None]:
# Connect to Trufflepig and look up the index, creating it if the lookup returns nothing.
client = Trufflepig(api_key=TRUFFLEPIG_API_KEY)

index = client.get_index('github-search-engine')
if not index:
    index = client.create_index('github-search-engine')

# Upload in batches of 10 and wait for each batch to leave IN_PROGRESS before sending the next.
for idx in range(0, len(results), 10):
    batch = results[idx:idx + 10]
    uploads = [
        {
            'document_key': url,
            'document': summary,
            'metadata': {'stars': stars, 'language': language, 'description': description},
        }
        for url, summary, stars, language, description in batch
    ]
    tracking_ids = index.upload(uploads)
    upload_status = index.get_upload_status(tracking_ids)

    while any(status.job_status == 'IN_PROGRESS' for status in upload_status):
        # Show per-document progress, then poll again after a short pause.
        print([f'{status.document_key}: {status.job_status}' for status in upload_status])
        time.sleep(5)
        upload_status = index.get_upload_status(tracking_ids)