# Repository Purposes

Utiliza a API da OpenAI para analisar os repositórios encontrados nas buscas e identificar o propósito de cada um.

In [1]:
import base64
import glob
import json
import os
import re
import time

import pandas as pd
from dotenv import dotenv_values
from ghapi.core import GhApi, HTTP404NotFoundError
from openai import OpenAI

import utils

In [2]:
# Settings start from the process environment; values from the local .env
# file are applied afterwards, so they win when a key exists in both.
config = dict(os.environ)
config.update(dotenv_values(".env"))

# API clients built from the credentials held in `config`.
github = GhApi(token=config['GITHUB_TOKEN'])
openai_client = OpenAI(api_key=config['OPENAI_KEY'])

In [None]:
def get_repo_details(owner, repo):
  """Fetch a repository's metadata through the GitHub client.

  Blocks while the core rate-limit quota is exhausted, then issues the request.

  Args:
    owner: Login of the account that owns the repository.
    repo: Repository name, without the owner prefix.

  Returns:
    The result of `github.repos.get(owner, repo)`.
  """
  while True:
    core_limit = github.rate_limit.get()['resources']['core']
    if core_limit['remaining'] > 0:
      break
    print("get_repo_details waiting for limit")
    # Sleep until the reported reset time instead of re-querying every second;
    # the 1 s floor avoids a busy loop if local and server clocks disagree.
    time.sleep(max(1, core_limit['reset'] - time.time()))
  return github.repos.get(owner, repo)

def get_readme_snippet(owner, repo, max_chars=500):
  """Return the first `max_chars` characters of a repository's README.

  Blocks while the core rate-limit quota is exhausted, then requests the
  README and decodes its base64 `content` field as UTF-8, ignoring bytes
  that cannot be decoded.

  Args:
    owner: Login of the account that owns the repository.
    repo: Repository name, without the owner prefix.
    max_chars: Maximum number of decoded characters to keep.

  Returns:
    The truncated README text, or an empty string when the request raises
    HTTP404NotFoundError.
  """
  while True:
    core_limit = github.rate_limit.get()['resources']['core']
    if core_limit['remaining'] > 0:
      break
    print("get_readme_snippet waiting for limit")
    # Sleep until the reported reset time instead of re-querying every second;
    # the 1 s floor avoids a busy loop if local and server clocks disagree.
    time.sleep(max(1, core_limit['reset'] - time.time()))
  try:
    response = github.repos.get_readme(owner, repo)
  except HTTP404NotFoundError:
    return ""
  readme_text = base64.b64decode(response['content']).decode('utf-8', errors='ignore')
  return readme_text[:max_chars]

def get_repo_insights(repo_details, readme_snippet, prompt):
  """Send a textual summary of one repository to the chat model.

  Args:
    repo_details: Mapping with the repository fields read below (`name`,
      `owner`, `description`, `language`, `stargazers_count`, `license`,
      `fork`, `created_at`).
    readme_snippet: Text appended to the summary as the README excerpt.
    prompt: Content of the system message.

  Returns:
    The message content of the first choice in the completion.
  """
  summary_lines = [
    f"Repo Name: {repo_details['name']}",
    f"Owner: {repo_details['owner']['login']} (Type: {repo_details['owner']['type']})",
    f"Description: {repo_details['description']}",
    f"Primary Language: {repo_details['language']}",
    f"Stars: {repo_details['stargazers_count']}",
    f"License: {repo_details['license']['name'] if repo_details['license'] else 'None'}",
    f"Fork: {repo_details['fork']}",
    f"Created At: {repo_details['created_at']}",
    f"Readme Snippet: {readme_snippet}",
  ]
  # Each field sits on its own line behind a newline and a two-space indent,
  # and the text ends with one more newline + indent.
  separator = "\n  "
  repo_summary = separator + separator.join(summary_lines) + separator

  system_message = {"role": "system", "content": prompt}
  user_message = {"role": "user", "content": repo_summary}

  completion = openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[system_message, user_message],
    temperature=0.0,
  )
  return completion.choices[0].message.content

def add_insights(line, prompt):
  """Build the `purpose` / `category` values for one search-result row.

  Looks up the repository named in `line['repo_name']`, asks the model for
  insights, and parses the JSON reply.

  Args:
    line: Row with a `repo_name` entry in "owner/repo" form.
    prompt: System prompt forwarded to `get_repo_insights`.

  Returns:
    A Series with exactly the entries `purpose` and `category`, in that
    order; a field missing from the model's reply is None.

  Raises:
    json.JSONDecodeError: If the reply is not valid JSON.
  """
  owner, repo = line['repo_name'].split('/')
  print(f"{owner}/{repo}")

  details = get_repo_details(owner, repo)
  readme = get_readme_snippet(owner, repo)

  insights = get_repo_insights(details, readme, prompt)

  # The model may wrap its JSON in a Markdown code fence; unwrap it before parsing.
  fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', insights, re.DOTALL)
  parsed = json.loads(fenced.group(1) if fenced else insights)

  # Fix the field set and order: the caller assigns the result to two columns
  # by position, so reordered or extra keys in the reply must not leak through.
  return pd.Series({field: parsed.get(field) for field in ('purpose', 'category')})

## Carrega as pesquisas já salvas

In [9]:
# Load the previously saved search hits.
df_search = utils.load_results()

# Report the number of rows and the number of distinct repositories among them.
print(f"Encontrados {len(df_search)} arquivos em {df_search['repo_name'].nunique(dropna=False)} repositórios")

Encontrados 3179 arquivos em 2579 repositórios


## Explorando uma amostra dos repositórios

O objetivo é consumir apenas um pouco das APIs e estabelecer algumas categorias de repositório para posterior categorização completa.

In [21]:
# Fixed seed so a fresh kernel run draws the same exploratory rows (and the
# same API calls) every time.
df_explore = df_search.sample(5, random_state=42)

In [27]:

# System prompt for the exploratory pass: asks for a short purpose summary and
# a one-word, open-ended category, returned as a JSON object with the fields
# `purpose` and `category`.
explore_prompt = """Generate a brief summary of a GitHub repository's purpose and assign a one-word category to classify its objective. Focus on achieving consistency by clearly identifying the intended purpose.

# Additional details

Identify the author's objective with their project and classify it using an appropriate open-ended category. Avoid vague categories that do not clearly define the project’s purpose.

# Output Format

Output as a JSON object with the following fields:
- purpose: (brief text summary)
- category: (one-word category that best describes the project) 

# Examples [optional]

If needed: 
- Input: [Repository details or brief description]
- Output: { "purpose": "Brief summary of project", "category": "OpenSource" }

# Notes

Ensure that the chosen category clearly and accurately reflects the project's purpose and intent."""

# Run add_insights on every sampled row and store the two returned values
# as new columns of the sample.
df_explore[['purpose', 'category']] = df_explore.apply(
  add_insights,
  axis=1,
  prompt=explore_prompt,
)

PacktPublishing/The-Kubernetes-Bible-Second-Edition
get_repo_insights asking for insights
get_repo_insights returning insights
passmarked/vnu
get_repo_insights asking for insights
get_repo_insights returning insights
willbossle/LINUXtips-PICK
get_repo_insights asking for insights
get_repo_insights returning insights
corezoid/helm
get_repo_insights asking for insights
get_repo_insights returning insights
kyw613/QnABoard-k8s
get_repo_insights asking for insights
get_repo_insights returning insights


In [28]:
# Display the sampled rows together with the purpose/category columns.
df_explore

Unnamed: 0,repo_name,file_path,url,purpose,category
986,PacktPublishing/The-Kubernetes-Bible-Second-Ed...,Chapter20/hpa/todo-hpa.yaml,https://github.com/PacktPublishing/The-Kuberne...,"A comprehensive guide to Kubernetes, focusing ...",Educational
82,passmarked/vnu,kube.yaml,https://github.com/passmarked/vnu/blob/38e1e9d...,Service running a static build of W3C validato...,Validation
980,willbossle/LINUXtips-PICK,giropops-senhas-chart/templates/hpa.yaml,https://github.com/willbossle/LINUXtips-PICK/b...,A challenge project related to LINUXtips-PICK ...,Educational
495,corezoid/helm,corezoid/charts/conf-agent-server/templates/co...,https://github.com/corezoid/helm/blob/6b79af7c...,Corezoid Helm Charts for managing applications...,DevOps
410,kyw613/QnABoard-k8s,final/front-hpa.yaml,https://github.com/kyw613/QnABoard-k8s/blob/2c...,QnABoard-k8s is a Kubernetes-based question an...,Q&A
