In [68]:
# Change the working directory to the repository folder; the relative 'data/...' paths
# used by later cells are resolved against the current working directory.
%cd '../ds-ml-challenge-szafranskifilip'

/Users/filipszafranski/Desktop/repositories/ds-ml-challenge-szafranskifilip


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [69]:
import pandas as pd
from datetime import datetime
import os

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

### Load Data

In [70]:
# Read the GitHub repository metadata CSV into a DataFrame and display it
github_csv_path = 'data/github_repos_data.csv'
df_github = pd.read_csv(github_csv_path)
df_github

Unnamed: 0,name,full_name,topics,description,created_at,updated_at,stargazers_count,forks_count,license
0,langchain,langchain-ai/langchain,no topic,🦜🔗 Build context-aware reasoning applications,2022-10-17,2024-04-07,81425,12434,MIT License
1,ChatGPT-Next-Web,ChatGPTNextWeb/ChatGPT-Next-Web,"chatgpt, cross-platform, desktop, fe, gemini, ...",A cross-platform ChatGPT/Gemini UI (Web / PWA ...,2023-03-10,2024-04-07,66548,54259,MIT License
2,awesome-machine-learning,josephmisiti/awesome-machine-learning,no topic,A curated list of awesome Machine Learning fra...,2014-07-15,2024-04-07,63304,14447,Other
3,awesome-cpp,fffaraz/awesome-cpp,"awesome, awesome-list, c, c-plus-plus, cpp, cp...",A curated list of awesome C++ (or C) framework...,2014-07-17,2024-04-07,55101,7613,MIT License
4,llama.cpp,ggerganov/llama.cpp,"ggml, llama",LLM inference in C/C++,2023-03-10,2024-04-07,54581,7703,MIT License
...,...,...,...,...,...,...,...,...,...
512,awesome-transformer-nlp,cedrickchee/awesome-transformer-nlp,"attention-mechanism, awesome, awesome-list, be...",A curated list of NLP resources focused on Tra...,2019-01-13,2024-04-01,1022,122,MIT License
513,awesome_LLMs_interview_notes,jackaduma/awesome_LLMs_interview_notes,"ai, interview-practice, interview-preparation,...",LLMs interview notes and answers:该仓库主要记录大模型（LL...,2023-09-21,2024-04-07,1019,161,MIT License
514,chameleon-llm,lupantech/chameleon-llm,"ai, chatgpt, gpt-4, llm, openai, python, tool","Codes for ""Chameleon: Plug-and-Play Compositio...",2023-04-19,2024-04-02,1009,84,Apache License 2.0
515,bramses-highly-opinionated-vault-2023,bramses/bramses-highly-opinionated-vault-2023,obsidian-md,"A highly opinionated, fully featured Obsidian ...",2022-12-08,2024-04-03,1009,49,MIT License


### Feature Engineering

1. Create a column 'days_since_created' which states the age of a repository in days

In [71]:
# Convert the 'created_at' column to datetime so that date arithmetic works on it
created_at_parsed = pd.to_datetime(df_github['created_at'])
df_github['created_at'] = created_at_parsed

# Age of each repository in whole days, measured up to the moment this cell runs
# (the values therefore change from one run to the next).
# Note: datetime.now() and datetime.today() both return the current local date *and* time;
# neither resets the time to midnight. `.dt.days` keeps only the whole-day part of each timedelta.
repo_age = datetime.now() - created_at_parsed
df_github['days_since_created'] = repo_age.dt.days

2. Create the columns 'avg_forks_per_day' and 'avg_stars_per_day', which state the ratios of forks_count and stargazers_count to days_since_created

In [72]:
# Average forks / stars gained per day of repository age, rounded to whole numbers.
# NOTE(review): a repository with days_since_created == 0 would make these ratios inf/NaN —
# confirm whether that can occur in the data.
days_alive = df_github['days_since_created']
df_github['avg_forks_per_day'] = (df_github['forks_count'] / days_alive).round(0)
df_github['avg_stars_per_day'] = (df_github['stargazers_count'] / days_alive).round(0)

# Drop 'updated_at', which no later cell shown here uses. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because the column is already gone.
df_github = df_github.drop(columns=['updated_at'], errors='ignore')
df_github.head()

Unnamed: 0,name,full_name,topics,description,created_at,stargazers_count,forks_count,license,days_since_created,avg_forks_per_day,avg_stars_per_day
0,langchain,langchain-ai/langchain,no topic,🦜🔗 Build context-aware reasoning applications,2022-10-17,81425,12434,MIT License,540,23.0,151.0
1,ChatGPT-Next-Web,ChatGPTNextWeb/ChatGPT-Next-Web,"chatgpt, cross-platform, desktop, fe, gemini, ...",A cross-platform ChatGPT/Gemini UI (Web / PWA ...,2023-03-10,66548,54259,MIT License,396,137.0,168.0
2,awesome-machine-learning,josephmisiti/awesome-machine-learning,no topic,A curated list of awesome Machine Learning fra...,2014-07-15,63304,14447,Other,3556,4.0,18.0
3,awesome-cpp,fffaraz/awesome-cpp,"awesome, awesome-list, c, c-plus-plus, cpp, cp...",A curated list of awesome C++ (or C) framework...,2014-07-17,55101,7613,MIT License,3554,2.0,16.0
4,llama.cpp,ggerganov/llama.cpp,"ggml, llama",LLM inference in C/C++,2023-03-10,54581,7703,MIT License,396,19.0,138.0


3. Use an LLM (OpenAI) and LangChain to assign a category to each repository (tagging)

In [73]:
# Load the OpenAI API key from the environment variables.
# NOTE(review): this value is never passed to ChatOpenAI below; the client is presumably
# expected to read OPENAI_API_KEY from the environment on its own — confirm.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Labels the model may choose from when tagging a repository description.
_REPO_CATEGORIES = ['Tutorials', 'Applications', 'AI engineering', 'Model development', 'Model repo', 'Infrastructure', 'Lists']

# Prompt used for every description; '{input}' is replaced by the passage to classify.
_TAGGING_TEMPLATE = """
    Extract the desired information from the following passage.

    Only extract the properties mentioned in the 'Classification' function.

    Passage:
    {input}
    """

# Prompt | LLM pipeline, built on first use and then shared by every assign_category call.
_tagging_chain = None


def _get_tagging_chain():
    """Return the tagging chain, building the prompt, schema and LLM client on first use."""
    global _tagging_chain
    if _tagging_chain is None:
        # The class name is referenced by the prompt text ('Classification'), so keep it unchanged.
        class Classification(BaseModel):
            category: str = Field(description="The category of the text.",
                                  enum=_REPO_CATEGORIES)

        # LLM
        llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125").with_structured_output(
            Classification
        )
        _tagging_chain = ChatPromptTemplate.from_template(_TAGGING_TEMPLATE) | llm
    return _tagging_chain


def assign_category(inp: str) -> str:
    """Assign one category label to a repository description using the LLM tagging chain.

    Args:
        inp: The text to classify (a repository description). May also be None/NaN.

    Returns:
        The `category` attribute of the chain's structured result, or "Unknown" when
        `inp` is None, NaN, or an empty / whitespace-only string (no LLM call is made
        in those cases).
    """
    if inp is None or pd.isna(inp):
        return "Unknown"

    # An empty passage gives the model nothing to classify, so skip the API call.
    if isinstance(inp, str) and not inp.strip():
        return "Unknown"

    results = _get_tagging_chain().invoke({"input": inp})
    return results.category
    

In [76]:
# Tag every repository by passing its description through assign_category, one call per row
descriptions = df_github['description']
df_github['categories'] = descriptions.map(assign_category)

In [81]:
# Display the frame to inspect the newly added 'categories' column
df_github

Unnamed: 0,name,full_name,topics,description,created_at,stargazers_count,forks_count,license,days_since_created,avg_forks_per_day,avg_stars_per_day,categories
0,langchain,langchain-ai/langchain,no topic,🦜🔗 Build context-aware reasoning applications,2022-10-17,81425,12434,MIT License,540,23.0,151.0,Applications
1,ChatGPT-Next-Web,ChatGPTNextWeb/ChatGPT-Next-Web,"chatgpt, cross-platform, desktop, fe, gemini, ...",A cross-platform ChatGPT/Gemini UI (Web / PWA ...,2023-03-10,66548,54259,MIT License,396,137.0,168.0,Applications
2,awesome-machine-learning,josephmisiti/awesome-machine-learning,no topic,A curated list of awesome Machine Learning fra...,2014-07-15,63304,14447,Other,3556,4.0,18.0,Lists
3,awesome-cpp,fffaraz/awesome-cpp,"awesome, awesome-list, c, c-plus-plus, cpp, cp...",A curated list of awesome C++ (or C) framework...,2014-07-17,55101,7613,MIT License,3554,2.0,16.0,Lists
4,llama.cpp,ggerganov/llama.cpp,"ggml, llama",LLM inference in C/C++,2023-03-10,54581,7703,MIT License,396,19.0,138.0,Tutorials
...,...,...,...,...,...,...,...,...,...,...,...,...
512,awesome-transformer-nlp,cedrickchee/awesome-transformer-nlp,"attention-mechanism, awesome, awesome-list, be...",A curated list of NLP resources focused on Tra...,2019-01-13,1022,122,MIT License,1913,0.0,1.0,Lists
513,awesome_LLMs_interview_notes,jackaduma/awesome_LLMs_interview_notes,"ai, interview-practice, interview-preparation,...",LLMs interview notes and answers:该仓库主要记录大模型（LL...,2023-09-21,1019,161,MIT License,201,1.0,5.0,Model repo
514,chameleon-llm,lupantech/chameleon-llm,"ai, chatgpt, gpt-4, llm, openai, python, tool","Codes for ""Chameleon: Plug-and-Play Compositio...",2023-04-19,1009,84,Apache License 2.0,356,0.0,3.0,Model repo
515,bramses-highly-opinionated-vault-2023,bramses/bramses-highly-opinionated-vault-2023,obsidian-md,"A highly opinionated, fully featured Obsidian ...",2022-12-08,1009,49,MIT License,488,0.0,2.0,Model repo


In [83]:
# Write the enriched DataFrame to a CSV file; index=False leaves the row index out of the file
final_csv_path = 'data/github_final.csv'
df_github.to_csv(final_csv_path, index=False)