In [1]:
#!pip install openai

In [2]:
# ChatGPT API Config
from configparser import ConfigParser

# Graph Viz
import matplotlib.pyplot as plt
import networkx as nx

from tqdm import tqdm
import datetime
import requests
import openai
import json
import os
import re


In [3]:
# Get API key stored in local cfg file
# A how-to is available here: https://towardsdatascience.com/keeping-credentials-safe-in-jupyter-notebooks-fbd215a8e311

parser = ConfigParser()
# ConfigParser.read() silently skips files it cannot find and returns the list of
# files it actually parsed, so check that list to fail with a clear message instead
# of a confusing NoSectionError on the parser.get() call below.
loaded_files = parser.read('ObsidianGPT.cfg')
if not loaded_files:
    raise FileNotFoundError(
        "Config file 'ObsidianGPT.cfg' was not found in the working directory; "
        "it must contain a [my_api] section with an auth_key entry."
    )
openai.api_key = parser.get('my_api', 'auth_key')

In [28]:
# Print the identifier of every model returned by the OpenAI model listing
available_models = openai.Model.list()["data"]
for model_entry in available_models:
    print(model_entry["id"])

davinci
gpt-3.5-turbo-16k-0613
text-davinci-001
text-search-curie-query-001
gpt-4
babbage
text-babbage-001
gpt-3.5-turbo-16k
curie-instruct-beta
gpt-4-0314
davinci-similarity
code-davinci-edit-001
text-similarity-curie-001
gpt-4-0613
ada-code-search-text
text-search-ada-query-001
babbage-search-query
ada-similarity
text-curie-001
text-search-ada-doc-001
text-search-babbage-query-001
code-search-ada-code-001
curie-search-document
davinci-002
gpt-3.5-turbo-0613
text-search-davinci-query-001
text-search-curie-doc-001
babbage-search-document
babbage-002
babbage-code-search-text
text-embedding-ada-002
gpt-3.5-turbo
davinci-instruct-beta
davinci-search-query
text-similarity-babbage-001
text-davinci-002
code-search-babbage-text-001
text-search-davinci-doc-001
code-search-ada-text-001
ada-search-query
text-similarity-ada-001
ada-code-search-code
whisper-1
text-davinci-edit-001
davinci-search-document
curie-search-query
babbage-similarity
ada
ada-search-document
text-ada-001
text-similarity-dav

# Functions definition

In [4]:
def generate_text(prompt, GPTmodel):
    """Send ``prompt`` to an OpenAI chat model and return the text of its reply.

    Args:
        prompt: Text sent as the content of a single "user" message.
        GPTmodel: Model identifier handed to the API as ``model``.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    conversation = [{"role": "user", "content": prompt}]
    completion = openai.ChatCompletion.create(model=GPTmodel, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [25]:
def add_spaces_to_camelcase(text_input):
    """Insert a space at every CamelCase word boundary of ``text_input``.

    A boundary is recognised in two situations (ASCII letters only):

    * a lowercase letter or digit followed by an uppercase letter,
      e.g. "LinearRegression" -> "Linear Regression";
    * the end of an acronym, i.e. an uppercase letter followed by an uppercase
      letter that starts a capitalised word, e.g. "XGBoost" -> "XG Boost".

    Runs of capitals are otherwise kept together ("LightGBM" -> "Light GBM",
    "CNNs" -> "CNNs").

    Args:
        text_input: The string to expand.

    Returns:
        A new string with the spaces inserted.
    """
    # Zero-width lookarounds are used on purpose: a pattern that consumes the two
    # neighbouring characters cannot match overlapping pairs, which splits
    # acronyms inconsistently ("LightGBM" became "Light GB M").
    # The acronym rule requires two lowercase letters after the capital so that
    # plural acronyms such as "CNNs" are not cut into "CN Ns".
    boundary = r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z]{2})"
    return re.sub(boundary, " ", text_input)

In [5]:
# Creates a local obsidian vault
def create_obsidian_vault(vault_name, parent_directory):
    '''Create an Obsidian vault folder, or reuse it if it already exists.

    The vault is the directory ``parent_directory/vault_name``; it is given the
    two default sub-folders "attachments" and "notes".

    Params:
        vault_name: name of the vault directory.
        parent_directory: directory in which the vault is created.

    Returns:
        The path of the vault directory.
    '''
    vault_path = os.path.join(parent_directory, vault_name)

    # exist_ok=True makes every call safe to repeat on an existing vault
    os.makedirs(vault_path, exist_ok=True)
    for default_folder in ("attachments", "notes"):
        os.makedirs(os.path.join(vault_path, default_folder), exist_ok=True)

    return vault_path

In [6]:
# Extract strings with regexp
def regex_string_finder(input_string, pattern):
    r'''
    Return the text captured by the first match of ``pattern`` in ``input_string``.

    Usage: regex_string_finder("This is a sample.text:with:colons", r'\.(.*?)\:')
    returns "text".

    Params:
        input_string: the string to search.
        pattern: a regular expression. If it contains a capturing group, the
            first group is returned; otherwise the whole match is returned.

    Returns:
        The extracted text with surrounding whitespace stripped, or the string
        "No match found." when the pattern does not match or its first group
        did not take part in the match.
    '''
    # Use re.search to find the first match of the pattern
    match = re.search(pattern, input_string)
    if match is None:
        return "No match found."

    # group(1) raises IndexError when the pattern has no capturing group,
    # so fall back to the whole match in that case.
    extracted_string = match.group(1) if match.re.groups else match.group(0)

    # An optional group that did not participate in the match yields None
    if extracted_string is None:
        return "No match found."
    return extracted_string.strip()

In [7]:
def write_variable_to_text_file(variable, file_path):
    """Write the text form of ``variable`` to ``file_path``, replacing any existing file.

    Args:
        variable: Value to write. Strings are written as they are; any other
            object is converted with ``str()`` first.
        file_path: Destination path of the text file.

    The file is always written as UTF-8 so that non-ASCII characters do not
    depend on the platform's default encoding.
    """
    # Open the file in write mode ('w'); this truncates an existing file
    with open(file_path, 'w', encoding='utf-8') as file:
        # file.write() only accepts str, hence the explicit conversion
        file.write(str(variable))

    print(f"Variable written to {file_path}")

In [8]:
def visualize_json_as_graph(data_dict):
    """Draw a nested ``{category: {subcategory: [algorithm, ...]}}`` mapping as a directed graph.

    Every category, subcategory and algorithm becomes a node. Edges run from a
    category to its subcategories and from a subcategory to its algorithms.
    Nodes are coloured by their level.

    Args:
        data_dict: Mapping of category -> mapping of subcategory -> iterable of
            algorithm names.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Create a directed graph
    G = nx.DiGraph()

    # Iterate through the JSON data and add nodes and edges to the graph
    for category, subcategories in data_dict.items():
        G.add_node(category, type="category")
        for subcategory, algorithms in subcategories.items():
            G.add_node(subcategory, type="subcategory")
            G.add_edge(category, subcategory)
            for algorithm in algorithms:
                G.add_node(algorithm, type="algorithm")
                G.add_edge(subcategory, algorithm)

    # Node colours per type. The "default" entry is required: nodes without a
    # "type" attribute fall back to it, and looking up a missing key would
    # raise KeyError.
    node_colors = {
        "category": "lightblue",
        "subcategory": "lightgreen",
        "algorithm": "lightcoral",
        "default": "lightgray",
    }
    colors = [
        node_colors[G.nodes[node].get("type", "default")]
        for node in G.nodes()
    ]

    # Create the graph visualization
    pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout reproducible
    nx.draw(G, pos, node_color=colors, with_labels=True, font_size=8)
    plt.title("JSON Data Visualization as a Graph")
    plt.show()

## GPT4 Base Prompt

In [10]:
# Base prompt asking the model for the list of models for different kinds of data.
# It was reached after quite a bit of trial and error. Feel free to experiment with it!
# NOTE(review): the text below is sent to the API verbatim, so the missing space after
# "problem type." and the spelling "dictionnary" are part of the prompt as it was used.
models_prompt = """
Provide an exhaustive list of popular machine learning and deep learning models for all types of data, 
grouped by data type and problem type.Provide as your response the dictionnary containing the results. 
"""

In [None]:
%%time
# Ask the chat model for the list of models, for each data type and problem type
models_text = generate_text(models_prompt, GPTmodel="gpt-4-0314") # "gpt-4-0314" is a pinned GPT-4 snapshot, not necessarily the latest one; use "gpt-3.5-turbo" for faster results at lower answer quality

In [None]:
# Use the reply stored by the generate_text call above (`output` was never defined,
# so this cell raised NameError on a fresh kernel)
original_string = models_text
# GPT is a conversational model and adds \n and spaces; remove them before parsing.
# Note that this also removes the spaces inside names, which is why the resulting
# keys have no spaces (e.g. "StructuredData").
modified_string = re.sub(r'[ \n]', '', original_string)

# Convert the string to a Python dictionary; this only works if the reply is valid JSON
data_dict = json.loads(modified_string)

# Now you can work with the data_dict as a Python dictionary
print(data_dict)

## Save base JSON to disk, as txt file

In [None]:
# Save the models dict to disk, with the current date in the file name.
# `today` is set here because the notebook otherwise only defines it in a later
# cell, which made this cell fail with NameError on Restart & Run All.
today = datetime.datetime.now()
with open(f'models_v1_3_{today.date()}.txt', 'w') as convert_file:
    json.dump(data_dict, convert_file)

## Let's visualise what we have

In [None]:
# Updates since v1 : 
# On the 2023-09-06 : Linkedin, Medium, TowardsDataScience links are almost all broken. It's not worth asking GPT4
# On the 2023-09-06 : Model completion has been switched to gpt3.5 turbo since cheaper and faster

In [None]:
# Draw the category -> subcategory -> algorithm hierarchy as a colour-coded directed graph
visualize_json_as_graph(data_dict)

This is not the clearest visualisation; let's try an interactive one in a separate notebook using Dash.

## Let's add details for each of our models

In [11]:
# Reload a previously saved models dict from disk, if necessary
filename = 'models_v1_3_2023-09-07.txt'
with open(filename, 'r') as read_file:
    saved_json = read_file.read()
data_dict = json.loads(saved_json)

In [49]:
# Show the top-level keys of the models dictionary
print(data_dict.keys())

dict_keys(['StructuredData', 'TimeSeriesData', 'TextData', 'ImageData', 'AudioData', 'GraphData'])


In [50]:
# Dry run of the three nested levels (data type -> subcategory -> algorithm)
# to confirm the structure before spending API calls on it
for data_type, subcategories in data_dict.items():
    print(f"*** DATA TYPE : {data_type} ***")
    for subcategory in subcategories:
        print(f"*** SUBCATEGORY : {subcategory} ***")
        algorithms = subcategories[subcategory]
        for algorithm in algorithms:
            print(f"*** ALGORITHM : {algorithm} ***")

*** DATA TYPE : StructuredData ***
*** SUBCATEGORY : Regression ***
*** ALGORITHM : LinearRegression ***
*** ALGORITHM : RidgeRegression ***
*** ALGORITHM : LassoRegression ***
*** ALGORITHM : ElasticNet ***
*** ALGORITHM : SupportVectorRegression ***
*** ALGORITHM : DecisionTreeRegression ***
*** ALGORITHM : RandomForestRegression ***
*** ALGORITHM : AdaBoostRegression ***
*** ALGORITHM : GradientBoostingRegression ***
*** ALGORITHM : XGBoost ***
*** ALGORITHM : LightGBM ***
*** ALGORITHM : CatBoost ***
*** ALGORITHM : ArtificialNeuralNetworks ***
*** ALGORITHM : LongShort-TermMemory ***
*** SUBCATEGORY : Classification ***
*** ALGORITHM : LogisticRegression ***
*** ALGORITHM : LinearDiscriminantAnalysis ***
*** ALGORITHM : QuadraticDiscriminantAnalysis ***
*** ALGORITHM : SupportVectorMachines ***
*** ALGORITHM : DecisionTreeClassifier ***
*** ALGORITHM : RandomForestClassifier ***
*** ALGORITHM : AdaBoostClassifier ***
*** ALGORITHM : GradientBoostingClassifier ***
*** ALGORITHM : X

# Prompt GPT and write to Obsidian Vault

In [41]:
# Create the Obsidian vault in the current working directory; today's date is part of its name

today = datetime.datetime.now()
vault_name = f"Vault_v1_3-{today.date()}"
vault_path = create_obsidian_vault(vault_name=vault_name, parent_directory=os.getcwd())
vault_path

'/Users/WDescamps/Desktop/code_projects/side_projects/ObsidianGPT/Vault_v1_3-2023-09-12'

In [48]:
# Prompting and writing: one folder per data type, one sub-folder per subcategory,
# and one Markdown note per algorithm containing the model's answer.
for data_type in data_dict.keys():
    
    # Create the directory for the data type if it does not exist
    data_type_path = os.path.join(vault_path, data_type)
    os.makedirs(data_type_path, exist_ok=True)
        
    for subcategory in tqdm(data_dict[data_type], desc="outer", position=0):
        
        # Create the directory for the subcategory if it does not exist
        subcategory_path = os.path.join(vault_path, data_type, subcategory)
        os.makedirs(subcategory_path, exist_ok=True)
        
        # Loop on each algorithm in given subcategory
        for algorithm in tqdm(data_dict[data_type][subcategory], desc="inner", position=1):
            
            #Create the GPT prompt
            resource_prompt = f"""
            For the {add_spaces_to_camelcase(algorithm)} model with {add_spaces_to_camelcase(data_type)} regarding {add_spaces_to_camelcase(subcategory)}, provide:
        1. A short description of the model.
        2. A list of the pros and cons of the model.
        3. The three most relevant use cases.
        4. Three great resources with relevant internet links for implementing the model.
        5. The top 5 people with the most expertise relative to this model, with a link to their github page
        
        Format your response using Obsidian Flavored Markdown, and add internal links and tags when relevant
            """
            # GPT API call
            resources = generate_text(resource_prompt, GPTmodel="gpt-3.5-turbo")
            
            # Create a .md file and path for the given model.
            # Algorithm names come from model output, so replace characters that are
            # not valid in file names (a "/" would otherwise point into a missing folder).
            safe_algorithm = re.sub(r'[\\/:*?"<>|]', "-", algorithm)
            file_name = f"{safe_algorithm}.md"
            file_path = os.path.join(subcategory_path, file_name)
            
            # Write prompt results to the .md file. UTF-8 is explicit because the
            # answer may contain non-ASCII characters that the platform default cannot encode.
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(resources)
                
                # Append internal links to the note's data type and problem type
                f.write("\n\n\n ### Relevant Internal Links\n")
                f.write(f"- Data Type : [[{data_type}]]\n")
                f.write(f"- Problem type : [[{subcategory}]]\n")

            # If debugging, uncomment line below
            #print(resources)

outer:   0%|                                                             | 0/6 [00:00<?, ?it/s]
inner:   0%|                                                             | 0/9 [00:00<?, ?it/s][A
inner:  11%|█████▉                                               | 1/9 [00:44<05:56, 44.52s/it][A
inner:  22%|███████████▊                                         | 2/9 [01:30<05:19, 45.59s/it][A
inner:  33%|█████████████████▋                                   | 3/9 [02:00<03:50, 38.44s/it][A
inner:  44%|███████████████████████▌                             | 4/9 [02:44<03:23, 40.61s/it][A
inner:  56%|█████████████████████████████▍                       | 5/9 [03:21<02:36, 39.13s/it][A
inner:  67%|███████████████████████████████████▎                 | 6/9 [03:56<01:53, 37.67s/it][A
inner:  78%|█████████████████████████████████████████▏           | 7/9 [04:30<01:13, 36.74s/it][A
inner:  89%|███████████████████████████████████████████████      | 8/9 [05:08<00:37, 37.01s/it][A
inner: 100%|█

inner:  67%|███████████████████████████████████▎                 | 6/9 [03:19<01:43, 34.48s/it][A
inner:  78%|█████████████████████████████████████████▏           | 7/9 [03:50<01:06, 33.38s/it][A
inner:  89%|███████████████████████████████████████████████      | 8/9 [04:24<00:33, 33.48s/it][A
inner: 100%|█████████████████████████████████████████████████████| 9/9 [04:55<00:00, 32.83s/it][A
outer:  60%|███████████████████████████████▏                    | 3/5 [15:59<10:15, 307.51s/it]
inner:   0%|                                                             | 0/7 [00:00<?, ?it/s][A
inner:  14%|███████▌                                             | 1/7 [00:26<02:38, 26.42s/it][A
inner:  29%|███████████████▏                                     | 2/7 [01:04<02:44, 33.00s/it][A
inner:  43%|██████████████████████▋                              | 3/7 [01:46<02:28, 37.14s/it][A
inner:  57%|██████████████████████████████▎                      | 4/7 [02:12<01:39, 33.04s/it][A
inner:  71%|█

In [None]:
# Check that the contents of each category look similar and that no categories are missing.

In [None]:
# Display the whole models dictionary for a manual check
data_dict