In [1]:
#!pip install openai

In [61]:
from configparser import ConfigParser
import datetime
import requests
import openai
import json
import os

In [3]:
# Get API key stored in local cfg file
# How to is available here : https://towardsdatascience.com/keeping-credentials-safe-in-jupyter-notebooks-fbd215a8e311

parser = ConfigParser()
_ = parser.read('ObsidianGPT.cfg')
openai.api_key = parser.get('my_api', 'auth_key')

# Function definition

In [4]:
def generate_text(prompt, GPTmodel):
    response = openai.ChatCompletion.create(model=GPTmodel,messages=[{"role": "user", "content": prompt}])
    return response.choices[0].message.content.strip()

In [5]:
def create_obsidian_vault(vault_name, parent_directory):
    '''Creates or Updates and Obsidian vault, in a given directory. Params are vault_name, parent_directory'''
    vault_path = os.path.join(parent_directory, vault_name)
    
    # Create the vault directory
    os.makedirs(vault_path, exist_ok=True)

    # Create default folders within the vault
    os.makedirs(os.path.join(vault_path, "attachments"), exist_ok=True)
    os.makedirs(os.path.join(vault_path, "notes"), exist_ok=True)
    
    return vault_path

# Create and/or modify a local obsidian vault

In [6]:
vault_path = create_obsidian_vault(vault_name="MyNewVault", parent_directory=os.getcwd())
vault_path

'/Users/WDescamps/Desktop/code_projects/side_projects/ObsidianGPT/MyNewVault'

## Prompt created by GPT, asking him to create a prompt to match my input of models

In [8]:
# Get the list of models for different kinds of data from the GPT API
models_prompt = """
Provide a list of popular machine learning and deep learning models for different types of data, 
grouped by data type and problem type. Include Numerical Data with Regression and Classification models, 
Categorical Data with Classification models, Text Data with Natural Language Processing models, 
Image Data with Computer Vision models and Image Generation models, Time Series Data with Forecasting models, 
Audio Data with Speech Recognition and Music Generation models, Reinforcement Learning with Model-free 
and Model-based Algorithms, and Unstructured Data with Clustering and Dimensionality Reduction models.
"""

In [9]:
%%time
# Get the list of models, for each data type and problem type
models_text = generate_text(models_prompt, GPTmodel="gpt-4") #or gpt-3.5-turbo
models_text

In [30]:
models_text

'1. Numerical Data\n   - Regression Models\n     - Linear Regression\n     - Lasso Regression\n     - Ridge Regression\n     - Elastic Net\n     - Support Vector Regression (SVR)\n     - Decision Tree Regression\n     - Random Forest Regression\n     - AdaBoost Regression\n     - Gradient Boosting Regression\n     - XGBoost\n     - LightGBM\n     - CatBoost\n   \n   - Classification Models\n     - Logistic Regression\n     - Support Vector Machines (SVM)\n     - Decision Trees\n     - Random Forests\n     - Naive Bayes\n     - k-Nearest Neighbors (k-NN)\n     - AdaBoost\n     - Gradient Boosting Machines (GBM)\n     - XGBoost\n     - LightGBM\n     - CatBoost\n\n2. Categorical Data\n   - Classification Models\n     - Same as numerical classification models, as categorical data can be processed by encoding it into numerical representations.\n     - Categorical Naive Bayes\n     - Categorical Neural Networks (embedding layers)\n\n3. Text Data\n   - Natural Language Processing Models\n   

## Clean model list

In [58]:
output = models_text

lines = output.split("\n")
models = []

current_data_type = ""
current_model_type = ""

for line in lines:
    if line == "":
        continue
    
    if "Data" in line :
        current_data_type = line.strip().split(".")[1]
        continue
    
    if "Models" in line:
        current_model_type = line.strip().split("-")[1]
        continue
    
    if "-" in line :
        model_name = line.strip().split("-")[1]
        models.append({"name": model_name, "model_type": current_model_type, "data_type": current_data_type})

# Print the parsed models
for model in models:
    print(model)


{'name': ' Linear Regression', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' Lasso Regression', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' Ridge Regression', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' Elastic Net', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' Support Vector Regression (SVR)', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' Decision Tree Regression', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' Random Forest Regression', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' AdaBoost Regression', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' Gradient Boosting Regression', 'model_type': ' Regression Models', 'data_type': ' Numerical Data'}
{'name': ' XGBoost', 'model_type': ' Regression Models', 'data_type': ' Nume

In [71]:
#save models dict to disk with current date
with open(f'models_{datetime.datetime.now().date()}.txt', 'w') as convert_file:
     convert_file.write(json.dumps(models))

In [72]:
models[0:4]

[{'name': ' Linear Regression',
  'model_type': ' Regression Models',
  'data_type': ' Numerical Data'},
 {'name': ' Lasso Regression',
  'model_type': ' Regression Models',
  'data_type': ' Numerical Data'},
 {'name': ' Ridge Regression',
  'model_type': ' Regression Models',
  'data_type': ' Numerical Data'},
 {'name': ' Elastic Net',
  'model_type': ' Regression Models',
  'data_type': ' Numerical Data'}]

## Add data to existing models

In [None]:
from tqdm import tqdm
for i in tqdm(range(10000)):

In [73]:
%%time
models_with_details=models
for model in models_with_details:
    resource_prompt = f"""
    For the {model['name']} model, provide:
    1. A brief description of the model.
    2. The three most relevant use cases.
    3. Three great resources with relevant internet links for implementing the model.
    4. A python code which demonstrates the use of this model 
    """
    resources = generate_text(resource_prompt, GPTmodel="gpt-4")
    model["resources"] = resources

CPU times: user 2.23 s, sys: 391 ms, total: 2.63 s
Wall time: 3h 5min 17s


In [74]:
#save enriched models dict to disk with current date
with open(f'models_with_details_{datetime.datetime.now().date()}.txt', 'w') as convert_file:
     convert_file.write(json.dumps(models_with_details))

In [75]:
%%time
for model in models_with_details:
    output = model["resources"]

    parsed_output = {}
    keys = ["description", "use_cases", "resources", "python_code"]

    lines = output.split("\n")
    current_key = ""

    for line in lines:
        if line.startswith("1."):
            current_key = keys[0]
            parsed_output[current_key] = line.split("1. ")[1].strip()
        elif line.startswith("2."):
            current_key = keys[1]
            parsed_output[current_key] = []
        elif line.startswith("3."):
            current_key = keys[2]
            parsed_output[current_key] = []
        elif line.startswith("4."):
            current_key = keys[3]
            parsed_output[current_key] = ""
        else:
            if current_key == keys[1] or current_key == keys[2]:
                content = line.strip()
                if content:
                    parsed_output[current_key].append(content)
            elif current_key == keys[3]:
                parsed_output[current_key] +=  line + "\n"

    # Print the parsed output
    for key, value in parsed_output.items():
        model[key]=value


CPU times: user 7.38 ms, sys: 669 µs, total: 8.05 ms
Wall time: 8.45 ms


## Write to Obsidian Vault

In [78]:
# Iterate through the models list
for model in models_with_details:
    model_filename = f"{model['name']}.md"
    model_filepath = os.path.join(vault_path, model_filename)
    data_type = model['data_type']
    model_name = model['name']

    # Create the directory for the data type if it does not exist
    data_type_path = os.path.join(vault_path, data_type)
    if not os.path.exists(data_type_path):
        os.makedirs(data_type_path)

    # Create a file for the model and write its information
    file_name = f"{model_name}.md"
    file_path = os.path.join(data_type_path, file_name)
    with open(file_path, "w") as f:
        f.write(f"# {model_name}\n")
        f.write(f"**Model Type:** {model['model_type']}\n")
        f.write(f"**Data Type:** {model['data_type']}\n\n")
        
        #f.write(f"**Description**:\n\n{model['description']}\n\n")
        
        f.write(f"**Python code **:\n\n{model['python_code']}\n\n")

        # Add "See Also" section with links to related models
        f.write(f"**See Also**:\n\n")
        for other_model in models:
            if other_model['name'] != model['name'] and other_model['data_type'] == model['data_type']:
                f.write(f"- [[{other_model['name']}]]\n")

        # Add additional information about the model if available
        if 'resources' in model:
            f.write("## Resources\n\n")
            for resource in model['resources']:
                f.write(resource +"\n")
            f.write("\n")

        # Add relevant tags with hierarchy, strip special chars for clarity
        f.write(f"\n---\n")
        root_tag = model['data_type'].replace(' ', '-').lower()
        leaf_tag = model['name'].replace('(', '').replace(')', '').replace(' ', '-').lower()
        f.write(f"tags: #{root_tag}, #{root_tag}/{leaf_tag}\n")
