In [None]:
# Date: 04.03.25
# Purpose: The objective is to go through all of my notebooks and extract overview

In [7]:
#pip install nbformat

In [7]:
import os
import pandas as pd
from nbformat import read
from IPython.display import display
import json

def extract_data_from_notebooks(target_folders):
    data = []

    for folder in target_folders:
        for root, _, files in os.walk(folder):
            for file in files:
                if file.endswith(".ipynb"):
                    notebook_path = os.path.join(root, file)
                    try:
                        with open(notebook_path, "r", encoding="utf-8") as f:
                            notebook = read(f, as_version=4)
                            for cell in notebook.cells:
                                if cell.cell_type == "code":
                                    lines = cell.source.strip().split("\n")
                                    if lines and lines[0].startswith("# Date:"):
                                        date_part = lines[0].replace("# Date:", "").strip()
                                        details = "\n".join(lines[1:]).strip()
                                        status = None
                                        theme = None

                                        # Extract # Purpose:
                                        if "# Purpose:" in details:
                                            purpose_start = details.find("# Purpose:")
                                            details = details[purpose_start + len("# Purpose:"):].strip()

                                        # Extract # Status:
                                        if "# Status:" in details:
                                            status_start = details.find("# Status:")
                                            status_end = details.find('\n', status_start)
                                            if status_end == -1:
                                                status_end = len(details)
                                            status = details[status_start + len("# Status:"):status_end].strip()
                                            details = details.replace(details[status_start:status_end], "").strip()
                                            details = details.replace("# Status:", "").strip()
                                            details = details.replace("\n\n", "\n").strip()

                                        # Extract # Theme:
                                        if "# Theme:" in details:
                                            theme_start = details.find("# Theme:")
                                            theme_end = details.find('\n', theme_start)
                                            if theme_end == -1:
                                                theme_end = len(details)
                                            theme = details[theme_start + len("# Theme:"):theme_end].strip()
                                            details = details.replace(details[theme_start:theme_end], "").strip()
                                            details = details.replace("# Theme:", "").strip()

                                        data.append({
                                            "Date": date_part,
                                            "Status": status,
                                            "Theme": theme,
                                            "Details": details,
                                            "File Path": notebook_path,
                                        })
                                        break
                    except FileNotFoundError:
                        print(f"File not found: {notebook_path}")
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON in {notebook_path}")
                    except Exception as e:
                        print(f"Error reading {notebook_path}: {e}")

    if not data:
        print("No matching data found.")
        return pd.DataFrame()

    df = pd.DataFrame(data)
    return df


# Example usage
target_folders = [r"C:\users\Tim_S\Desktop\bt\AI"
                , r"C:\users\Tim_S\Desktop\bt\Master RAG"
                , r"C:\users\Tim_S\Desktop\bt\AIEng\llm_engineering"]
df = extract_data_from_notebooks(target_folders)

# -- Clean and organise
df['File Path'] = df['File Path'].str.replace(r"C:\\users\\Tim_S\\Desktop\\bt\\", "", regex=True)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', infer_datetime_format=True).dt.date
#df = df.sort_values(by='Date', ascending=False)
df = df.sort_values(by='Theme', ascending=False)
#df = df[df['Status']!='None']
df = df.dropna(subset=['Status'])
print('\n')
print('-'*25)
print('Number of notebooks:', df.shape)
print('-'*25)
print('\n')

# -- Style (was not left aligned)
if not df.empty:
    styled_df = df.style.set_properties(**{'text-align': 'left', 'white-space': 'nowrap'}) \
                        .set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])
    display(styled_df)
else:
    print("No data found.")


  validate(nb)


Error reading C:\users\Tim_S\Desktop\bt\AIEng\llm_engineering\week1\day1.ipynb: Notebook does not appear to be JSON: '{\n "cells": [\n  {\n   "cell_type": "m...


-------------------------
Number of notebooks: (42, 5)
-------------------------




  df['Date'] = pd.to_datetime(df['Date'], errors='coerce', infer_datetime_format=True).dt.date
  df['Date'] = pd.to_datetime(df['Date'], errors='coerce', infer_datetime_format=True).dt.date


Unnamed: 0,Date,Status,Theme,Details,File Path
17,2025-01-03,Yet to play with,template,5 levels of text splitting (template),Master RAG\0.3 5_Levels_Of_Text_Splitting.ipynb
5,2024-11-30,Yet to start,Tools/function use,"Question LLM, have it make API call and return result",AI\3.0 api call_wip.ipynb
19,2025-02-23,Got it working.,Tools/function use,Tavilly,Master RAG\4.0 query.ipynb
8,2024-07-12,Yet to start,Tools/function use,Copy of example Google notebook (function calls on data).,AI\5.0 Function call.ipynb
50,2025-05-02,Complete,RAG,# Note: RAG & price determination on target item - pretty cool,AIEng\llm_engineering\week8\day2.3.ipynb
15,2025-01-03,Keep learning,Production code,Plan is to set up RAG using Class (to under Class better),Master RAG\0.1 Class_rag.ipynb
21,2025-01-18,Multiple,Multiple (Langchain + ui +),Langchain 'ConversationalRetrievalChain' (aka RAG) with Gradio UI,AIEng\llm_engineering\Random_t1s\1.0 langchan1.ipynb
22,2025-11-02,"Yes. With no data preprocessing, etc. the results look ok.",ML,Can i use the test data (dummy data) to train an RF model to predict income?,AIEng\llm_engineering\Random_t1s\2.0 W8day2.4_copy.ipynb
40,2025-01-22,Great notebook,ML,Works through tradtional ML (in more complex approaches) to predict item prices.,AIEng\llm_engineering\week6\day3.ipynb
24,2024-12-27,Complete,Local LLM + webscraping,Local LLM + webscraping,AIEng\llm_engineering\week1\day2 EXERCISE.ipynb
