In [None]:
# Concluding notes

# Closing status line for the reader; the text names where the figures live.
closing_message = "Notebook complete. Figures saved under data/_docs/figures/. See the digest for commentary."
print(closing_message)

In [None]:
# Performance profiling & micro-optimizations

import timeit
import cProfile

# Benchmark input: the integers 0 .. 99_999 materialised as a list, so both
# timed callables below iterate over the same pre-built object.
data = [*range(100_000)]

def sum_loop(d):
    """Add up the items of ``d`` with an explicit Python-level loop.

    The accumulator starts at the integer 0 and ``+=`` is applied once per
    item, in iteration order. An empty iterable therefore yields 0.
    """
    running_total = 0
    for item in d:
        running_total += item
    return running_total

# Time ten calls of each implementation on the same list.
loop_seconds = timeit.timeit(lambda: sum_loop(data), number=10)
print("timeit loop:", loop_seconds)
builtin_seconds = timeit.timeit(lambda: sum(data), number=10)
print("timeit built-in sum:", builtin_seconds)

# Profile one call of the loop version; the statement is passed as a string.
cProfile.run("sum_loop(data)")

In [None]:
# Unit tests with pytest (example functions + tests)

def add(a, b):
    """Return ``a + b``; the operands are combined with ``+`` exactly as given."""
    result = a + b
    return result

# A trivial test; in real use you'd put these in tests/ and run `pytest`

def test_add():
    """Check that ``add`` returns 3 for the arguments 1 and 2."""
    outcome = add(1, 2)
    assert outcome == 3

# Pointer for the reader on how the tests are meant to be executed.
pytest_hint = 'Run `pytest -q` in the repo root to execute tests'
print(pytest_hint)

In [None]:
# Load repo digest and create targeted figures
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the digest table and of the directory the figures are written to.
key_lines = Path('data') / '_docs' / 'toe_key_lines.csv'
fig_dir = Path('data') / '_docs' / 'figures'

if not key_lines.exists():
    # Nothing to plot without the table; the message names the generator script.
    print('Warning: toe_key_lines.csv not found in data/_docs; run scripts/generate_toe_key_lines.py first')
else:
    df = pd.read_csv(key_lines, encoding='utf-8-sig')
    print('Loaded', len(df), 'rows from', key_lines)

    # savefig does not create missing directories, so create the target here
    # rather than depending on a different cell having been run beforehand.
    fig_dir.mkdir(parents=True, exist_ok=True)

    # Boxplot: mean_abs_delta by unique_k_mod6 (skipped unless both columns exist)
    if {'mean_abs_delta', 'unique_k_mod6'}.issubset(df.columns):
        plt.figure(figsize=(6,4))
        sns.boxplot(x='unique_k_mod6', y='mean_abs_delta', data=df)
        plt.title('mean_abs_delta by unique_k_mod6')
        out = fig_dir / 'mean_abs_delta_by_kmod6.png'
        plt.savefig(out, bbox_inches='tight')
        print('Saved', out)

    # Scatter: native_mean_abs_delta vs prior_score (skipped unless both columns exist)
    if {'native_mean_abs_delta', 'prior_score'}.issubset(df.columns):
        plt.figure(figsize=(6,4))
        # Size input comes from node_commutator_score when that column exists:
        # NaNs are replaced by the column mean, values are scaled by 200 and
        # clipped to [20, 300]. Without the column the constant 40 is used.
        s = df.get('node_commutator_score', None)
        sizes = (s.fillna(s.mean())*200).clip(20,300) if s is not None else 40
        # hue falls back to the scalar False when in_top_native is absent.
        sns.scatterplot(x='native_mean_abs_delta', y='prior_score', hue=df.get('in_top_native', False), size=sizes, data=df)
        plt.title('prior_score vs native_mean_abs_delta')
        out = fig_dir / 'prior_vs_native_scatter.png'
        plt.savefig(out, bbox_inches='tight')
        print('Saved', out)

    # Histogram: k12_entropy (NaNs dropped before plotting)
    if 'k12_entropy' in df.columns:
        plt.figure(figsize=(6,4))
        sns.histplot(df['k12_entropy'].dropna(), bins=30, kde=True)
        plt.title('k12_entropy distribution')
        out = fig_dir / 'k12_entropy_hist.png'
        plt.savefig(out, bbox_inches='tight')
        print('Saved', out)


In [None]:
# Saving & loading artifacts (joblib + CSV/Parquet)
from joblib import dump, load
from pathlib import Path

# Target folder for the example artifacts; missing parents are created and an
# already existing folder is not an error.
artifacts = Path('data', '_workbench', '00_meta', 'artifacts')
artifacts.mkdir(exist_ok=True, parents=True)

# Serialise a small dict with joblib and report where it was written.
obj = dict(a=1, b=2)
joblib_path = artifacts / 'example_joblib_v1.joblib'
dump(obj, joblib_path)
print('Saved joblib:', joblib_path)

# Save a tiny DataFrame
import pandas as pd
# Three-row frame written next to the joblib artifact, without the index column.
csv_path = artifacts / 'example_df.csv'
df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
df.to_csv(csv_path, index=False)
print('Saved CSV:', csv_path)

In [None]:
# Model evaluation & metrics (ROC, confusion matrix)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Use y_true / y_pred / y_score when all three already exist in the namespace
# (nothing in this cell defines them beforehand). If any one is missing, the
# NameError handler replaces all three with a small hard-coded sample.
try:
    y_true
    y_pred
    y_score
except NameError:
    # fallback small sample
    y_true = [0,1,0,1,1,0]
    y_pred = [0,1,0,1,0,0]
    y_score = [0.1, 0.9, 0.2, 0.8, 0.4, 0.3]
# NOTE(review): y_score is assigned above but not read anywhere else in this cell.

print('Accuracy', accuracy_score(y_true, y_pred))
print('Classification report:\n', classification_report(y_true, y_pred))

# Confusion matrix rendered as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_true, y_pred)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion matrix')
plt.xlabel('Pred')
plt.ylabel('True')

# savefig does not create missing directories, so create the target here
# rather than depending on a different cell having been run beforehand.
cm_path = Path('data/_docs/figures/example_confusion_matrix.png')
cm_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(cm_path, bbox_inches='tight')
print('Saved confusion matrix example')

In [None]:
# Simple ML model: pipeline + cross-validation
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Synthetic data: 500 samples, 10 features of which 4 are informative.
X, y = make_classification(
    n_samples=500,
    n_features=10,
    n_informative=4,
    random_state=42,
)
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Two-step pipeline: standardise the features, then fit logistic regression.
steps = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=200)),
]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)

train_accuracy = pipe.score(X_train, y_train)
print('Train accuracy', train_accuracy)
test_accuracy = pipe.score(X_test, y_test)
print('Test accuracy', test_accuracy)
# 5-fold cross-validation over the full X, y; only the mean score is shown.
cv_scores = cross_val_score(pipe, X, y, cv=5)
print('CV mean accuracy', cv_scores.mean())

In [None]:
# Data visualization examples (distributions, scatter, saving figures)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Small example dataset: two columns of 200 normal draws from a generator
# seeded with 0 ('b' uses loc=2.0, scale=1.5).
rng = np.random.default_rng(0)
df = pd.DataFrame({'a': rng.normal(size=200), 'b': rng.normal(loc=2.0, scale=1.5, size=200)})

# savefig does not create missing directories, so create the target here
# rather than depending on a different cell having been run beforehand.
fig_dir = Path('data') / '_docs' / 'figures'
fig_dir.mkdir(parents=True, exist_ok=True)

# Histogram of column 'a' with a KDE overlay.
plt.figure()
sns.histplot(df['a'], kde=True)
plt.title('Distribution of a')
plt.savefig(fig_dir / 'dist_a.png', bbox_inches='tight')
print('Saved dist_a.png')

# Scatter of 'a' against 'b'.
plt.figure()
sns.scatterplot(x='a', y='b', data=df)
plt.title('Scatter a vs b')
plt.savefig(fig_dir / 'scatter_a_b.png', bbox_inches='tight')
print('Saved scatter_a_b.png')

In [None]:
# Pandas DataFrame manipulation examples
import pandas as pd
import numpy as np

from pathlib import Path

# 4x3 integer grid turned into a frame, plus a per-row total column.
arr = np.arange(12).reshape(4,3)
df = pd.DataFrame(arr, columns=['x','y','z'])
df['sum'] = df.sum(axis=1)
print(df.head())
print('Groupby sum by z%2 ->')
print(df.groupby(df['z']%2)['sum'].sum())

# Handle missing values
# 'x' is cast to float64 before the gap is inserted: writing a missing value
# into an integer column relies on silent upcasting, which newer pandas
# versions deprecate. astype returns a new frame, so df itself is untouched.
df2 = df.astype({'x': 'float64'})
df2.loc[1, 'x'] = np.nan
print('Fillna ->')
print(df2.fillna(df2.mean()))

# Export
# to_csv does not create missing directories, so make sure the target exists
# instead of depending on a different cell having created it.
export_path = Path('data') / '_workbench' / '00_meta' / 'example_df_out.csv'
export_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(export_path, index=False)
print('Exported example_df_out.csv')

In [None]:
# NumPy array operations
import numpy as np
# Build a 3x4 grid holding 0..11 and show a few whole-array operations.
arr = np.reshape(np.arange(12), (3, 4))
print('Array:\n', arr)
print('Sum axis 0:', np.sum(arr, axis=0))
print('Mean:', np.mean(arr))
print('Broadcast add:', np.add(arr, 1))
# Sanity check on the layout created above.
n_rows, n_cols = arr.shape
assert (n_rows, n_cols) == (3, 4)


In [None]:
# Synthetic data generation (classification + regression)
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

# Classification set: 300 samples, 8 features, 3 of them informative.
Xc, yc = make_classification(
    n_samples=300, n_features=8, n_informative=3, random_state=1
)
# Regression set: 300 samples, 6 features, noise=0.1.
Xr, yr = make_regression(
    n_samples=300, n_features=6, noise=0.1, random_state=2
)

# Only the classification data is split in this cell (25% held out).
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, random_state=42, test_size=0.25
)
print('Classification shapes', *(part.shape for part in (Xc_train, Xc_test)))
print('Regression shapes', *(part.shape for part in (Xr, yr)))

In [None]:
# Setup & imports
# If a required package is missing, uncomment and run the install line below.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install pandas matplotlib seaborn scikit-learn joblib pytest

import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML tools
from sklearn.model_selection import train_test_split

# Profiling & testing
import timeit
import cProfile

# Make sure the figure output directory is present: missing parents are
# created and an already existing directory is not an error.
fig_dir = Path('data', '_docs', 'figures')
fig_dir.mkdir(exist_ok=True, parents=True)

print('Imports OK. Figures will be written to', fig_dir)
