# Synthetic Data Generation and Classifier Training

**Purpose**: Generate labeled synthetic disaster reports with an LLM, embed each report, and train a classifier on the embeddings.

**Dependencies**: `requests`, `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`)


In [ ]:
import pandas as pd
import numpy as np
import requests
from sklearn.linear_model import LogisticRegression

# Offline switch: when True, generate_synthetic() and embed() return locally
# fabricated data instead of sending HTTP requests to localhost:11434.
simulate = True

In [ ]:
def generate_synthetic(n=5):
    """Return a DataFrame of ``n`` labeled disaster reports.

    When the module-level ``simulate`` flag is true, the rows are built
    locally by cycling through the known disaster types. Otherwise the
    ``llama3`` model served at ``localhost:11434`` is asked to write them
    and its reply is parsed as JSON.

    Args:
        n: Number of reports to generate.

    Returns:
        A ``pandas.DataFrame`` with a ``text`` column and a ``label`` column.

    Raises:
        requests.HTTPError: If the generation endpoint returns an error status.
        ValueError: If the model reply is not valid JSON or lacks the
            ``text``/``label`` fields.
    """
    from io import StringIO  # local import: only the live path below needs it

    types = ['earthquake', 'flood', 'wildfire', 'other']
    columns = ['text', 'label']

    if simulate:
        data = [
            {'text': f'sample {label} report', 'label': label}
            for label in (types[i % len(types)] for i in range(n))
        ]
        # Explicit columns keep 'text' and 'label' present even when n == 0.
        return pd.DataFrame(data, columns=columns)

    # Spell out the JSON shape: the reply is parsed below and callers index
    # the 'text' and 'label' columns.
    prompt = (
        f'Generate {n} fake disaster news items labeled. '
        'Respond with only a JSON array of objects, each with a "text" key '
        f'and a "label" key whose value is one of: {", ".join(types)}.'
    )
    # 'stream': False requests a single JSON body; by default the generate
    # endpoint (Ollama API) streams newline-delimited chunks that r.json()
    # cannot decode.
    payload = {'model': 'llama3', 'prompt': prompt, 'stream': False}
    r = requests.post('http://localhost:11434/api/generate', json=payload)
    r.raise_for_status()

    # Wrap the text in StringIO: passing a literal JSON string to
    # pd.read_json is deprecated in recent pandas releases.
    df = pd.read_json(StringIO(r.json()['response']))

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise ValueError(f'model response is missing expected fields: {missing}')
    return df

In [ ]:
def embed(text):
    """Return an embedding vector for ``text`` as a NumPy array.

    With the module-level ``simulate`` flag set, ``text`` is ignored and a
    random 3-element vector is returned. Otherwise the text is posted to the
    ``mxbai-embed-large`` model at ``localhost:11434`` and the ``embedding``
    field of the JSON reply is returned.

    Raises:
        requests.HTTPError: If the embeddings endpoint returns an error status.
    """
    if not simulate:
        response = requests.post(
            'http://localhost:11434/api/embeddings',
            json={'model': 'mxbai-embed-large', 'prompt': text},
        )
        response.raise_for_status()
        vector = response.json()['embedding']
        return np.array(vector)

    # Simulated mode: stand-in vector, unrelated to the input text.
    return np.random.rand(3)

In [ ]:
# Build a small labeled dataset, embed every report, and fit the classifier.
df = generate_synthetic(8)
y = df['label']

# One embedding per report, stacked row-wise into the feature matrix.
X = np.vstack(list(map(embed, df['text'])))

clf = LogisticRegression()
clf.fit(X, y)
print('Classes:', clf.classes_)