# Synthetic Data Generation and Classifier Training

**Purpose**: Generate labeled synthetic disaster news items with an LLM, embed their text, and train a classifier on the embeddings.

**Dependencies**: `openai`, `pandas`, `numpy`, `scikit-learn` (imported as `sklearn`)


In [ ]:
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.linear_model import LogisticRegression
# Shared API client used by every cell below. It targets a server on
# localhost:11434 rather than the hosted OpenAI API (presumably a local
# Ollama instance, given the placeholder key -- confirm for your setup).
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)

In [ ]:
def generate_synthetic(n=5):
    """Ask the chat model for ``n`` labeled fake disaster news items.

    The prompt explicitly requests a bare JSON array of objects with
    ``text`` and ``label`` fields, because the reply is parsed with
    ``pd.read_json`` and the training cell reads exactly those two columns.

    Args:
        n: Number of items to request from the model.

    Returns:
        A ``pandas.DataFrame`` parsed from the model's reply, guaranteed to
        contain ``text`` and ``label`` columns.

    Raises:
        ValueError: If the reply is empty or lacks the ``text`` / ``label``
            columns. Parse errors raised by ``pd.read_json`` on malformed
            JSON propagate unchanged.
    """
    from io import StringIO

    prompt = (
        f"Generate {n} fake disaster news items labeled. "
        'Respond with only a JSON array of objects, each with a "text" '
        'field and a "label" field, and no other text.'
    )
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(model="phi4-mini:latest", messages=messages)

    # ``content`` may be None when the model returns nothing usable.
    raw = (response.choices[0].message.content or "").strip()

    # Chat models often wrap JSON in a Markdown code fence; remove the
    # opening fence line (e.g. "```json") and the closing fence if present.
    if raw.startswith("```"):
        lines = raw.splitlines()[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        raw = "\n".join(lines).strip()

    if not raw:
        raise ValueError("model returned no content to parse")

    # Wrap in StringIO: passing a literal JSON string to read_json is
    # deprecated in recent pandas releases.
    df = pd.read_json(StringIO(raw))

    # Fail here with a clear message instead of a KeyError in a later cell.
    missing = {"text", "label"} - set(df.columns)
    if missing:
        raise ValueError(f"model reply is missing columns: {sorted(missing)}")
    return df

In [ ]:
def embed(text):
    """Return the embedding of ``text`` as a NumPy array.

    Sends ``text`` to the ``mxbai-embed-large`` embedding model through the
    module-level ``client`` and converts the first returned embedding into
    an ``np.ndarray``.
    """
    result = client.embeddings.create(
        model="mxbai-embed-large",
        input=text,
    )
    # Only the first entry of the response data is used.
    first_item = result.data[0]
    vector = first_item.embedding
    return np.array(vector)

In [ ]:
# Build a small labeled dataset from the LLM, embed each text, and fit a
# logistic-regression classifier on the resulting vectors.
df = generate_synthetic(8)

# One embedding request per row; the vectors are stacked row-wise into X.
X = np.vstack(list(map(embed, df['text'])))
y = df['label']

clf = LogisticRegression()
clf.fit(X, y)

# Show which label values the fitted classifier knows about.
print('Classes:', clf.classes_)