# Train Alert Priority Classifier
This notebook trains a baseline TF-IDF + numeric feature classifier.

In [None]:

import os, json, joblib, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Location of the tips dataset. When the file is absent, a two-row synthetic
# sample is written there so the rest of the notebook has something to load.
DATA_PATH = "/mnt/data/structured_anonymous_tips_dataset.csv"
if not os.path.exists(DATA_PATH):
    # create a tiny synthetic sample if missing
    df = pd.DataFrame([
        {"tip_id":"T1","tip_summary_hints":"reads-messages;gps-tracking","incident_context":"workplace","alleged_abuser_relation":"manager",
         "timeline_consistency_score":0.8,"isolation_index":0.6,"financial_control_index":0.1,"digital_intrusion_index":0.8,
         "coercive_control_indicators":4,"gaslighting_phrases_detected":3,"threat_implicitness_score":0.5,"frequency_per_week":5,
         "duration_months":6,"prior_reports_count":0,"alert_priority":"high"},
        {"tip_id":"T2","tip_summary_hints":"verbal-abuse;humiliation","incident_context":"workplace","alleged_abuser_relation":"peer",
         "timeline_consistency_score":0.9,"isolation_index":0.2,"financial_control_index":0.0,"digital_intrusion_index":0.0,
         "coercive_control_indicators":2,"gaslighting_phrases_detected":2,"threat_implicitness_score":0.1,"frequency_per_week":2,
         "duration_months":3,"prior_reports_count":0,"alert_priority":"medium"}
    ])
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists first. The guard skips the call when DATA_PATH has
    # no directory component (os.makedirs('') would raise).
    parent_dir = os.path.dirname(DATA_PATH)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(DATA_PATH, index=False)

def _text_column(frame, name):
    """Return column *name* of *frame* as strings, with missing values as ''.

    When the column is absent, an all-empty-string Series aligned to the
    frame's index is returned instead, so the caller can concatenate text
    columns without checking which ones exist.
    """
    if name in frame.columns:
        # astype(str) keeps the later string concatenation valid even if the
        # CSV parser inferred a non-string dtype for this column.
        return frame[name].fillna('').astype(str)
    return pd.Series('', index=frame.index, dtype=object)


df = pd.read_csv(DATA_PATH)

# Single free-text feature: hints, context and relation joined by spaces.
# DataFrame.get(col, '') would hand back a plain str for an absent column,
# which has no .fillna; _text_column always yields a Series.
df['text_field'] = (
    _text_column(df, 'tip_summary_hints')
    + ' ' + _text_column(df, 'incident_context')
    + ' ' + _text_column(df, 'alleged_abuser_relation')
)

# Target label: rows without a label are treated as 'low'; labels are lower-cased.
df['alert_priority'] = df['alert_priority'].fillna('low').str.lower()

# Numeric features. Columns absent from the CSV are created as 0.0 and
# missing values are filled with 0.0 before the cast to float.
num_cols = ['timeline_consistency_score','isolation_index','financial_control_index','digital_intrusion_index','coercive_control_indicators','gaslighting_phrases_detected','threat_implicitness_score','frequency_per_week','duration_months','prior_reports_count']
for c in num_cols:
    if c not in df.columns:
        df[c] = 0.0
df[num_cols] = df[num_cols].fillna(0.0).astype(float)

# One Python list of floats per row, in num_cols order.
df['numeric_array'] = df[num_cols].values.tolist()

# Feature frame (text + per-row numeric list) and target labels.
X = df[['text_field', 'numeric_array']]
y = df['alert_priority']

TEST_FRACTION = 0.2

# A stratified split is only possible when every class has at least two
# members and both partitions can hold one sample of each class; otherwise
# train_test_split raises ValueError. Fall back to a plain random split then.
label_counts = y.value_counts()
n_test = int(np.ceil(TEST_FRACTION * len(y)))
n_train = len(y) - n_test
can_stratify = (
    len(label_counts) > 1
    and label_counts.min() >= 2
    and min(n_train, n_test) >= len(label_counts)
)

if len(df) <= 2:
    # Too few rows for a meaningful hold-out (e.g. the two-row synthetic
    # sample): train on everything. The evaluation step uses the same
    # len(df) threshold and does not score on the test partition in this case.
    Xtr, Xte, ytr, yte = X, X, y, y
else:
    Xtr, Xte, ytr, yte = train_test_split(
        X,
        y,
        test_size=TEST_FRACTION,
        random_state=42,
        stratify=y if can_stratify else None,
    )

def _select_text(frame):
    """Return the 'text_field' column of *frame* cast to str."""
    return frame['text_field'].astype(str)


def _stack_numeric(frame):
    """Stack the per-row sequences in 'numeric_array' into one float array."""
    return np.vstack(frame['numeric_array']).astype(float)


# Named functions are used instead of lambdas because a FunctionTransformer
# wrapping a lambda cannot be pickled, which would block saving the fitted
# pipeline with joblib.

# Text branch: pull out the text column, then TF-IDF over unigrams and bigrams.
text_pipe = Pipeline([
    ('extract', FunctionTransformer(_select_text, validate=False)),
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
])

# Numeric branch: turn the per-row lists into a dense float matrix.
num_pipe = Pipeline([
    ('extract', FunctionTransformer(_stack_numeric, validate=False)),
])

# Concatenate both feature sets side by side and feed them to the classifier.
feats = FeatureUnion([('text', text_pipe), ('num', num_pipe)])
model = Pipeline([
    ('features', feats),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)),
])

# Train the full pipeline on the training partition.
model.fit(Xtr, ytr)

# With more than two rows, score predictions on the held-out partition.
# Otherwise the training labels stand in for both truth and prediction,
# so the reported numbers are only a placeholder.
if len(df) > 2:
    y_true = yte
    preds = model.predict(Xte)
else:
    y_true = ytr
    preds = ytr

print("Accuracy (approx):", accuracy_score(y_true, preds))
print(classification_report(y_true, preds))
