# Model Training

## Objective
Train machine learning models to predict:
1. Request category
2. Request priority

## Approach
- Start with simple baselines (Logistic Regression, Naive Bayes)
- Try ensemble methods (Random Forest, XGBoost)
- Tune hyperparameters
- Save best models for deployment

In [None]:
# Imports
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import scipy.sparse

# Sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## 1. Load Processed Data

In [None]:
# Directory holding the artifacts this notebook reads
data_dir = Path('../data/processed')


def _read_pickle(path):
    """Unpickle and return the object stored at *path*."""
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at files produced by our own pipeline.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Sparse feature matrix and the two label vectors
X = scipy.sparse.load_npz(data_dir / 'X_tfidf.npz')
y_category = np.load(data_dir / 'y_category.npy')
y_priority = np.load(data_dir / 'y_priority.npy')

# Fitted encoders; their `classes_` give the human-readable label names
category_encoder = _read_pickle(data_dir / 'category_encoder.pkl')
priority_encoder = _read_pickle(data_dir / 'priority_encoder.pkl')

# Quick sanity check of what was loaded
print(f"Feature matrix shape: {X.shape}")
print(f"Category labels shape: {y_category.shape}")
print(f"Priority labels shape: {y_priority.shape}")
print(f"\nCategory classes: {category_encoder.classes_}")
print(f"Priority classes: {priority_encoder.classes_}")

## 2. Train/Test Split

In [None]:
# Single seed reused by the split and by the models trained below
RANDOM_STATE = 42

# Hold out 20% of the rows for testing. Features and both label vectors are
# split along the same rows; stratification uses the category labels only.
# TODO: Adjust test_size based on dataset size
split = train_test_split(
    X, y_category, y_priority,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_category,
)
X_train, X_test, y_cat_train, y_cat_test, y_pri_train, y_pri_test = split

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

## 3. Category Prediction Model

### Baseline: Logistic Regression

In [None]:
# Baseline category classifier: logistic regression.
# class_weight='balanced' weights classes inversely to their frequency in
# the training labels.
lr_cat = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE, class_weight='balanced')
lr_cat.fit(X_train, y_cat_train)

# Predictions on the held-out test set
y_cat_pred_lr = lr_cat.predict(X_test)

# Evaluation
print("Logistic Regression - Category Prediction")
print(f"Accuracy: {accuracy_score(y_cat_test, y_cat_pred_lr):.3f}")
print("\nClassification Report:")
# Pass `labels` explicitly so the report always has one row per encoder class.
# Without it, classification_report infers the labels from y_true/y_pred and
# raises ValueError when a class appears in neither, because the inferred
# label count no longer matches len(target_names).
# Assumes labels are LabelEncoder-style integer codes 0..n_classes-1 -- TODO confirm.
cat_labels = np.arange(len(category_encoder.classes_))
print(classification_report(
    y_cat_test,
    y_cat_pred_lr,
    labels=cat_labels,
    target_names=category_encoder.classes_,
))

### Alternative: Naive Bayes

In [None]:
# Second baseline for category prediction: Multinomial Naive Bayes.
# fit() returns the estimator itself, so construction and training chain.
nb_cat = MultinomialNB().fit(X_train, y_cat_train)

# Score on the held-out test set (accuracy only, for a quick comparison)
y_cat_pred_nb = nb_cat.predict(X_test)
print(f"Naive Bayes Accuracy: {accuracy_score(y_cat_test, y_cat_pred_nb):.3f}")

### Advanced: Random Forest

In [None]:
# TODO: Experiment with Random Forest or XGBoost
# Note: Random Forest on sparse data can be slow
# The sketch below is intentionally disabled. Uncomment it to train a
# 100-tree forest on all cores (n_jobs=-1) and print its test accuracy for
# comparison with the baselines. XGBoost is not imported in the imports cell
# and would need its own import.
# rf_cat = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
# rf_cat.fit(X_train, y_cat_train)
# y_cat_pred_rf = rf_cat.predict(X_test)
# print(f"Random Forest Accuracy: {accuracy_score(y_cat_test, y_cat_pred_rf):.3f}")

### Hyperparameter Tuning

In [None]:
# TODO: Tune hyperparameters if baseline performance is not sufficient
# The sketch below is intentionally disabled. It runs a 5-fold cross-validated
# grid search on the training split only, so the test set stays untouched.
# The estimator uses the same class_weight and random_state as the baseline
# so the tuned score is comparable with it.
# param_grid = {
#     'C': [0.1, 1, 10],
#     'solver': ['lbfgs', 'saga']
# }
# base_lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE, class_weight='balanced')
# grid_search = GridSearchCV(base_lr, param_grid, cv=5, n_jobs=-1)
# grid_search.fit(X_train, y_cat_train)
# print(f"Best params: {grid_search.best_params_}")
# print(f"Best CV score: {grid_search.best_score_:.3f}")

## 4. Priority Prediction Model

### Logistic Regression for Priority

In [None]:
# Priority classifier: logistic regression on the same features.
# class_weight='balanced' weights classes inversely to their frequency in
# the training labels.
lr_pri = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE, class_weight='balanced')
lr_pri.fit(X_train, y_pri_train)

# Predictions on the held-out test set
y_pri_pred_lr = lr_pri.predict(X_test)

# Evaluation
print("Logistic Regression - Priority Prediction")
print(f"Accuracy: {accuracy_score(y_pri_test, y_pri_pred_lr):.3f}")
print("\nClassification Report:")
# The train/test split is stratified on category, not priority, so a rare
# priority class can be missing from the test rows. Pass `labels` explicitly
# so the report always has one row per encoder class; without it,
# classification_report infers the labels from y_true/y_pred and raises
# ValueError when the inferred count differs from len(target_names).
# Assumes labels are LabelEncoder-style integer codes 0..n_classes-1 -- TODO confirm.
pri_labels = np.arange(len(priority_encoder.classes_))
print(classification_report(
    y_pri_test,
    y_pri_pred_lr,
    labels=pri_labels,
    target_names=priority_encoder.classes_,
))

## 5. Model Selection

Based on performance, choose the best model for each task:

TODO: Fill in after experimentation
- **Category Model**: <!-- Logistic Regression / Naive Bayes / Random Forest -->
- **Priority Model**: <!-- Logistic Regression / etc. -->

**Rationale**: <!-- Higher accuracy, better F1-score, faster inference, etc. -->

## 6. Save Trained Models

In [None]:
import json

# Create models directory (no error if it already exists)
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save best models (using Logistic Regression for now)
with open(models_dir / 'category_model.pkl', 'wb') as f:
    pickle.dump(lr_cat, f)

with open(models_dir / 'priority_model.pkl', 'wb') as f:
    pickle.dump(lr_pri, f)

print(f"Models saved to {models_dir}")

# Save model metadata alongside the pickles.
# Every numeric value is coerced to a built-in type so json can always
# serialize it, even if a NumPy scalar is returned upstream.
# NOTE(review): trained_date is a timezone-naive local timestamp -- confirm
# whether consumers expect UTC.
metadata = {
    'category_model': 'LogisticRegression',
    'category_accuracy': float(accuracy_score(y_cat_test, y_cat_pred_lr)),
    'priority_model': 'LogisticRegression',
    'priority_accuracy': float(accuracy_score(y_pri_test, y_pri_pred_lr)),
    'trained_date': pd.Timestamp.now().isoformat(),
    'n_train_samples': int(X_train.shape[0]),
    'n_test_samples': int(X_test.shape[0]),
}

# Explicit encoding keeps the file identical across platforms
with open(models_dir / 'metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print("\nModel Metadata:")
print(json.dumps(metadata, indent=2))

## 7. Confidence Scores

Extract the predicted class probabilities and take each prediction's highest class probability as its confidence score (used for confidence thresholding).

In [None]:
# Reference threshold drawn on both histograms. Defined once so the line
# position and its legend label cannot drift apart.
CONFIDENCE_THRESHOLD = 0.6

# Per-class probability predictions for the test set
y_cat_proba = lr_cat.predict_proba(X_test)
y_pri_proba = lr_pri.predict_proba(X_test)

# Confidence of a prediction = its highest class probability
cat_confidence = y_cat_proba.max(axis=1)
pri_confidence = y_pri_proba.max(axis=1)

print(f"Category confidence - Mean: {cat_confidence.mean():.3f}, Min: {cat_confidence.min():.3f}, Max: {cat_confidence.max():.3f}")
print(f"Priority confidence - Mean: {pri_confidence.mean():.3f}, Min: {pri_confidence.min():.3f}, Max: {pri_confidence.max():.3f}")

# Plot both confidence distributions side by side. The two panels differ only
# in data and title, so they are drawn by one loop.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (axes[0], cat_confidence, 'Category Prediction Confidence'),
    (axes[1], pri_confidence, 'Priority Prediction Confidence'),
]
for ax, confidence, title in panels:
    ax.hist(confidence, bins=30, edgecolor='black')
    ax.set_xlabel('Confidence')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.axvline(CONFIDENCE_THRESHOLD, color='red', linestyle='--',
               label=f'Threshold={CONFIDENCE_THRESHOLD}')
    ax.legend()

fig.tight_layout()
plt.show()

## 8. Next Steps

1. Proceed to `04-model-evaluation.ipynb` for detailed evaluation
2. Analyze confusion matrices, per-class performance
3. Identify error patterns
4. Create model card in `docs/MODEL-CARD.md`
5. Build prediction API in `api/app.py`

---
**References**:
- Output Schema: `contracts/integration-points/ds-model-output.schema.json`
- Priority Definitions: `contracts/data-models/priority-definitions.md`